From ef2d19b37cb9978512f3ce8f0a6385cc44244d4d Mon Sep 17 00:00:00 2001 From: Roland Schulz Date: Mon, 20 Jan 2014 16:33:02 -0500 Subject: [PATCH] Replace all mdrun rngs with cycle based rng The stateful random rumber generator (rng) used previously doesn't produce reproducible results in parallel for sd/bd and doesn't produce reproducible results for continutation for replica exchange. The rng state has been removed from the checkpoint file. Fixes #995 Change-Id: Id2a5d064cf363c54db3c16a0675cfeba553feeaa --- src/external/Random123-1.08/LICENSE | 31 + .../Random123-1.08/include/Random123/array.h | 326 +++++++ .../Random123/features/clangfeatures.h | 76 ++ .../Random123/features/compilerfeatures.h | 320 +++++++ .../include/Random123/features/gccfeatures.h | 247 +++++ .../include/Random123/features/iccfeatures.h | 208 +++++ .../include/Random123/features/msvcfeatures.h | 200 ++++ .../include/Random123/features/nvccfeatures.h | 110 +++ .../Random123/features/open64features.h | 50 + .../Random123/features/openclfeatures.h | 89 ++ .../include/Random123/features/pgccfeatures.h | 194 ++++ .../include/Random123/features/sse.h | 280 ++++++ .../Random123/features/sunprofeatures.h | 172 ++++ .../include/Random123/features/xlcfeatures.h | 202 ++++ .../include/Random123/threefry.h | 864 ++++++++++++++++++ src/gromacs/fileio/trajectory_writing.c | 10 - src/gromacs/fileio/trajectory_writing.h | 3 - src/gromacs/gmxlib/checkpoint.c | 78 +- src/gromacs/gmxlib/mvdata.c | 12 - src/gromacs/gmxlib/typedefs.c | 14 - src/gromacs/legacyheaders/checkpoint.h | 5 +- src/gromacs/legacyheaders/mdrun.h | 14 +- src/gromacs/legacyheaders/types/state.h | 11 +- src/gromacs/legacyheaders/update.h | 13 +- src/gromacs/mdlib/coupling.c | 301 ++---- src/gromacs/mdlib/domdec.c | 59 -- src/gromacs/mdlib/domdec_top.c | 24 - src/gromacs/mdlib/expanded.c | 34 +- src/gromacs/mdlib/init.c | 34 +- src/gromacs/mdlib/minimize.c | 1 - src/gromacs/mdlib/sim_util.c | 1 - src/gromacs/mdlib/tpi.c | 394 ++++---- src/gromacs/mdlib/update.c | 211 ++--- src/gromacs/random/random.c | 61 +- src/gromacs/random/random.h | 56 ++ src/gromacs/tools/convert_tpr.c | 8 - src/programs/mdrun/md.c | 26 +- src/programs/mdrun/repl_ex.c | 22 +- src/programs/mdrun/runner.c | 10 +- tests/CppCheck.cmake | 3 +- 40 files changed, 3918 insertions(+), 856 deletions(-) create mode 100644 src/external/Random123-1.08/LICENSE create mode 100644 src/external/Random123-1.08/include/Random123/array.h create mode 100644 src/external/Random123-1.08/include/Random123/features/clangfeatures.h create mode 100644 src/external/Random123-1.08/include/Random123/features/compilerfeatures.h create mode 100644 src/external/Random123-1.08/include/Random123/features/gccfeatures.h create mode 100644 src/external/Random123-1.08/include/Random123/features/iccfeatures.h create mode 100644 src/external/Random123-1.08/include/Random123/features/msvcfeatures.h create mode 100644 src/external/Random123-1.08/include/Random123/features/nvccfeatures.h create mode 100644 src/external/Random123-1.08/include/Random123/features/open64features.h create mode 100644 src/external/Random123-1.08/include/Random123/features/openclfeatures.h create mode 100644 src/external/Random123-1.08/include/Random123/features/pgccfeatures.h create mode 100644 src/external/Random123-1.08/include/Random123/features/sse.h create mode 100644 src/external/Random123-1.08/include/Random123/features/sunprofeatures.h create mode 100644 src/external/Random123-1.08/include/Random123/features/xlcfeatures.h create mode 100644 src/external/Random123-1.08/include/Random123/threefry.h diff --git a/src/external/Random123-1.08/LICENSE b/src/external/Random123-1.08/LICENSE new file mode 100644 index 0000000000..c6094acaf2 --- /dev/null +++ b/src/external/Random123-1.08/LICENSE @@ -0,0 +1,31 @@ +/** @page LICENSE +Copyright 2010-2012, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ diff --git a/src/external/Random123-1.08/include/Random123/array.h b/src/external/Random123-1.08/include/Random123/array.h new file mode 100644 index 0000000000..ab85392d8d --- /dev/null +++ b/src/external/Random123-1.08/include/Random123/array.h @@ -0,0 +1,326 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _r123array_dot_h__ +#define _r123array_dot_h__ +#include "features/compilerfeatures.h" +#include "features/sse.h" + +#ifndef __cplusplus +#define CXXMETHODS(_N, W, T) +#define CXXOVERLOADS(_N, W, T) +#else + +#include +#include +#include +#include +#include +#include + +/** @defgroup arrayNxW The r123arrayNxW classes + + Each of the r123arrayNxW is a fixed size array of N W-bit unsigned integers. + It is functionally equivalent to the C++0x std::array, + but does not require C++0x features or libraries. + + In addition to meeting most of the requirements of a Container, + it also has a member function, incr(), which increments the zero-th + element and carrys overflows into higher indexed elements. Thus, + by using incr(), sequences of up to 2^(N*W) distinct values + can be produced. + + If SSE is supported by the compiler, then the class + r123array1xm128i is also defined, in which the data member is an + array of one r123128i object. + + @cond HIDDEN_FROM_DOXYGEN +*/ + +template +inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){ + value_type v=0; + for(size_t i=0; i<(3+sizeof(value_type))/4; ++i) + v |= ((value_type)(*p32++)) << (32*i); + return v; +} + +// Work-alike methods and typedefs modeled on std::array: +#define CXXMETHODS(_N, W, T) \ + typedef T value_type; \ + typedef T* iterator; \ + typedef const T* const_iterator; \ + typedef value_type& reference; \ + typedef const value_type& const_reference; \ + typedef size_t size_type; \ + typedef ptrdiff_t difference_type; \ + typedef T* pointer; \ + typedef const T* const_pointer; \ + typedef std::reverse_iterator reverse_iterator; \ + typedef std::reverse_iterator const_reverse_iterator; \ + /* Boost.array has static_size. C++11 specializes tuple_size */ \ + enum {static_size = _N}; \ + R123_CUDA_DEVICE reference operator[](size_type i){return v[i];} \ + R123_CUDA_DEVICE const_reference operator[](size_type i) const {return v[i];} \ + R123_CUDA_DEVICE reference at(size_type i){ if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \ + R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \ + R123_CUDA_DEVICE size_type size() const { return _N; } \ + R123_CUDA_DEVICE size_type max_size() const { return _N; } \ + R123_CUDA_DEVICE bool empty() const { return _N==0; }; \ + R123_CUDA_DEVICE iterator begin() { return &v[0]; } \ + R123_CUDA_DEVICE iterator end() { return &v[_N]; } \ + R123_CUDA_DEVICE const_iterator begin() const { return &v[0]; } \ + R123_CUDA_DEVICE const_iterator end() const { return &v[_N]; } \ + R123_CUDA_DEVICE const_iterator cbegin() const { return &v[0]; } \ + R123_CUDA_DEVICE const_iterator cend() const { return &v[_N]; } \ + R123_CUDA_DEVICE reverse_iterator rbegin(){ return reverse_iterator(end()); } \ + R123_CUDA_DEVICE const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } \ + R123_CUDA_DEVICE reverse_iterator rend(){ return reverse_iterator(begin()); } \ + R123_CUDA_DEVICE const_reverse_iterator rend() const{ return const_reverse_iterator(begin()); } \ + R123_CUDA_DEVICE const_reverse_iterator crbegin() const{ return const_reverse_iterator(cend()); } \ + R123_CUDA_DEVICE const_reverse_iterator crend() const{ return const_reverse_iterator(cbegin()); } \ + R123_CUDA_DEVICE pointer data(){ return &v[0]; } \ + R123_CUDA_DEVICE const_pointer data() const{ return &v[0]; } \ + R123_CUDA_DEVICE reference front(){ return v[0]; } \ + R123_CUDA_DEVICE const_reference front() const{ return v[0]; } \ + R123_CUDA_DEVICE reference back(){ return v[_N-1]; } \ + R123_CUDA_DEVICE const_reference back() const{ return v[_N-1]; } \ + R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const{ \ + /* CUDA3 does not have std::equal */ \ + for (size_t i = 0; i < _N; ++i) \ + if (v[i] != rhs.v[i]) return false; \ + return true; \ + } \ + R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const{ return !(*this == rhs); } \ + /* CUDA3 does not have std::fill_n */ \ + R123_CUDA_DEVICE void fill(const value_type& val){ for (size_t i = 0; i < _N; ++i) v[i] = val; } \ + R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs){ \ + /* CUDA3 does not have std::swap_ranges */ \ + for (size_t i = 0; i < _N; ++i) { \ + T tmp = v[i]; \ + v[i] = rhs.v[i]; \ + rhs.v[i] = tmp; \ + } \ + } \ + R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n=1){ \ + /* This test is tricky because we're trying to avoid spurious \ + complaints about illegal shifts, yet still be compile-time \ + evaulated. */ \ + if(sizeof(T)>((sizeof(T)3?3:0] is to silence \ + a spurious error from icpc \ + */ \ + ++v[_N>1?1:0]; \ + if(_N==2 || R123_BUILTIN_EXPECT(!!v[_N>1?1:0], 1)) return *this; \ + ++v[_N>2?2:0]; \ + if(_N==3 || R123_BUILTIN_EXPECT(!!v[_N>2?2:0], 1)) return *this; \ + ++v[_N>3?3:0]; \ + for(size_t i=4; i<_N; ++i){ \ + if( R123_BUILTIN_EXPECT(!!v[i-1], 1) ) return *this; \ + ++v[i]; \ + } \ + return *this; \ + } \ + /* seed(SeedSeq) would be a constructor if having a constructor */ \ + /* didn't cause headaches with defaults */ \ + template \ + R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq &ss){ \ + r123array##_N##x##W ret; \ + const size_t Ngen = _N*((3+sizeof(value_type))/4); \ + uint32_t u32[Ngen]; \ + uint32_t *p32 = &u32[0]; \ + ss.generate(&u32[0], &u32[Ngen]); \ + for(size_t i=0; i<_N; ++i){ \ + ret.v[i] = assemble_from_u32(p32); \ + p32 += (3+sizeof(value_type))/4; \ + } \ + return ret; \ + } \ +protected: \ + R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n){ \ + /* n may be greater than the maximum value of a single value_type */ \ + value_type vtn; \ + vtn = n; \ + v[0] += n; \ + const unsigned rshift = 8* ((sizeof(n)>sizeof(value_type))? sizeof(value_type) : 0); \ + for(size_t i=1; i<_N; ++i){ \ + if(rshift){ \ + n >>= rshift; \ + }else{ \ + n=0; \ + } \ + if( v[i-1] < vtn ) \ + ++n; \ + if( n==0 ) break; \ + vtn = n; \ + v[i] += n; \ + } \ + return *this; \ + } \ + + +// There are several tricky considerations for the insertion and extraction +// operators: +// - we would like to be able to print r123array16x8 as a sequence of 16 integers, +// not as 16 bytes. +// - we would like to be able to print r123array1xm128i. +// - we do not want an int conversion operator in r123m128i because it causes +// lots of ambiguity problems with automatic promotions. +// Solution: r123arrayinsertable and r123arrayextractable + +template +struct r123arrayinsertable{ + const T& v; + r123arrayinsertable(const T& t_) : v(t_) {} + friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable& t){ + return os << t.v; + } +}; + +template<> +struct r123arrayinsertable{ + const uint8_t& v; + r123arrayinsertable(const uint8_t& t_) : v(t_) {} + friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable& t){ + return os << (int)t.v; + } +}; + +template +struct r123arrayextractable{ + T& v; + r123arrayextractable(T& t_) : v(t_) {} + friend std::istream& operator>>(std::istream& is, r123arrayextractable& t){ + return is >> t.v; + } +}; + +template<> +struct r123arrayextractable{ + uint8_t& v; + r123arrayextractable(uint8_t& t_) : v(t_) {} + friend std::istream& operator>>(std::istream& is, r123arrayextractable& t){ + int i; + is >> i; + t.v = i; + return is; + } +}; + +#define CXXOVERLOADS(_N, W, T) \ + \ +inline std::ostream& operator<<(std::ostream& os, const r123array##_N##x##W& a){ \ + os << r123arrayinsertable(a.v[0]); \ + for(size_t i=1; i<_N; ++i) \ + os << " " << r123arrayinsertable(a.v[i]); \ + return os; \ +} \ + \ +inline std::istream& operator>>(std::istream& is, r123array##_N##x##W& a){ \ + for(size_t i=0; i<_N; ++i){ \ + r123arrayextractable x(a.v[i]); \ + is >> x; \ + } \ + return is; \ +} \ + \ +namespace r123{ \ + typedef r123array##_N##x##W Array##_N##x##W; \ +} + +#endif /* __cplusplus */ + +/* _r123array_tpl expands to a declaration of struct r123arrayNxW. + + In C, it's nothing more than a struct containing an array of N + objects of type T. + + In C++ it's the same, but endowed with an assortment of member + functions, typedefs and friends. In C++, r123arrayNxW looks a lot + like std::array, has most of the capabilities of a container, + and satisfies the requirements outlined in compat/Engine.hpp for + counter and key types. ArrayNxW, in the r123 namespace is + a typedef equivalent to r123arrayNxW. +*/ + +#define _r123array_tpl(_N, W, T) \ + /** @ingroup arrayNxW */ \ + /** @see arrayNxW */ \ +struct r123array##_N##x##W{ \ + T v[_N]; \ + CXXMETHODS(_N, W, T) \ +}; \ + \ +CXXOVERLOADS(_N, W, T) + +/** @endcond */ + +_r123array_tpl(1, 32, uint32_t) /* r123array1x32 */ +_r123array_tpl(2, 32, uint32_t) /* r123array2x32 */ +_r123array_tpl(4, 32, uint32_t) /* r123array4x32 */ +_r123array_tpl(8, 32, uint32_t) /* r123array8x32 */ + +_r123array_tpl(1, 64, uint64_t) /* r123array1x64 */ +_r123array_tpl(2, 64, uint64_t) /* r123array2x64 */ +_r123array_tpl(4, 64, uint64_t) /* r123array4x64 */ + +_r123array_tpl(16, 8, uint8_t) /* r123array16x8 for ARSsw, AESsw */ + +#if R123_USE_SSE +_r123array_tpl(1, m128i, r123m128i) /* r123array1x128i for ARSni, AESni */ +#endif + +/* In C++, it's natural to use sizeof(a::value_type), but in C it's + pretty convoluted to figure out the width of the value_type of an + r123arrayNxW: +*/ +#define R123_W(a) (8*sizeof(((a *)0)->v[0])) + +/** @namespace r123 + Most of the Random123 C++ API is contained in the r123 namespace. +*/ + +#endif + diff --git a/src/external/Random123-1.08/include/Random123/features/clangfeatures.h b/src/external/Random123-1.08/include/Random123/features/clangfeatures.h new file mode 100644 index 0000000000..908aee8b0b --- /dev/null +++ b/src/external/Random123-1.08/include/Random123/features/clangfeatures.h @@ -0,0 +1,76 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __clangfeatures_dot_hpp +#define __clangfeatures_dot_hpp + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H ((defined(__x86_64__)||defined(__i386__))) +#endif + +#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS +#define R123_USE_CXX11_UNRESTRICTED_UNIONS __has_feature(cxx_unrestricted_unions) +#endif + +#ifndef R123_USE_CXX11_STATIC_ASSERT +#define R123_USE_CXX11_STATIC_ASSERT __has_feature(cxx_static_assert) +#endif + +#ifndef R123_USE_CXX11_CONSTEXPR +#define R123_USE_CXX11_CONSTEXPR __has_feature(cxx_constexpr) +#endif + +#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS +#define R123_USE_CXX11_EXPLICIT_CONVERSIONS __has_feature(cxx_explicit_conversions) +#endif + +// With clang-3.0, the apparently simpler: +// #define R123_USE_CXX11_RANDOM __has_include() +// dumps core. +#ifndef R123_USE_CXX11_RANDOM +#if __cplusplus>=201103L && __has_include() +#define R123_USE_CXX11_RANDOM 1 +#else +#define R123_USE_CXX11_RANDOM 0 +#endif +#endif + +#ifndef R123_USE_CXX11_TYPE_TRAITS +#if __cplusplus>=201103L && __has_include() +#define R123_USE_CXX11_TYPE_TRAITS 1 +#else +#define R123_USE_CXX11_TYPE_TRAITS 0 +#endif +#endif + +#include "gccfeatures.h" + +#endif diff --git a/src/external/Random123-1.08/include/Random123/features/compilerfeatures.h b/src/external/Random123-1.08/include/Random123/features/compilerfeatures.h new file mode 100644 index 0000000000..4039790823 --- /dev/null +++ b/src/external/Random123-1.08/include/Random123/features/compilerfeatures.h @@ -0,0 +1,320 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/** + +@page porting Preprocessor symbols for porting Random123 to different platforms. + +The Random123 library is portable across C, C++, CUDA, OpenCL environments, +and multiple operating systems (Linux, Windows 7, Mac OS X, FreeBSD, Solaris). +This level of portability requires the abstraction of some features +and idioms that are either not standardized (e.g., asm statments), or for which +different vendors have their own standards (e.g., SSE intrinsics) or for +which vendors simply refuse to conform to well-established standards (e.g., ). + +Random123/features/compilerfeatures.h +conditionally includes a compiler-or-OS-specific Random123/featires/XXXfeatures.h file which +defines appropriate values for the preprocessor symbols which can be used with +a specific compiler or OS. Those symbols will then +be used by other header files and source files in the Random123 +library (and may be used by applications) to control what actually +gets presented to the compiler. + +Most of the symbols are boolean valued. In general, they will +\b always be defined with value either 1 or 0, so do +\b NOT use \#ifdef. Use \#if R123_USE_SOMETHING instead. + +Library users can override any value by defining the pp-symbol with a compiler option, +e.g., + + cc -DR123_USE_MULHILO64_C99 + +will use a strictly c99 version of the full-width 64x64->128-bit multiplication +function, even if it would be disabled by default. + +All boolean-valued pre-processor symbols in Random123/features/compilerfeatures.h start with the prefix R123_USE_ +@verbatim + AES_NI + AES_OPENSSL + SSE4_2 + SSE4_1 + SSE + + STD_RANDOM + + GNU_UINT128 + ASM_GNU + ASM_MSASM + + CPUID_MSVC + + CXX11_RANDOM + CXX11_TYPE_TRAITS + CXX11_STATIC_ASSERT + CXX11_CONSTEXPR + CXX11_UNRESTRICTED_UNIONS + CXX11_EXPLICIT_CONVERSIONS + CXX11_LONG_LONG + CXX11 + + X86INTRIN_H + IA32INTRIN_H + XMMINTRIN_H + EMMINTRIN_H + SMMINTRIN_H + WMMINTRIN_H + INTRIN_H + + MULHILO32_ASM + MULHILO64_ASM + MULHILO64_MSVC_INTRIN + MULHILO64_CUDA_INTRIN + MULHILO64_OPENCL_INTRIN + MULHILO64_C99 + + U01_DOUBLE + +@endverbatim +Most have obvious meanings. Some non-obvious ones: + +AES_NI and AES_OPENSSL are not mutually exclusive. You can have one, +both or neither. + +GNU_UINT128 says that it's safe to use __uint128_t, but it +does not require its use. In particular, it should be +used in mulhilo only if MULHILO64_ASM is unset. + +If the XXXINTRIN_H macros are true, then one should +@code +#include +@endcode +to gain accesss to compiler intrinsics. + +The CXX11_SOME_FEATURE macros allow the code to use specific +features of the C++11 language and library. The catchall +In the absence of a specific CXX11_SOME_FEATURE, the feature +is controlled by the catch-all R123_USE_CXX11 macro. + +U01_DOUBLE defaults on, and can be turned off (set to 0) +if one does not want the utility functions that convert to double +(i.e. u01_*_53()), e.g. on OpenCL without the cl_khr_fp64 extension. + +There are a number of invariants that are always true. Application code may +choose to rely on these: + +
    +
  • ASM_GNU and ASM_MASM are mutually exclusive +
  • The "higher" SSE values imply the lower ones. +
+ +There are also non-boolean valued symbols: + +
    +
  • R123_STATIC_INLINE - + According to both C99 and GNU99, the 'static inline' declaration allows + the compiler to not emit code if the function is not used. + Note that the semantics of 'inline', 'static' and 'extern' in + gcc have changed over time and are subject to modification by + command line options, e.g., -std=gnu89, -fgnu-inline. + Nevertheless, it appears that the meaning of 'static inline' + has not changed over time and (with a little luck) the use of 'static inline' + here will be portable between versions of gcc and to other C99 + compilers. + See: http://gcc.gnu.org/onlinedocs/gcc/Inline.html + http://www.greenend.org.uk/rjk/2003/03/inline.html + +
  • R123_FORCE_INLINE(decl) - + which expands to 'decl', adorned with the compiler-specific + embellishments to strongly encourage that the declared function be + inlined. If there is no such compiler-specific magic, it should + expand to decl, unadorned. + +
  • R123_CUDA_DEVICE - which expands to __device__ (or something else with + sufficiently similar semantics) when CUDA is in use, and expands + to nothing in other cases. + +
  • R123_ASSERT(x) - which expands to assert(x), or maybe to nothing at + all if we're in an environment so feature-poor that you can't even + call assert (I'm looking at you, CUDA and OpenCL), or even include + assert.h safely (OpenCL). + +
  • R123_STATIC_ASSERT(expr,msg) - which expands to + static_assert(expr,msg), or to an expression that + will raise a compile-time exception if expr is not true. + +
  • R123_ULONG_LONG - which expands to a declaration of the longest available + unsigned integer. + +
  • R123_64BIT(x) - expands to something equivalent to + UINT64_C(x) from , even in environments where + is not available, e.g., MSVC and OpenCL. + +
  • R123_BUILTIN_EXPECT(expr,likely_value) - expands to something with + the semantics of gcc's __builtin_expect(expr,likely_value). If + the environment has nothing like __builtin_expect, it should expand + to just expr. +
+ + +\cond HIDDEN_FROM_DOXYGEN +*/ + +/* +N.B. When something is added to the list of features, it should be +added to each of the *features.h files, AND to examples/ut_features.cpp. +*/ + +/* N.B. most other compilers (icc, nvcc, open64, llvm) will also define __GNUC__, so order matters. */ +#if defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ > 0 +#include "openclfeatures.h" +#elif defined(__CUDACC__) +#include "nvccfeatures.h" +#elif defined(__ICC) +#include "iccfeatures.h" +#elif defined(__xlC__) +#include "xlcfeatures.h" +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +#include "sunprofeatures.h" +#elif defined(__OPEN64__) +#include "open64features.h" +#elif defined(__clang__) +#include "clangfeatures.h" +#elif defined(__GNUC__) +#include "gccfeatures.h" +#elif defined(__PGI) +#include "pgccfeatures.h" +#elif defined(_MSC_FULL_VER) +#include "msvcfeatures.h" +#else +#error "Can't identify compiler. You'll need to add a new xxfeatures.hpp" +{ /* maybe an unbalanced brace will terminate the compilation */ +#endif + +#ifndef R123_USE_CXX11 +#define R123_USE_CXX11 (__cplusplus >= 201103L) +#endif + +#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS +#define R123_USE_CXX11_UNRESTRICTED_UNIONS R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_STATIC_ASSERT +#define R123_USE_CXX11_STATIC_ASSERT R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_CONSTEXPR +#define R123_USE_CXX11_CONSTEXPR R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS +#define R123_USE_CXX11_EXPLICIT_CONVERSIONS R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_RANDOM +#define R123_USE_CXX11_RANDOM R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_TYPE_TRAITS +#define R123_USE_CXX11_TYPE_TRAITS R123_USE_CXX11 +#endif + +#ifndef R123_USE_CXX11_LONG_LONG +#define R123_USE_CXX11_LONG_LONG R123_USE_CXX11 +#endif + +#ifndef R123_USE_MULHILO64_C99 +#define R123_USE_MULHILO64_C99 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +#define R123_USE_MULHILO64_MULHI_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO32_MULHI_INTRIN +#define R123_USE_MULHILO32_MULHI_INTRIN 0 +#endif + +#ifndef R123_STATIC_ASSERT +#if R123_USE_CXX11_STATIC_ASSERT +#define R123_STATIC_ASSERT(expr, msg) static_assert(expr, msg) +#else + /* if msg always_looked_like_this, we could paste it into the name. Worth it? */ +#define R123_STATIC_ASSERT(expr, msg) typedef char static_assertion[(!!(expr))*2-1] +#endif +#endif + +#ifndef R123_CONSTEXPR +#if R123_USE_CXX11_CONSTEXPR +#define R123_CONSTEXPR constexpr +#else +#define R123_CONSTEXPR +#endif +#endif + +#ifndef R123_USE_PHILOX_64BIT +#define R123_USE_PHILOX_64BIT (R123_USE_MULHILO64_ASM || R123_USE_MULHILO64_MSVC_INTRIN || R123_USE_MULHILO64_CUDA_INTRIN || R123_USE_GNU_UINT128 || R123_USE_MULHILO64_C99 || R123_USE_MULHILO64_OPENCL_INTRIN || R123_USE_MULHILO64_MULHI_INTRIN) +#endif + +#ifndef R123_ULONG_LONG +#if defined(__cplusplus) && !R123_USE_CXX11_LONG_LONG +/* C++98 doesn't have long long. It doesn't have uint64_t either, but + we will have typedef'ed uint64_t to something in the xxxfeatures.h. + With luck, it won't elicit complaints from -pedantic. Cross your + fingers... */ +#define R123_ULONG_LONG uint64_t +#else +#define R123_ULONG_LONG unsigned long long +#endif +#endif + +/* UINT64_C should have been #defined by XXXfeatures.h, either by + #include or through compiler-dependent hacks */ +#ifndef R123_64BIT +#define R123_64BIT(x) UINT64_C(x) +#endif + +#ifndef R123_THROW +#define R123_THROW(x) throw (x) +#endif + +/* + * Windows.h (and perhaps other "well-meaning" code define min and + * max, so there's a high chance that our definition of min, max + * methods or use of std::numeric_limits min and max will cause + * complaints in any program that happened to include Windows.h or + * suchlike first. We use the null macro below in our own header + * files definition or use of min, max to defensively preclude + * this problem. It may not be enough; one might need to #define + * NOMINMAX before including Windows.h or compile with -DNOMINMAX. + */ +#define R123_NO_MACRO_SUBST + +/** \endcond */ diff --git a/src/external/Random123-1.08/include/Random123/features/gccfeatures.h b/src/external/Random123-1.08/include/Random123/features/gccfeatures.h new file mode 100644 index 0000000000..d6bb06088d --- /dev/null +++ b/src/external/Random123-1.08/include/Random123/features/gccfeatures.h @@ -0,0 +1,247 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __gccfeatures_dot_hpp +#define __gccfeatures_dot_hpp + +#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) + +#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__) +# error "This code has only been tested on x86 and powerpc platforms." +#include +{ /* maybe an unbalanced brace will terminate the compilation */ + /* Feel free to try the Random123 library on other architectures by changing + the conditions that reach this error, but you should consider it a + porting exercise and expect to encounter bugs and deficiencies. + Please let the authors know of any successes (or failures). */ +#endif + +#ifdef __powerpc__ +#include +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static __inline__ +#endif + +#ifndef R123_FORCE_INLINE +#if R123_GNUC_VERSION >= 40000 +#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) +#else +#define R123_FORCE_INLINE(decl) decl +#endif +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely) +#endif + +/* According to the C++0x standard, we should be able to test the numeric + value of __cplusplus == 199701L for C++98, __cplusplus == 201103L for C++0x + But gcc has had an open bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=1773 + since early 2001, which was finally fixed in 4.7 (early 2012). For + earlier versions, the only way to detect whether --std=c++0x was requested + on the command line is to look at the __GCC_EXPERIMENTAL_CXX0X__ pp-symbol. +*/ +#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && defined(__GCC_EXPERIMENTAL_CXX0X__) )) + +#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS +#define R123_USE_CXX11_UNRESTRICTED_UNIONS ((R123_GNUC_VERSION >= 40600) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_STATIC_ASSERT +#define R123_USE_CXX11_STATIC_ASSERT ((R123_GNUC_VERSION >= 40300) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_CONSTEXPR +#define R123_USE_CXX11_CONSTEXPR ((R123_GNUC_VERSION >= 40600) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS +#define R123_USE_CXX11_EXPLICIT_CONVERSIONS ((R123_GNUC_VERSION >= 40500) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_RANDOM +#define R123_USE_CXX11_RANDOM ((R123_GNUC_VERSION>=40500) && GNU_CXX11) +#endif + +#ifndef R123_USE_CXX11_TYPE_TRAITS +#define R123_USE_CXX11_TYPE_TRAITS ((R123_GNUC_VERSION>=40400) && GNU_CXX11) +#endif + +#ifndef R123_USE_AES_NI +#ifdef __AES__ +#define R123_USE_AES_NI 1 +#else +#define R123_USE_AES_NI 0 +#endif +#endif + +#ifndef R123_USE_SSE4_2 +#ifdef __SSE4_2__ +#define R123_USE_SSE4_2 1 +#else +#define R123_USE_SSE4_2 0 +#endif +#endif + +#ifndef R123_USE_SSE4_1 +#ifdef __SSE4_1__ +#define R123_USE_SSE4_1 1 +#else +#define R123_USE_SSE4_1 0 +#endif +#endif + +#ifndef R123_USE_SSE +/* There's no point in trying to compile SSE code in Random123 + unless SSE2 is available. */ +#ifdef __SSE2__ +#define R123_USE_SSE 1 +#else +#define R123_USE_SSE 0 +#endif +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available. Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#ifdef __x86_64__ +#define R123_USE_GNU_UINT128 1 +#else +#define R123_USE_GNU_UINT128 0 +#endif +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU (defined(__x86_64__)||defined(__i386__)) +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H ((defined(__x86_64__)||defined(__i386__)) && R123_GNUC_VERSION >= 40402) +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +/* gcc -m64 on Solaris 10 defines __SSE2__ but doesn't have + emmintrin.h in the include search path. This is + so broken that I refuse to try to work around it. If this + affects you, figure out where your emmintrin.h lives and + add an appropriate -I to your CPPFLAGS. Or add -DR123_USE_SSE=0. */ +#define R123_USE_EMMINTRIN_H (R123_USE_SSE && (R123_GNUC_VERSION < 40402)) +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H ((R123_USE_SSE4_1 || R123_USE_SSE4_2) && (R123_GNUC_VERSION < 40402)) +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#define R123_USE_INTRIN_H 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +#define R123_USE_MULHILO64_MULHI_INTRIN (defined(__powerpc64__)) +#endif + +#ifndef R123_MULHILO64_MULHI_INTRIN +#define R123_MULHILO64_MULHI_INTRIN __mulhdu +#endif + +#ifndef R123_USE_MULHILO32_MULHI_INTRIN +#define R123_USE_MULHILO32_MULHI_INTRIN 0 +#endif + +#ifndef R123_MULHILO32_MULHI_INTRIN +#define R123_MULHILO32_MULHI_INTRIN __mulhwu +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include +#endif + +/* If you add something, it must go in all the other XXfeatures.hpp + and in ../ut_features.cpp */ +#endif diff --git a/src/external/Random123-1.08/include/Random123/features/iccfeatures.h b/src/external/Random123-1.08/include/Random123/features/iccfeatures.h new file mode 100644 index 0000000000..b64e5c2299 --- /dev/null +++ b/src/external/Random123-1.08/include/Random123/features/iccfeatures.h @@ -0,0 +1,208 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __icpcfeatures_dot_hpp +#define __icpcfeatures_dot_hpp + +// icc relies on gcc libraries and other toolchain components. +#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) + +#if !defined(__x86_64__) && !defined(__i386__) +# error "This code has only been tested on x86 platforms." +{ // maybe an unbalanced brace will terminate the compilation +// You are invited to try Easy123 on other architectures, by changing +// the conditions that reach this error, but you should consider it a +// porting exercise and expect to encounter bugs and deficiencies. +// Please let the authors know of any successes (or failures). +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely) +#endif + +// The basic idiom is: +// #ifndef R123_SOMETHING +// #if some condition +// #define R123_SOMETHING 1 +// #else +// #define R123_SOMETHING 0 +// #endif +// #endif +// This idiom allows an external user to override any decision +// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0 + +// An alternative idiom is: +// #ifndef R123_SOMETHING +// #define R123_SOMETHING (some boolean expression) +// #endif +// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE +// pp-symbols. + +#ifndef R123_USE_SSE4_2 +#ifdef __SSE4_2__ +#define R123_USE_SSE4_2 1 +#else +#define R123_USE_SSE4_2 0 +#endif +#endif + +#ifndef R123_USE_SSE4_1 +#ifdef __SSE4_1__ +#define R123_USE_SSE4_1 1 +#else +#define R123_USE_SSE4_1 0 +#endif +#endif + +#ifndef R123_USE_SSE +#ifdef __SSE2__ +#define R123_USE_SSE 1 +#else +#define R123_USE_SSE 0 +#endif +#endif + +#ifndef R123_USE_AES_NI +// Unlike gcc, icc (version 12) does not pre-define an __AES__ +// pp-symbol when -maes or -xHost is on the command line. This feels +// like a defect in icc (it defines __SSE4_2__ in analogous +// circumstances), but until Intel fixes it, we're better off erring +// on the side of caution and not generating instructions that are +// going to raise SIGILL when executed. To get the AES-NI +// instructions with icc, the caller must puts something like +// -DR123_USE_AES_NI=1 or -D__AES__ on the command line. FWIW, the +// AES-NI Whitepaper by Gueron says that icc has supported AES-NI from +// 11.1 onwards. +// +#define R123_USE_AES_NI ((__ICC>=1101) && defined(__AES__)) +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available. Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 1 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 1 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 1 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 1 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 1 +#endif + +#ifndef R123_USE_INTRIN_H +#define R123_USE_INTRIN_H 0 +#endif + +#ifndef R123_USE_MULHILO16_ASM +#define R123_USE_MULHILO16_ASM 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 1 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include +#endif + +// If you add something, it must go in all the other XXfeatures.hpp +// and in ../ut_features.cpp +#endif diff --git a/src/external/Random123-1.08/include/Random123/features/msvcfeatures.h b/src/external/Random123-1.08/include/Random123/features/msvcfeatures.h new file mode 100644 index 0000000000..9eb9520912 --- /dev/null +++ b/src/external/Random123-1.08/include/Random123/features/msvcfeatures.h @@ -0,0 +1,200 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __msvcfeatures_dot_hpp +#define __msvcfeatures_dot_hpp + +//#if _MSVC_FULL_VER <= 15 +//#error "We've only tested MSVC_FULL_VER==15." +//#endif + +#if !defined(_M_IX86) && !defined(_M_X64) +# error "This code has only been tested on x86 platforms." +{ // maybe an unbalanced brace will terminate the compilation +// You are invited to try Random123 on other architectures, by changing +// the conditions that reach this error, but you should consider it a +// porting exercise and expect to encounter bugs and deficiencies. +// Please let the authors know of any successes (or failures). +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static __inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) _forceinline decl +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +// The basic idiom is: +// #ifndef R123_SOMETHING +// #if some condition +// #define R123_SOMETHING 1 +// #else +// #define R123_SOMETHING 0 +// #endif +// #endif +// This idiom allows an external user to override any decision +// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0 + +// An alternative idiom is: +// #ifndef R123_SOMETHING +// #define R123_SOMETHING (some boolean expression) +// #endif +// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE +// pp-symbols. + +#ifndef R123_USE_AES_NI +#if defined(_M_X64) +#define R123_USE_AES_NI 1 +#else +#define R123_USE_AES_NI 0 +#endif +#endif + +#ifndef R123_USE_SSE4_2 +#if defined(_M_X64) +#define R123_USE_SSE4_2 1 +#else +#define R123_USE_SSE4_2 0 +#endif +#endif + +#ifndef R123_USE_SSE4_1 +#if defined(_M_X64) +#define R123_USE_SSE4_1 1 +#else +#define R123_USE_SSE4_1 0 +#endif +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 1 +#endif + +#ifndef R123_USE_AES_OPENSSL +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 0 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 1 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 1 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 1 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 1 +#endif + +#ifndef R123_USE_INTRIN_H +#define R123_USE_INTRIN_H 1 +#endif + +#ifndef R123_USE_MULHILO16_ASM +#define R123_USE_MULHILO16_ASM 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#if defined(_M_X64) +#define R123_USE_MULHILO64_MSVC_INTRIN 1 +#else +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include +#endif + +#pragma warning(disable:4244) +#pragma warning(disable:4996) + +// If you add something, it must go in all the other XXfeatures.hpp +// and in ../ut_features.cpp +#endif diff --git a/src/external/Random123-1.08/include/Random123/features/nvccfeatures.h b/src/external/Random123-1.08/include/Random123/features/nvccfeatures.h new file mode 100644 index 0000000000..711babf88e --- /dev/null +++ b/src/external/Random123-1.08/include/Random123/features/nvccfeatures.h @@ -0,0 +1,110 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __r123_nvcc_features_dot_h__ +#define __r123_nvcc_features_dot_h__ + +#if !defined(CUDART_VERSION) +#error "why are we in nvccfeatures.h if CUDART_VERSION is not defined" +#endif + +#if CUDART_VERSION < 4010 +#error "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces. Random123 isunsupported. See comments in nvccfeatures.h" +// This test was added in Random123-1.08 (August, 2013) because we +// discovered that Ftype(maxTvalue()) with Ftype=double and +// T=uint64_t in examples/uniform.hpp produces -1 for CUDA4.0 and +// earlier. We can't be sure this bug doesn't also affect invocations +// of other templated functions, e.g., essentially all of Random123. +// Thus, we no longer trust CUDA versions earlier than 4.1 even though +// we had previously tested and timed Random123 with CUDA 3.x and 4.0. +// If you feel lucky or desperate, you can change #error to #warning, but +// please take extra care to be sure that you are getting correct +// results. +#endif + +// nvcc falls through to gcc or msvc. So first define +// a couple of things and then include either gccfeatures.h +// or msvcfeatures.h + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE __device__ +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 1 +#endif + +#ifndef R123_ASSERT +#define R123_ASSERT(x) if((x)) ; else asm("trap;") +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_ULONG_LONG +// uint64_t, which is what we'd get without this, is +// not the same as unsigned long long +#define R123_ULONG_LONG unsigned long long +#endif + +#ifndef R123_THROW +// No exceptions in CUDA, at least upto 4.0 +#define R123_THROW(x) R123_ASSERT(0) +#endif + +#if defined(__GNUC__) +#include "gccfeatures.h" +#elif defined(_MSC_FULL_VER) +#include "msvcfeatures.h" +#endif + +#endif diff --git a/src/external/Random123-1.08/include/Random123/features/open64features.h b/src/external/Random123-1.08/include/Random123/features/open64features.h new file mode 100644 index 0000000000..8da9f5f51e --- /dev/null +++ b/src/external/Random123-1.08/include/Random123/features/open64features.h @@ -0,0 +1,50 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __open64features_dot_hpp +#define __open64features_dot_hpp + +/* The gcc features are mostly right. We just override a few and then include gccfeatures.h */ + +/* Open64 4.2.3 and 4.2.4 accept the __uint128_t code without complaint + but produce incorrect code for 64-bit philox. The MULHILO64_ASM + seems to work fine */ +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 1 +#endif + +#include "gccfeatures.h" + +#endif diff --git a/src/external/Random123-1.08/include/Random123/features/openclfeatures.h b/src/external/Random123-1.08/include/Random123/features/openclfeatures.h new file mode 100644 index 0000000000..af03d30923 --- /dev/null +++ b/src/external/Random123-1.08/include/Random123/features/openclfeatures.h @@ -0,0 +1,89 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __openclfeatures_dot_hpp +#define __openclfeatures_dot_hpp + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#define R123_ASSERT(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 1 +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +// XXX ATI APP SDK 2.4 clBuildProgram SEGVs if one uses uint64_t instead of +// ulong to mul_hi. And gets lots of complaints from stdint.h +// on some machines. +// But these typedefs mean we cannot include stdint.h with +// these headers? Do we need R123_64T, R123_32T, R123_8T? +typedef ulong uint64_t; +typedef uint uint32_t; +typedef uchar uint8_t; +#define UINT64_C(x) ((ulong)(x##UL)) + +#endif diff --git a/src/external/Random123-1.08/include/Random123/features/pgccfeatures.h b/src/external/Random123-1.08/include/Random123/features/pgccfeatures.h new file mode 100644 index 0000000000..18ace1353b --- /dev/null +++ b/src/external/Random123-1.08/include/Random123/features/pgccfeatures.h @@ -0,0 +1,194 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Copyright (c) 2013, Los Alamos National Security, LLC +All rights reserved. + +Copyright 2013. Los Alamos National Security, LLC. This software was produced +under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National +Laboratory (LANL), which is operated by Los Alamos National Security, LLC for +the U.S. Department of Energy. The U.S. Government has rights to use, +reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS +ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR +ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified +to produce derivative works, such modified software should be clearly marked, +so as not to confuse it with the version available from LANL. +*/ +#ifndef __pgccfeatures_dot_hpp +#define __pgccfeatures_dot_hpp + +#if !defined(__x86_64__) && !defined(__i386__) +# error "This code has only been tested on x86 platforms." +#include +{ /* maybe an unbalanced brace will terminate the compilation */ + /* Feel free to try the Random123 library on other architectures by changing + the conditions that reach this error, but you should consider it a + porting exercise and expect to encounter bugs and deficiencies. + Please let the authors know of any successes (or failures). */ +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +/* Found this example in PGI's emmintrin.h. */ +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) (expr) +#endif + +/* PGI through 13.2 doesn't appear to support AES-NI. */ +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +/* PGI through 13.2 appears to support MMX, SSE, SSE3, SSE3, SSSE3, SSE4a, and + ABM, but not SSE4.1 or SSE4.2. */ +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +/* There's no point in trying to compile SSE code in Random123 + unless SSE2 is available. */ +#ifdef __SSE2__ +#define R123_USE_SSE 1 +#else +#define R123_USE_SSE 0 +#endif +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available. Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 1 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +/* emmintrin.h from PGI #includes xmmintrin.h but then complains at link time + about undefined references to _mm_castsi128_ps(__m128i). Why? */ +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 1 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 1 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 0 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#ifdef __ABM__ +#define R123_USE_INTRIN_H 1 +#else +#define R123_USE_INTRIN_H 0 +#endif +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +#define R123_USE_MULHILO64_MULHI_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 1 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include +#endif + +/* If you add something, it must go in all the other XXfeatures.hpp + and in ../ut_features.cpp */ +#endif diff --git a/src/external/Random123-1.08/include/Random123/features/sse.h b/src/external/Random123-1.08/include/Random123/features/sse.h new file mode 100644 index 0000000000..88efd65f10 --- /dev/null +++ b/src/external/Random123-1.08/include/Random123/features/sse.h @@ -0,0 +1,280 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _Random123_sse_dot_h__ +#define _Random123_sse_dot_h__ + +#if R123_USE_SSE + +#if R123_USE_X86INTRIN_H +#include +#endif +#if R123_USE_IA32INTRIN_H +#include +#endif +#if R123_USE_XMMINTRIN_H +#include +#endif +#if R123_USE_EMMINTRIN_H +#include +#endif +#if R123_USE_SMMINTRIN_H +#include +#endif +#if R123_USE_WMMINTRIN_H +#include +#endif +#if R123_USE_INTRIN_H +#include +#endif +#ifdef __cplusplus +#include +#include +#include +#endif + +#if R123_USE_ASM_GNU + +/* bit25 of CX tells us whether AES is enabled. */ +R123_STATIC_INLINE int haveAESNI(){ + unsigned int eax, ebx, ecx, edx; + __asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : + "a" (1)); + return (ecx>>25) & 1; +} +#elif R123_USE_CPUID_MSVC +R123_STATIC_INLINE int haveAESNI(){ + int CPUInfo[4]; + __cpuid(CPUInfo, 1); + return (CPUInfo[2]>>25)&1; +} +#else /* R123_USE_CPUID_??? */ +#warning "No R123_USE_CPUID_XXX method chosen. haveAESNI will always return false" +R123_STATIC_INLINE int haveAESNI(){ + return 0; +} +#endif /* R123_USE_ASM_GNU || R123_USE_CPUID_MSVC */ + +// There is a lot of annoying and inexplicable variation in the +// SSE intrinsics available in different compilation environments. +// The details seem to depend on the compiler, the version and +// the target architecture. Rather than insisting on +// R123_USE_feature tests for each of these in each of the +// compilerfeatures.h files we just keep the complexity localized +// to here... +#if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64)) +/* Is there an intrinsic to assemble an __m128i from two 64-bit words? + If not, use the 4x32-bit intrisic instead. N.B. It looks like Intel + added _mm_set_epi64x to icc version 12.1 in Jan 2012. +*/ +R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){ + union{ + uint64_t u64; + uint32_t u32[2]; + } u1, u0; + u1.u64 = v1; + u0.u64 = v0; + return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]); +} +#endif +/* _mm_extract_lo64 abstracts the task of extracting the low 64-bit + word from an __m128i. The _mm_cvtsi128_si64 intrinsic does the job + on 64-bit platforms. Unfortunately, both MSVC and Open64 fail + assertions in ut_M128.cpp and ut_carray.cpp when we use the + _mm_cvtsi128_si64 intrinsic. (See + https://bugs.open64.net/show_bug.cgi?id=873 for the Open64 bug). + On 32-bit platforms, there's no MOVQ, so there's no intrinsic. + Finally, even if the intrinsic exists, it may be spelled with or + without the 'x'. +*/ +#if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__) +R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){ + union{ + uint64_t u64[2]; + __m128i m; + }u; + _mm_store_si128(&u.m, si); + return u.u64[0]; +} +#elif defined(__llvm__) || defined(__ICC) +R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){ + return (uint64_t)_mm_cvtsi128_si64(si); +} +#else /* GNUC, others */ +/* FWIW, gcc's emmintrin.h has had the 'x' spelling + since at least gcc-3.4.4. The no-'x' spelling showed up + around 4.2. */ +R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){ + return (uint64_t)_mm_cvtsi128_si64x(si); +} +#endif +#if defined(__GNUC__) && __GNUC__ < 4 +/* the cast builtins showed up in gcc4. */ +R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){ + return (__m128)si; +} +#endif + +#ifdef __cplusplus + +struct r123m128i{ + __m128i m; +#if R123_USE_CXX11_UNRESTRICTED_UNIONS + // C++98 forbids a union member from having *any* constructors. + // C++11 relaxes this, and allows union members to have constructors + // as long as there is a "trivial" default construtor. So in C++11 + // we can provide a r123m128i constructor with an __m128i argument, and still + // have the default (and hence trivial) default constructor. + r123m128i() = default; + r123m128i(__m128i _m): m(_m){} +#endif + r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;} + r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;} +#if R123_USE_CXX11_EXPLICIT_CONVERSIONS + // With C++0x we can attach explicit to the bool conversion operator + // to disambiguate undesired promotions. For g++, this works + // only in 4.5 and above. + explicit operator bool() const {return _bool();} +#else + // Pre-C++0x, we have to do something else. Google for the "safe bool" + // idiom for other ideas... + operator const void*() const{return _bool()?this:0;} +#endif + operator __m128i() const {return m;} + +private: +#if R123_USE_SSE4_1 + bool _bool() const{ return !_mm_testz_si128(m,m); } +#else + bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); } +#endif +}; + +R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){ + __m128i& c = v.m; + __m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1)); + c = _mm_add_epi64(c, zeroone); + //return c; +#if R123_USE_SSE4_1 + __m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0))); + if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){ + __m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0)); + c = _mm_add_epi64(c, onezero); + } +#else + unsigned mask = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128()))); + // The low two bits of mask are 11 iff the low 64 bits of + // c are zero. + if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){ + __m128i onezero = _mm_set_epi64x(1,0); + c = _mm_add_epi64(c, onezero); + } +#endif + return v; +} + +R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){ + __m128i c = lhs.m; + __m128i incr128 = _mm_set_epi64x(0, n); + c = _mm_add_epi64(c, incr128); + // return c; // NO CARRY! + + int64_t lo64 = _mm_extract_lo64(c); + if((uint64_t)lo64 < n) + c = _mm_add_epi64(c, _mm_set_epi64x(1,0)); + lhs.m = c; + return lhs; +} + +// We need this one because it's present, but never used in r123array1xm128i::incr +R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i &){ + throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");} + +// The comparisons aren't implemented, but if we leave them out, and +// somebody writes, e.g., M1 < M2, the compiler will do an implicit +// conversion through void*. Sigh... +R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&){ + throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");} +R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&){ + throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");} +R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&){ + throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");} +R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&){ + throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");} + +R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){ + return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); } +R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){ + return !(lhs==rhs);} +R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){ + r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; } +R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){ + return !(lhs==rhs);} +R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){ + union{ + uint64_t u64[2]; + __m128i m; + }u; + _mm_storeu_si128(&u.m, m.m); + return os << u.u64[0] << " " << u.u64[1]; +} + +R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){ + uint64_t u64[2]; + is >> u64[0] >> u64[1]; + m.m = _mm_set_epi64x(u64[1], u64[0]); + return is; +} + +template inline T assemble_from_u32(uint32_t *p32); // forward declaration + +template <> +inline r123m128i assemble_from_u32(uint32_t *p32){ + r123m128i ret; + ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]); + return ret; +} + +#else + +typedef struct { + __m128i m; +} r123m128i; + +#endif /* __cplusplus */ + +#else /* !R123_USE_SSE */ +R123_STATIC_INLINE int haveAESNI(){ + return 0; +} +#endif /* R123_USE_SSE */ + +#endif /* _Random123_sse_dot_h__ */ diff --git a/src/external/Random123-1.08/include/Random123/features/sunprofeatures.h b/src/external/Random123-1.08/include/Random123/features/sunprofeatures.h new file mode 100644 index 0000000000..c9cdc00f5e --- /dev/null +++ b/src/external/Random123-1.08/include/Random123/features/sunprofeatures.h @@ -0,0 +1,172 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __sunprofeatures_dot_hpp +#define __sunprofeatures_dot_hpp + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +// The basic idiom is: +// #ifndef R123_SOMETHING +// #if some condition +// #define R123_SOMETHING 1 +// #else +// #define R123_SOMETHING 0 +// #endif +// #endif +// This idiom allows an external user to override any decision +// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0 + +// An alternative idiom is: +// #ifndef R123_SOMETHING +// #define R123_SOMETHING (some boolean expression) +// #endif +// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE +// pp-symbols. + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 0 +#endif + +#ifndef R123_USE_AES_OPENSSL +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 0 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 0 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 0 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#define R123_USE_INTRIN_H 0 +#endif + +#ifndef R123_USE_MULHILO16_ASM +#define R123_USE_MULHILO16_ASM 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef R123_USE_PHILOX_64BIT +#define R123_USE_PHILOX_64BIT 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include +#endif + +// If you add something, it must go in all the other XXfeatures.hpp +// and in ../ut_features.cpp +#endif diff --git a/src/external/Random123-1.08/include/Random123/features/xlcfeatures.h b/src/external/Random123-1.08/include/Random123/features/xlcfeatures.h new file mode 100644 index 0000000000..a5c8412a44 --- /dev/null +++ b/src/external/Random123-1.08/include/Random123/features/xlcfeatures.h @@ -0,0 +1,202 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Copyright (c) 2013, Los Alamos National Security, LLC +All rights reserved. + +Copyright 2013. Los Alamos National Security, LLC. This software was produced +under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National +Laboratory (LANL), which is operated by Los Alamos National Security, LLC for +the U.S. Department of Energy. The U.S. Government has rights to use, +reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS +ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR +ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified +to produce derivative works, such modified software should be clearly marked, +so as not to confuse it with the version available from LANL. +*/ +#ifndef __xlcfeatures_dot_hpp +#define __xlcfeatures_dot_hpp + +#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__) +# error "This code has only been tested on x86 and PowerPC platforms." +#include +{ /* maybe an unbalanced brace will terminate the compilation */ + /* Feel free to try the Random123 library on other architectures by changing + the conditions that reach this error, but you should consider it a + porting exercise and expect to encounter bugs and deficiencies. + Please let the authors know of any successes (or failures). */ +#endif + +#ifdef __cplusplus +/* builtins are automatically available to xlc. To use them with xlc++, + one must include builtins.h. c.f + http://publib.boulder.ibm.com/infocenter/cellcomp/v101v121/index.jsp?topic=/com.ibm.xlcpp101.cell.doc/compiler_ref/compiler_builtins.html +*/ +#include +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely) +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 0 +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available. Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 1 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 0 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 0 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#ifdef __ABM__ +#define R123_USE_INTRIN_H 1 +#else +#define R123_USE_INTRIN_H 0 +#endif +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +#define R123_USE_MULHILO64_MULHI_INTRIN (defined(__powerpc64__)) +#endif + +#ifndef R123_MULHILO64_MULHI_INTRIN +#define R123_MULHILO64_MULHI_INTRIN __mulhdu +#endif + +#ifndef R123_USE_MULHILO32_MULHI_INTRIN +#define R123_USE_MULHILO32_MULHI_INTRIN 0 +#endif + +#ifndef R123_MULHILO32_MULHI_INTRIN +#define R123_MULHILO32_MULHI_INTRIN __mulhwu +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM (defined(__powerpc64__) && !(R123_USE_MULHILO64_MULHI_INTRIN)) +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include +#endif + +/* If you add something, it must go in all the other XXfeatures.hpp + and in ../ut_features.cpp */ +#endif diff --git a/src/external/Random123-1.08/include/Random123/threefry.h b/src/external/Random123-1.08/include/Random123/threefry.h new file mode 100644 index 0000000000..da2de979c6 --- /dev/null +++ b/src/external/Random123-1.08/include/Random123/threefry.h @@ -0,0 +1,864 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _threefry_dot_h_ +#define _threefry_dot_h_ +#include "features/compilerfeatures.h" +#include "array.h" + +/** \cond HIDDEN_FROM_DOXYGEN */ +/* Significant parts of this file were copied from + from: + Skein_FinalRnd/ReferenceImplementation/skein.h + Skein_FinalRnd/ReferenceImplementation/skein_block.c + + in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip + + This file has been modified so that it may no longer perform its originally + intended function. If you're looking for a Skein or Threefish source code, + please consult the original file. + + The original file had the following header: +************************************************************************** +** +** Interface declarations and internal definitions for Skein hashing. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +*************************************************************************** + +*/ + +/* See comment at the top of philox.h for the macro pre-process + strategy. */ + +/* Rotation constants: */ +enum r123_enum_threefry64x4 { + /* These are the R_256 constants from the Threefish reference sources + with names changed to R_64x4... */ + R_64x4_0_0=14, R_64x4_0_1=16, + R_64x4_1_0=52, R_64x4_1_1=57, + R_64x4_2_0=23, R_64x4_2_1=40, + R_64x4_3_0= 5, R_64x4_3_1=37, + R_64x4_4_0=25, R_64x4_4_1=33, + R_64x4_5_0=46, R_64x4_5_1=12, + R_64x4_6_0=58, R_64x4_6_1=22, + R_64x4_7_0=32, R_64x4_7_1=32 +}; + +enum r123_enum_threefry64x2 { + /* + // Output from skein_rot_search: (srs64_B64-X1000) + // Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57 + // Start: Tue Mar 1 10:07:48 2011 + // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format + */ + R_64x2_0_0=16, + R_64x2_1_0=42, + R_64x2_2_0=12, + R_64x2_3_0=31, + R_64x2_4_0=16, + R_64x2_5_0=32, + R_64x2_6_0=24, + R_64x2_7_0=21 + /* 4 rounds: minHW = 4 [ 4 4 4 4 ] + // 5 rounds: minHW = 8 [ 8 8 8 8 ] + // 6 rounds: minHW = 16 [ 16 16 16 16 ] + // 7 rounds: minHW = 32 [ 32 32 32 32 ] + // 8 rounds: minHW = 64 [ 64 64 64 64 ] + // 9 rounds: minHW = 64 [ 64 64 64 64 ] + //10 rounds: minHW = 64 [ 64 64 64 64 ] + //11 rounds: minHW = 64 [ 64 64 64 64 ] */ +}; + +enum r123_enum_threefry32x4 { + /* Output from skein_rot_search: (srs-B128-X5000.out) + // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28 + // Start: Mon Aug 24 22:41:36 2009 + // ... + // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */ + R_32x4_0_0=10, R_32x4_0_1=26, + R_32x4_1_0=11, R_32x4_1_1=21, + R_32x4_2_0=13, R_32x4_2_1=27, + R_32x4_3_0=23, R_32x4_3_1= 5, + R_32x4_4_0= 6, R_32x4_4_1=20, + R_32x4_5_0=17, R_32x4_5_1=11, + R_32x4_6_0=25, R_32x4_6_1=10, + R_32x4_7_0=18, R_32x4_7_1=20 + + /* 4 rounds: minHW = 3 [ 3 3 3 3 ] + // 5 rounds: minHW = 7 [ 7 7 7 7 ] + // 6 rounds: minHW = 12 [ 13 12 13 12 ] + // 7 rounds: minHW = 22 [ 22 23 22 23 ] + // 8 rounds: minHW = 31 [ 31 31 31 31 ] + // 9 rounds: minHW = 32 [ 32 32 32 32 ] + //10 rounds: minHW = 32 [ 32 32 32 32 ] + //11 rounds: minHW = 32 [ 32 32 32 32 ] */ + +}; + +enum r123_enum_threefry32x2 { + /* Output from skein_rot_search (srs32x2-X5000.out) + // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28 + // Start: Tue Jul 12 11:11:33 2011 + // rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format */ + R_32x2_0_0=13, + R_32x2_1_0=15, + R_32x2_2_0=26, + R_32x2_3_0= 6, + R_32x2_4_0=17, + R_32x2_5_0=29, + R_32x2_6_0=16, + R_32x2_7_0=24 + + /* 4 rounds: minHW = 4 [ 4 4 4 4 ] + // 5 rounds: minHW = 6 [ 6 8 6 8 ] + // 6 rounds: minHW = 9 [ 9 12 9 12 ] + // 7 rounds: minHW = 16 [ 16 24 16 24 ] + // 8 rounds: minHW = 32 [ 32 32 32 32 ] + // 9 rounds: minHW = 32 [ 32 32 32 32 ] + //10 rounds: minHW = 32 [ 32 32 32 32 ] + //11 rounds: minHW = 32 [ 32 32 32 32 ] */ + }; + +enum r123_enum_threefry_wcnt { + WCNT2=2, + WCNT4=4 +}; +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N)); +R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N) +{ + return (x << (N & 63)) | (x >> ((64-N) & 63)); +} + +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N)); +R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N) +{ + return (x << (N & 31)) | (x >> ((32-N) & 31)); +} + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) +#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) +#define SKEIN_KS_PARITY32 0x1BD11BDA + +#ifndef THREEFRY2x32_DEFAULT_ROUNDS +#define THREEFRY2x32_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY2x64_DEFAULT_ROUNDS +#define THREEFRY2x64_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY4x32_DEFAULT_ROUNDS +#define THREEFRY4x32_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY4x64_DEFAULT_ROUNDS +#define THREEFRY4x64_DEFAULT_ROUNDS 20 +#endif + +#define _threefry2x_tpl(W) \ +typedef struct r123array2x##W threefry2x##W##_ctr_t; \ +typedef struct r123array2x##W threefry2x##W##_key_t; \ +typedef struct r123array2x##W threefry2x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \ + threefry2x##W##_ctr_t X; \ + uint##W##_t ks[2+1]; \ + int i; /* avoid size_t to avoid need for stddef.h */ \ + R123_ASSERT(Nrounds<=32); \ + ks[2] = SKEIN_KS_PARITY##W; \ + for (i=0;i < 2; i++) \ + { \ + ks[i] = k.v[i]; \ + X.v[i] = in.v[i]; \ + ks[2] ^= k.v[i]; \ + } \ + \ + /* Insert initial key before round 0 */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + \ + if(Nrounds>0){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>1){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>2){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>3){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>3){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 1; /* X.v[2-1] += r */ \ + } \ + if(Nrounds>4){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>5){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>6){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>7){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>7){ \ + /* InjectKey(r=2) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 2; \ + } \ + if(Nrounds>8){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>9){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>10){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>11){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>11){ \ + /* InjectKey(r=3) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + X.v[1] += 3; \ + } \ + if(Nrounds>12){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>13){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>14){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>15){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>15){ \ + /* InjectKey(r=4) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 4; \ + } \ + if(Nrounds>16){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>17){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>18){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>19){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>19){ \ + /* InjectKey(r=5) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 5; \ + } \ + if(Nrounds>20){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>21){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>22){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>23){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>23){ \ + /* InjectKey(r=6) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + X.v[1] += 6; \ + } \ + if(Nrounds>24){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>25){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>26){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>27){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>27){ \ + /* InjectKey(r=7) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 7; \ + } \ + if(Nrounds>28){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>29){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>30){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>31){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>31){ \ + /* InjectKey(r=8) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 8; \ + } \ + return X; \ +} \ + /** @ingroup ThreefryNxW */ \ +enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \ + return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \ +} + + +#define _threefry4x_tpl(W) \ +typedef struct r123array4x##W threefry4x##W##_ctr_t; \ +typedef struct r123array4x##W threefry4x##W##_key_t; \ +typedef struct r123array4x##W threefry4x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \ + threefry4x##W##_ctr_t X; \ + uint##W##_t ks[4+1]; \ + int i; /* avoid size_t to avoid need for stddef.h */ \ + R123_ASSERT(Nrounds<=72); \ + ks[4] = SKEIN_KS_PARITY##W; \ + for (i=0;i < 4; i++) \ + { \ + ks[i] = k.v[i]; \ + X.v[i] = in.v[i]; \ + ks[4] ^= k.v[i]; \ + } \ + \ + /* Insert initial key before round 0 */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + \ + if(Nrounds>0){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>1){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>2){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>3){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>3){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 1; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>4){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>5){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>6){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>7){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>7){ \ + /* InjectKey(r=2) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 2; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>8){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>9){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>10){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>11){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>11){ \ + /* InjectKey(r=3) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 3; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>12){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>13){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>14){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>15){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>15){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 4; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>16){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>17){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>18){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>19){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>19){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 5; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>20){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>21){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>22){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>23){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>23){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 6; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>24){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>25){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>26){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>27){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>27){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 7; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>28){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>29){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>30){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>31){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>31){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 8; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>32){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>33){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>34){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>35){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>35){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 9; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>36){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>37){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>38){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>39){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>39){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 10; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>40){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>41){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>42){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>43){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>43){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 11; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>44){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>45){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>46){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>47){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>47){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 12; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>48){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>49){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>50){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>51){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>51){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 13; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>52){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>53){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>54){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>55){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>55){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 14; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>56){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>57){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>58){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>59){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>59){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 15; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>60){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>61){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>62){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>63){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>63){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 16; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>64){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>65){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>66){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>67){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>67){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 17; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>68){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>69){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>70){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>71){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>71){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 18; /* X.v[WCNT4-1] += r */ \ + } \ + \ + return X; \ +} \ + /** @ingroup ThreefryNxW */ \ +enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \ + return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \ +} +/** \endcond */ + +_threefry2x_tpl(64) +_threefry2x_tpl(32) +_threefry4x_tpl(64) +_threefry4x_tpl(32) + +/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better + than a static inline function. Why? */ +#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k) +#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k) +#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k) +#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k) + +#ifdef __cplusplus +/** \cond HIDDEN_FROM_DOXYGEN */ +#define _threefryNxWclass_tpl(NxW) \ +namespace r123{ \ +template \ + struct Threefry##NxW##_R{ \ + typedef threefry##NxW##_ctr_t ctr_type; \ + typedef threefry##NxW##_key_t key_type; \ + typedef threefry##NxW##_key_t ukey_type; \ + static const unsigned int rounds=R; \ + inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \ + R123_STATIC_ASSERT(R<=72, "threefry is only unrolled up to 72 rounds\n"); \ + return threefry##NxW##_R(R, ctr, key); \ + } \ +}; \ + typedef Threefry##NxW##_R Threefry##NxW; \ +} // namespace r123 + +/** \endcond */ + +_threefryNxWclass_tpl(2x32) +_threefryNxWclass_tpl(4x32) +_threefryNxWclass_tpl(2x64) +_threefryNxWclass_tpl(4x64) + +/* The _tpl macros don't quite work to do string-pasting inside comments. + so we just write out the boilerplate documentation four times... */ + +/** +@defgroup ThreefryNxW Threefry Classes and Typedefs + +The ThreefryNxW classes export the member functions, typedefs and +operator overloads required by a @ref CBRNG "CBRNG" class. + +As described in +Parallel Random Numbers: As Easy as 1, 2, 3 , +the Threefry family is closely related to the Threefish block cipher from + Skein Hash Function. +Threefry is \b not suitable for cryptographic use. + +Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output. + +@class r123::Threefry2x32_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=13 or more for Threefry2x32. + +@typedef r123::Threefry2x32 +@ingroup ThreefryNxW + Threefry2x32 is equivalent to Threefry2x32_R<20>. With 20 rounds, + Threefry2x32 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + +@class r123::Threefry2x64_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +In November 2011, the authors discovered that 13 rounds of +Threefry2x64 sequenced by strided, interleaved key and counter +increments failed a very long (longer than the default BigCrush +length) WeightDistrub test. At the same time, it was confirmed that +14 rounds passes much longer tests (up to 5x10^12 samples) of a +similar nature. The authors know of no statistical flaws with +ROUNDS=14 or more for Threefry2x64. + +@typedef r123::Threefry2x64 +@ingroup ThreefryNxW + Threefry2x64 is equivalent to Threefry2x64_R<20>. With 20 rounds, + Threefry2x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + + + +@class r123::Threefry4x32_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=12 or more for Threefry4x32. + +@typedef r123::Threefry4x32 +@ingroup ThreefryNxW + Threefry4x32 is equivalent to Threefry4x32_R<20>. With 20 rounds, + Threefry4x32 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + + + +@class r123::Threefry4x64_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=12 or more for Threefry4x64. + +@typedef r123::Threefry4x64 +@ingroup ThreefryNxW + Threefry4x64 is equivalent to Threefry4x64_R<20>. With 20 rounds, + Threefry4x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. +*/ + +#endif + +#endif diff --git a/src/gromacs/fileio/trajectory_writing.c b/src/gromacs/fileio/trajectory_writing.c index 813c5b8036..3cea7d40f9 100644 --- a/src/gromacs/fileio/trajectory_writing.c +++ b/src/gromacs/fileio/trajectory_writing.c @@ -61,14 +61,12 @@ do_md_trajectory_writing(FILE *fplog, t_state *state_global, gmx_mtop_t *top_global, t_forcerec *fr, - gmx_update_t upd, gmx_mdoutf_t outf, t_mdebin *mdebin, gmx_ekindata_t *ekind, rvec *f, rvec *f_global, gmx_wallcycle_t wcycle, - gmx_rng_t mcrng, int *nchkpt, gmx_bool bCPT, gmx_bool bRerunMD, @@ -131,14 +129,6 @@ do_md_trajectory_writing(FILE *fplog, wallcycle_start(wcycle, ewcTRAJ); if (bCPT) { - if (state->flags & (1<flags & (1<nhchainlength*state->ngtc; nnhtp = state->nhchainlength*state->nnhpres; - if (bReadRNG) - { - rng_p = (int **)&state->ld_rng; - rngi_p = &state->ld_rngi; - } - else - { - /* Do not read the RNG data */ - rng_p = NULL; - rngi_p = NULL; - } - if (bRead) /* we need to allocate space for dfhist if we are reading */ { init_df_history(&state->dfhist, state->dfhist.nlambda); } - /* We want the MC_RNG the same across all the notes for now -- lambda MC is global */ - sflags = state->flags; for (i = 0; (i < estNR && ret == 0); i++) { @@ -1040,10 +1024,13 @@ static int do_cpt_state(XDR *xd, gmx_bool bRead, case estX: ret = do_cpte_rvecs(xd, cptpEST, i, sflags, state->natoms, &state->x, list); break; case estV: ret = do_cpte_rvecs(xd, cptpEST, i, sflags, state->natoms, &state->v, list); break; case estSDX: ret = do_cpte_rvecs(xd, cptpEST, i, sflags, state->natoms, &state->sd_X, list); break; - case estLD_RNG: ret = do_cpte_ints(xd, cptpEST, i, sflags, state->nrng, rng_p, list); break; - case estLD_RNGI: ret = do_cpte_ints(xd, cptpEST, i, sflags, state->nrngi, rngi_p, list); break; - case estMC_RNG: ret = do_cpte_ints(xd, cptpEST, i, sflags, state->nmcrng, (int **)&state->mc_rng, list); break; - case estMC_RNGI: ret = do_cpte_ints(xd, cptpEST, i, sflags, 1, &state->mc_rngi, list); break; + /* The RNG entries are no longer written, + * the next 4 lines are only for reading old files. + */ + case estLD_RNG: ret = do_cpte_ints(xd, cptpEST, i, sflags, 0, NULL, list); break; + case estLD_RNGI: ret = do_cpte_ints(xd, cptpEST, i, sflags, 0, NULL, list); break; + case estMC_RNG: ret = do_cpte_ints(xd, cptpEST, i, sflags, 0, NULL, list); break; + case estMC_RNGI: ret = do_cpte_ints(xd, cptpEST, i, sflags, 0, NULL, list); break; case estDISRE_INITF: ret = do_cpte_real (xd, cptpEST, i, sflags, &state->hist.disre_initf, list); break; case estDISRE_RM3TAV: ret = do_cpte_n_reals(xd, cptpEST, i, sflags, &state->hist.ndisrepairs, &state->hist.disre_rm3tav, list); break; case estORIRE_INITF: ret = do_cpte_real (xd, cptpEST, i, sflags, &state->hist.orire_initf, list); break; @@ -1654,7 +1641,7 @@ void write_checkpoint(const char *fn, gmx_bool bNumberAndKeep, sfree(bhost); sfree(fprog); - if ((do_cpt_state(gmx_fio_getxdr(fp), FALSE, state->flags, state, TRUE, NULL) < 0) || + if ((do_cpt_state(gmx_fio_getxdr(fp), FALSE, state->flags, state, NULL) < 0) || (do_cpt_ekinstate(gmx_fio_getxdr(fp), flags_eks, &state->ekinstate, NULL) < 0) || (do_cpt_enerhist(gmx_fio_getxdr(fp), FALSE, flags_enh, &state->enerhist, NULL) < 0) || (do_cpt_df_hist(gmx_fio_getxdr(fp), flags_dfh, &state->dfhist, NULL) < 0) || @@ -1847,7 +1834,7 @@ static void check_match(FILE *fplog, static void read_checkpoint(const char *fn, FILE **pfplog, t_commrec *cr, ivec dd_nc, int eIntegrator, int *init_fep_state, gmx_int64_t *step, double *t, - t_state *state, gmx_bool *bReadRNG, gmx_bool *bReadEkin, + t_state *state, gmx_bool *bReadEkin, int *simulation_part, gmx_bool bAppendOutputFiles, gmx_bool bForceAppend) { @@ -1875,10 +1862,6 @@ static void read_checkpoint(const char *fn, FILE **pfplog, const char *int_warn = "WARNING: The checkpoint file was generated with integrator %s,\n" " while the simulation uses integrator %s\n\n"; - const char *sd_note = - "NOTE: The checkpoint file was for %d nodes doing SD or BD,\n" - " while the simulation uses %d SD or BD nodes,\n" - " continuation will be exact, except for the random state\n\n"; #if !defined __native_client__ && !defined GMX_NATIVE_WINDOWS fl.l_type = F_WRLCK; @@ -1995,16 +1978,6 @@ static void read_checkpoint(const char *fn, FILE **pfplog, nppnodes = -1; } - if ((EI_SD(eIntegrator) || eIntegrator == eiBD) && nppnodes > 0) - { - /* Correct the RNG state size for the number of PP nodes. - * Such assignments should all be moved to one central function. - */ - state->nrng = nppnodes*gmx_rng_n(); - state->nrngi = nppnodes; - } - - *bReadRNG = TRUE; if (fflags != state->flags) { @@ -2037,26 +2010,13 @@ static void read_checkpoint(const char *fn, FILE **pfplog, } else { - if ((EI_SD(eIntegrator) || eIntegrator == eiBD) && - nppnodes != nppnodes_f) - { - *bReadRNG = FALSE; - if (MASTER(cr)) - { - fprintf(stderr, sd_note, nppnodes_f, nppnodes); - } - if (fplog) - { - fprintf(fplog, sd_note, nppnodes_f, nppnodes); - } - } if (MASTER(cr)) { check_match(fplog, version, btime, buser, bhost, double_prec, fprog, cr, nppnodes_f, npmenodes_f, dd_nc, dd_nc_f); } } - ret = do_cpt_state(gmx_fio_getxdr(fp), TRUE, fflags, state, *bReadRNG, NULL); + ret = do_cpt_state(gmx_fio_getxdr(fp), TRUE, fflags, state, NULL); *init_fep_state = state->fep_state; /* there should be a better way to do this than setting it here. Investigate for 5.0. */ if (ret) @@ -2279,7 +2239,7 @@ static void read_checkpoint(const char *fn, FILE **pfplog, void load_checkpoint(const char *fn, FILE **fplog, t_commrec *cr, ivec dd_nc, t_inputrec *ir, t_state *state, - gmx_bool *bReadRNG, gmx_bool *bReadEkin, + gmx_bool *bReadEkin, gmx_bool bAppend, gmx_bool bForceAppend) { gmx_int64_t step; @@ -2290,7 +2250,7 @@ void load_checkpoint(const char *fn, FILE **fplog, /* Read the state from the checkpoint file */ read_checkpoint(fn, fplog, cr, dd_nc, - ir->eI, &(ir->fepvals->init_fep_state), &step, &t, state, bReadRNG, bReadEkin, + ir->eI, &(ir->fepvals->init_fep_state), &step, &t, state, bReadEkin, &ir->simulation_part, bAppend, bForceAppend); } if (PAR(cr)) @@ -2298,7 +2258,6 @@ void load_checkpoint(const char *fn, FILE **fplog, gmx_bcast(sizeof(cr->npmenodes), &cr->npmenodes, cr); gmx_bcast(DIM*sizeof(dd_nc[0]), dd_nc, cr); gmx_bcast(sizeof(step), &step, cr); - gmx_bcast(sizeof(*bReadRNG), bReadRNG, cr); gmx_bcast(sizeof(*bReadEkin), bReadEkin, cr); } ir->bContinuation = TRUE; @@ -2312,7 +2271,6 @@ void load_checkpoint(const char *fn, FILE **fplog, static void read_checkpoint_data(t_fileio *fp, int *simulation_part, gmx_int64_t *step, double *t, t_state *state, - gmx_bool bReadRNG, int *nfiles, gmx_file_position_t **outputfiles) { int file_version; @@ -2333,7 +2291,7 @@ static void read_checkpoint_data(t_fileio *fp, int *simulation_part, &(state->dfhist.nlambda), &state->flags, &flags_eks, &flags_enh, &flags_dfh, &state->edsamstate.nED, &state->swapstate.eSwapCoords, NULL); ret = - do_cpt_state(gmx_fio_getxdr(fp), TRUE, state->flags, state, bReadRNG, NULL); + do_cpt_state(gmx_fio_getxdr(fp), TRUE, state->flags, state, NULL); if (ret) { cp_error(); @@ -2401,7 +2359,7 @@ read_checkpoint_state(const char *fn, int *simulation_part, t_fileio *fp; fp = gmx_fio_open(fn, "r"); - read_checkpoint_data(fp, simulation_part, step, t, state, FALSE, NULL, NULL); + read_checkpoint_data(fp, simulation_part, step, t, state, NULL, NULL); if (gmx_fio_close(fp) != 0) { gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?"); @@ -2421,7 +2379,7 @@ void read_checkpoint_trxframe(t_fileio *fp, t_trxframe *fr) init_state(&state, 0, 0, 0, 0, 0); - read_checkpoint_data(fp, &simulation_part, &step, &t, &state, FALSE, NULL, NULL); + read_checkpoint_data(fp, &simulation_part, &step, &t, &state, NULL, NULL); fr->natoms = state.natoms; fr->bTitle = FALSE; @@ -2483,7 +2441,7 @@ void list_checkpoint(const char *fn, FILE *out) &(state.dfhist.nlambda), &state.flags, &flags_eks, &flags_enh, &flags_dfh, &state.edsamstate.nED, &state.swapstate.eSwapCoords, out); - ret = do_cpt_state(gmx_fio_getxdr(fp), TRUE, state.flags, &state, TRUE, out); + ret = do_cpt_state(gmx_fio_getxdr(fp), TRUE, state.flags, &state, out); if (ret) { cp_error(); @@ -2585,7 +2543,7 @@ gmx_bool read_checkpoint_simulation_part(const char *filename, int *simulation_p { init_state(&state, 0, 0, 0, 0, 0); - read_checkpoint_data(fp, simulation_part, &step, &t, &state, FALSE, + read_checkpoint_data(fp, simulation_part, &step, &t, &state, &nfiles, &outputfiles); if (gmx_fio_close(fp) != 0) { diff --git a/src/gromacs/gmxlib/mvdata.c b/src/gromacs/gmxlib/mvdata.c index b7e6ebfe14..1e491b25aa 100644 --- a/src/gromacs/gmxlib/mvdata.c +++ b/src/gromacs/gmxlib/mvdata.c @@ -257,8 +257,6 @@ void bcast_state(const t_commrec *cr, t_state *state) block_bc(cr, state->ngtc); block_bc(cr, state->nnhpres); block_bc(cr, state->nhchainlength); - block_bc(cr, state->nrng); - block_bc(cr, state->nrngi); block_bc(cr, state->flags); if (state->lambda == NULL) { @@ -309,16 +307,6 @@ void bcast_state(const t_commrec *cr, t_state *state) case estV: nblock_abc(cr, state->natoms, state->v); break; case estSDX: nblock_abc(cr, state->natoms, state->sd_X); break; case estCGP: nblock_abc(cr, state->natoms, state->cg_p); break; - case estLD_RNG: if (state->nrngi == 1) - { - nblock_abc(cr, state->nrng, state->ld_rng); - } - break; - case estLD_RNGI: if (state->nrngi == 1) - { - nblock_abc(cr, state->nrngi, state->ld_rngi); - } - break; case estDISRE_INITF: block_bc(cr, state->hist.disre_initf); break; case estDISRE_RM3TAV: block_bc(cr, state->hist.ndisrepairs); diff --git a/src/gromacs/gmxlib/typedefs.c b/src/gromacs/gmxlib/typedefs.c index a230767f6e..460b5a3b4e 100644 --- a/src/gromacs/gmxlib/typedefs.c +++ b/src/gromacs/gmxlib/typedefs.c @@ -622,7 +622,6 @@ void init_state(t_state *state, int natoms, int ngtc, int nnhpres, int nhchainle int i; state->natoms = natoms; - state->nrng = 0; state->flags = 0; state->lambda = 0; snew(state->lambda, efptNR); @@ -713,19 +712,6 @@ t_state *serial_init_local_state(t_state *state_global) { state_local->lambda[i] = state_global->lambda[i]; } - if (state_global->nrngi > 1) - { - /* With stochastic dynamics we need local storage for the random state */ - if (state_local->flags & (1<nrng = gmx_rng_n(); - snew(state_local->ld_rng, state_local->nrng); - } - if (state_local->flags & (1<ld_rngi, 1); - } - } return state_local; } diff --git a/src/gromacs/legacyheaders/checkpoint.h b/src/gromacs/legacyheaders/checkpoint.h index ae9e3a3370..0db48d6527 100644 --- a/src/gromacs/legacyheaders/checkpoint.h +++ b/src/gromacs/legacyheaders/checkpoint.h @@ -73,8 +73,9 @@ void write_checkpoint(const char *fn, gmx_bool bNumberAndKeep, */ void load_checkpoint(const char *fn, FILE **fplog, t_commrec *cr, ivec dd_nc, - t_inputrec *ir, t_state *state, gmx_bool *bReadRNG, - gmx_bool *bReadEkin, gmx_bool bAppend, gmx_bool bForceAppend); + t_inputrec *ir, t_state *state, + gmx_bool *bReadEkin, + gmx_bool bAppend, gmx_bool bForceAppend); /* Read the state from checkpoint file. * Arrays in state that are NULL are allocated. diff --git a/src/gromacs/legacyheaders/mdrun.h b/src/gromacs/legacyheaders/mdrun.h index 09a54b152f..5ac463ef8a 100644 --- a/src/gromacs/legacyheaders/mdrun.h +++ b/src/gromacs/legacyheaders/mdrun.h @@ -67,7 +67,6 @@ extern "C" { #define MD_DDBONDCOMM (1<<11) #define MD_CONFOUT (1<<12) #define MD_REPRODUCIBLE (1<<13) -#define MD_READ_RNG (1<<14) #define MD_APPENDFILES (1<<15) #define MD_APPENDFILESSET (1<<21) #define MD_KEEPANDNUMCPT (1<<16) @@ -145,29 +144,24 @@ gmx_integrator_t do_tpi; void init_npt_masses(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bInit); -void init_expanded_ensemble(gmx_bool bStateFromCP, t_inputrec *ir, gmx_rng_t *mcrng, df_history_t *dfhist); +void init_expanded_ensemble(gmx_bool bStateFromCP, t_inputrec *ir, df_history_t *dfhist); int ExpandedEnsembleDynamics(FILE *log, t_inputrec *ir, gmx_enerdata_t *enerd, t_state *state, t_extmass *MassQ, int fep_state, df_history_t *dfhist, - gmx_int64_t step, gmx_rng_t mcrng, + gmx_int64_t step, rvec *v, t_mdatoms *mdatoms); void PrintFreeEnergyInfoToFile(FILE *outfile, t_lambda *fep, t_expanded *expand, t_simtemp *simtemp, df_history_t *dfhist, int fep_state, int frequency, gmx_int64_t step); -void get_mc_state(gmx_rng_t rng, t_state *state); - -void set_mc_state(gmx_rng_t rng, t_state *state); - /* check the version */ void check_ir_old_tpx_versions(t_commrec *cr, FILE *fplog, t_inputrec *ir, gmx_mtop_t *mtop); /* Allocate and initialize node-local state entries. */ -void set_state_entries(t_state *state, const t_inputrec *ir, int nnodes); +void set_state_entries(t_state *state, const t_inputrec *ir); -/* Broadcast the data for a simulation, and allocate node-specific settings - such as rng generators. */ +/* Broadcast the data for a simulation, and allocate node-specific settings */ void init_parallel(t_commrec *cr, t_inputrec *inputrec, gmx_mtop_t *mtop); diff --git a/src/gromacs/legacyheaders/types/state.h b/src/gromacs/legacyheaders/types/state.h index 4acee9803b..100dce0736 100644 --- a/src/gromacs/legacyheaders/types/state.h +++ b/src/gromacs/legacyheaders/types/state.h @@ -217,9 +217,7 @@ typedef struct int natoms; int ngtc; int nnhpres; - int nhchainlength; /* number of nose-hoover chains */ - int nrng; - int nrngi; + int nhchainlength; /* number of nose-hoover chains */ int flags; /* Flags telling which entries are present */ int fep_state; /* indicates which of the alchemical states we are in */ real *lambda; /* lambda vector */ @@ -242,13 +240,6 @@ typedef struct rvec *sd_X; /* random part of the x update for stoch. dyn. */ rvec *cg_p; /* p vector for conjugate gradient minimization */ - unsigned int *ld_rng; /* RNG random state */ - int *ld_rngi; /* RNG index */ - - int nmcrng; /* number of RNG states */ - unsigned int *mc_rng; /* lambda MC RNG random state */ - int *mc_rngi; /* lambda MC RNG index */ - history_t hist; /* Time history for restraints */ ekinstate_t ekinstate; /* The state of the kinetic energy data */ diff --git a/src/gromacs/legacyheaders/update.h b/src/gromacs/legacyheaders/update.h index 5ccdd64eba..398e505af5 100644 --- a/src/gromacs/legacyheaders/update.h +++ b/src/gromacs/legacyheaders/update.h @@ -42,7 +42,6 @@ #include "mshift.h" #include "tgroup.h" #include "network.h" -#include "gromacs/random/random.h" #include "vec.h" @@ -72,7 +71,6 @@ void update_tcouple(gmx_int64_t step, t_inputrec *inputrec, t_state *state, gmx_ekindata_t *ekind, - gmx_update_t upd, t_extmass *MassQ, t_mdatoms *md ); @@ -107,7 +105,7 @@ void update_coords(FILE *fplog, /* Return TRUE if OK, FALSE in case of Shake Error */ -extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_int64_t step, t_mdatoms *md, t_state *state, gmx_update_t upd, t_idef *idef, gmx_constr_t constr); +extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_int64_t step, const t_commrec *cr, t_mdatoms *md, t_state *state, gmx_update_t upd, gmx_constr_t constr); void update_constraints(FILE *fplog, gmx_int64_t step, @@ -175,7 +173,8 @@ restore_ekinstate_from_state(t_commrec *cr, void berendsen_tcoupl(t_inputrec *ir, gmx_ekindata_t *ekind, real dt); -void andersen_tcoupl(t_inputrec *ir, t_mdatoms *md, t_state *state, gmx_rng_t rng, real rate, t_idef *idef, int nblocks, int *sblock, gmx_bool *randatom, int *randatom_list, gmx_bool *randomize, real *boltzfac); +void andersen_tcoupl(t_inputrec *ir, gmx_int64_t step, + const t_commrec *cr, const t_mdatoms *md, t_state *state, real rate, const gmx_bool *randomize, const real *boltzfac); void nosehoover_tcoupl(t_grpopts *opts, gmx_ekindata_t *ekind, real dt, double xi[], double vxi[], t_extmass *MassQ); @@ -196,9 +195,9 @@ real NPT_energy(t_inputrec *ir, t_state *state, t_extmass *MassQ); void NBaroT_trotter(t_grpopts *opts, real dt, double xi[], double vxi[], real *veta, t_extmass *MassQ); -void vrescale_tcoupl(t_inputrec *ir, gmx_ekindata_t *ekind, real dt, - double therm_integral[], - gmx_rng_t rng); +void vrescale_tcoupl(t_inputrec *ir, gmx_int64_t step, + gmx_ekindata_t *ekind, real dt, + double therm_integral[]); /* Compute temperature scaling. For V-rescale it is done in update. */ real vrescale_energy(t_grpopts *opts, double therm_integral[]); diff --git a/src/gromacs/mdlib/coupling.c b/src/gromacs/mdlib/coupling.c index 19002e5ea1..7d207b9d50 100644 --- a/src/gromacs/mdlib/coupling.c +++ b/src/gromacs/mdlib/coupling.c @@ -722,203 +722,50 @@ void berendsen_tcoupl(t_inputrec *ir, gmx_ekindata_t *ekind, real dt) } } -static int poisson_variate(real lambda, gmx_rng_t rng) +void andersen_tcoupl(t_inputrec *ir, gmx_int64_t step, + const t_commrec *cr, const t_mdatoms *md, t_state *state, real rate, const gmx_bool *randomize, const real *boltzfac) { + const int *gatindex = (DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL); + int i; + int gc = 0; - real L; - int k = 0; - real p = 1.0; - - L = exp(-lambda); - - do - { - k = k+1; - p *= gmx_rng_uniform_real(rng); - } - while (p > L); - - return k-1; -} - -void andersen_tcoupl(t_inputrec *ir, t_mdatoms *md, t_state *state, gmx_rng_t rng, real rate, t_idef *idef, int nblocks, int *sblock, gmx_bool *randatom, int *randatom_list, gmx_bool *randomize, real *boltzfac) -{ - t_grpopts *opts; - int i, j, k, d, len, n, ngtc, gc = 0; - int nshake, nsettle, nrandom, nrand_group; - real boltz, scal, reft, prand; - t_iatom *iatoms; - - /* convenience variables */ - opts = &ir->opts; - ngtc = opts->ngtc; + /* randomize the velocities of the selected particles */ - /* idef is only passed in if it's chance-per-particle andersen, so - it essentially serves as a boolean to determine which type of - andersen is being used */ - if (idef) + for (i = 0; i < md->homenr; i++) /* now loop over the list of atoms */ { + int ng = gatindex ? gatindex[i] : i; + gmx_bool bRandomize; - /* randomly atoms to randomize. However, all constraint - groups have to have either all of the atoms or none of the - atoms randomize. - - Algorithm: - 1. Select whether or not to randomize each atom to get the correct probability. - 2. Cycle through the constraint groups. - 2a. for each constraint group, determine the fraction f of that constraint group that are - chosen to be randomized. - 2b. all atoms in the constraint group are randomized with probability f. - */ - - nrandom = 0; - if ((rate < 0.05) && (md->homenr > 50)) - { - /* if the rate is relatively high, use a standard method, if low rate, - * use poisson */ - /* poisson distributions approxmation, more efficient for - * low rates, fewer random numbers required */ - nrandom = poisson_variate(md->homenr*rate, rng); /* how many do we randomize? Use Poisson. */ - /* now we know how many, choose them randomly. No worries about repeats, at this rate, it's negligible. - worst thing that happens, it lowers the true rate an negligible amount */ - for (i = 0; i < nrandom; i++) - { - randatom[(int)(gmx_rng_uniform_real(rng)*md->homenr)] = TRUE; - } - } - else + if (md->cTC) { - for (i = 0; i < md->homenr; i++) - { - if (gmx_rng_uniform_real(rng) < rate) - { - randatom[i] = TRUE; - nrandom++; - } - } + gc = md->cTC[i]; /* assign the atom to a temperature group if there are more than one */ } - - /* instead of looping over the constraint groups, if we had a - list of which atoms were in which constraint groups, we - could then loop over only the groups that are randomized - now. But that is not available now. Create later after - determining whether there actually is any slowing. */ - - /* first, loop through the settles to make sure all groups either entirely randomized, or not randomized. */ - - nsettle = idef->il[F_SETTLE].nr/2; - for (i = 0; i < nsettle; i++) + if (randomize[gc]) { - iatoms = idef->il[F_SETTLE].iatoms; - nrand_group = 0; - for (k = 0; k < 3; k++) /* settles are always 3 atoms, hardcoded */ + if (ir->etc == etcANDERSEN) { - if (randatom[iatoms[2*i+1]+k]) - { - nrand_group++; /* count the number of atoms to be shaken in the settles group */ - randatom[iatoms[2*i+1]+k] = FALSE; - nrandom--; - } + bRandomize = TRUE; } - if (nrand_group > 0) + else { - prand = (nrand_group)/3.0; /* use this fraction to compute the probability the - whole group is randomized */ - if (gmx_rng_uniform_real(rng) < prand) - { - for (k = 0; k < 3; k++) - { - randatom[iatoms[2*i+1]+k] = TRUE; /* mark them all to be randomized */ - } - nrandom += 3; - } - } - } + double uniform[2]; - /* now loop through the shake groups */ - nshake = nblocks; - for (i = 0; i < nshake; i++) - { - iatoms = &(idef->il[F_CONSTR].iatoms[sblock[i]]); - len = sblock[i+1]-sblock[i]; - nrand_group = 0; - for (k = 0; k < len; k++) - { - if (k%3 != 0) - { /* only 2/3 of the sblock items are atoms, the others are labels */ - if (randatom[iatoms[k]]) - { - nrand_group++; - randatom[iatoms[k]] = FALSE; /* need to mark it false here in case the atom is in more than - one group in the shake block */ - nrandom--; - } - } - } - if (nrand_group > 0) - { - prand = (nrand_group)/(1.0*(2*len/3)); - if (gmx_rng_uniform_real(rng) < prand) - { - for (k = 0; k < len; k++) - { - if (k%3 != 0) - { /* only 2/3 of the sblock items are atoms, the others are labels */ - randatom[iatoms[k]] = TRUE; /* randomize all of them */ - nrandom++; - } - } - } + gmx_rng_cycle_2uniform(step*2, ng, ir->andersen_seed, RND_SEED_ANDERSEN, uniform); + bRandomize = (uniform[0] < rate); } - } - if (nrandom > 0) - { - n = 0; - for (i = 0; i < md->homenr; i++) /* now loop over the list of atoms */ + if (bRandomize) { - if (randatom[i]) + real scal, gauss[3]; + int d; + + scal = sqrt(boltzfac[gc]*md->invmass[i]); + gmx_rng_cycle_3gaussian_table(step*2+1, ng, ir->andersen_seed, RND_SEED_ANDERSEN, gauss); + for (d = 0; d < DIM; d++) { - randatom_list[n] = i; - n++; + state->v[i][d] = scal*gauss[d]; } } - nrandom = n; /* there are some values of nrandom for which - this algorithm won't work; for example all - water molecules and nrandom =/= 3. Better to - recount and use this number (which we - calculate anyway: it will not affect - the average number of atoms accepted. - */ - } - } - else - { - /* if it's andersen-massive, then randomize all the atoms */ - nrandom = md->homenr; - for (i = 0; i < nrandom; i++) - { - randatom_list[i] = i; - } - } - - /* randomize the velocities of the selected particles */ - - for (i = 0; i < nrandom; i++) /* now loop over the list of atoms */ - { - n = randatom_list[i]; - if (md->cTC) - { - gc = md->cTC[n]; /* assign the atom to a temperature group if there are more than one */ - } - if (randomize[gc]) - { - scal = sqrt(boltzfac[gc]*md->invmass[n]); - for (d = 0; d < DIM; d++) - { - state->v[n][d] = scal*gmx_rng_gaussian_table(rng); - } } - randatom[n] = FALSE; /* unmark this atom for randomization */ } } @@ -1507,11 +1354,14 @@ real NPT_energy(t_inputrec *ir, t_state *state, t_extmass *MassQ) return ener_npt; } -static real vrescale_gamdev(int ia, gmx_rng_t rng) +static real vrescale_gamdev(int ia, + gmx_int64_t step, gmx_int64_t *count, + gmx_int64_t seed1, gmx_int64_t seed2) /* Gamma distribution, adapted from numerical recipes */ { - int j; - real am, e, s, v1, v2, x, y; + int j; + real am, e, s, v1, v2, x, y; + double rnd[2]; if (ia < 6) { @@ -1520,7 +1370,8 @@ static real vrescale_gamdev(int ia, gmx_rng_t rng) x = 1.0; for (j = 1; j <= ia; j++) { - x *= gmx_rng_uniform_real(rng); + gmx_rng_cycle_2uniform(step, (*count)++, seed1, seed2, rnd); + x *= rnd[0]; } } while (x == 0); @@ -1534,8 +1385,9 @@ static real vrescale_gamdev(int ia, gmx_rng_t rng) { do { - v1 = gmx_rng_uniform_real(rng); - v2 = 2.0*gmx_rng_uniform_real(rng)-1.0; + gmx_rng_cycle_2uniform(step, (*count)++, seed1, seed2, rnd); + v1 = rnd[0]; + v2 = 2.0*rnd[1] - 1.0; } while (v1*v1 + v2*v2 > 1.0 || v1*v1*GMX_REAL_MAX < 3.0*ia); @@ -1549,44 +1401,59 @@ static real vrescale_gamdev(int ia, gmx_rng_t rng) } while (x <= 0.0); e = (1.0 + y*y)*exp(am*log(x/am) - s*y); + + gmx_rng_cycle_2uniform(step, (*count)++, seed1, seed2, rnd); } - while (gmx_rng_uniform_real(rng) > e); + while (rnd[0] > e); } return x; } -static real vrescale_sumnoises(int nn, gmx_rng_t rng) +static real gaussian_count(gmx_int64_t step, gmx_int64_t *count, + gmx_int64_t seed1, gmx_int64_t seed2) +{ + double rnd[2], x, y, r; + + do + { + gmx_rng_cycle_2uniform(step, (*count)++, seed1, seed2, rnd); + x = 2.0*rnd[0] - 1.0; + y = 2.0*rnd[1] - 1.0; + r = x*x + y*y; + } + while (r > 1.0 || r == 0.0); + + r = sqrt(-2.0*log(r)/r); + + return x*r; +} + +static real vrescale_sumnoises(int nn, + gmx_int64_t step, gmx_int64_t *count, + gmx_int64_t seed1, gmx_int64_t seed2) { /* * Returns the sum of n independent gaussian noises squared * (i.e. equivalent to summing the square of the return values * of nn calls to gmx_rng_gaussian_real).xs */ - real rr; + real r, gauss; - if (nn == 0) - { - return 0.0; - } - else if (nn == 1) - { - rr = gmx_rng_gaussian_real(rng); - return rr*rr; - } - else if (nn % 2 == 0) - { - return 2.0*vrescale_gamdev(nn/2, rng); - } - else + r = 2.0*vrescale_gamdev(nn/2, step, count, seed1, seed2); + + if (nn % 2 == 1) { - rr = gmx_rng_gaussian_real(rng); - return 2.0*vrescale_gamdev((nn-1)/2, rng) + rr*rr; + gauss = gaussian_count(step, count, seed1, seed2); + + r += gauss*gauss; } + + return r; } static real vrescale_resamplekin(real kk, real sigma, int ndeg, real taut, - gmx_rng_t rng) + gmx_int64_t step, gmx_int64_t seed) { /* * Generates a new value for the kinetic energy, @@ -1596,7 +1463,11 @@ static real vrescale_resamplekin(real kk, real sigma, int ndeg, real taut, * ndeg: number of degrees of freedom of the atoms to be thermalized * taut: relaxation time of the thermostat, in units of 'how often this routine is called' */ - real factor, rr; + /* rnd_count tracks the step-local state for the cycle random + * number generator. + */ + gmx_int64_t rnd_count = 0; + real factor, rr, ekin_new; if (taut > 0.1) { @@ -1606,15 +1477,20 @@ static real vrescale_resamplekin(real kk, real sigma, int ndeg, real taut, { factor = 0.0; } - rr = gmx_rng_gaussian_real(rng); - return + + rr = gaussian_count(step, &rnd_count, seed, RND_SEED_VRESCALE); + + ekin_new = kk + - (1.0 - factor)*(sigma*(vrescale_sumnoises(ndeg-1, rng) + rr*rr)/ndeg - kk) + + (1.0 - factor)*(sigma*(vrescale_sumnoises(ndeg-1, step, &rnd_count, seed, RND_SEED_VRESCALE) + rr*rr)/ndeg - kk) + 2.0*rr*sqrt(kk*sigma/ndeg*(1.0 - factor)*factor); + + return ekin_new; } -void vrescale_tcoupl(t_inputrec *ir, gmx_ekindata_t *ekind, real dt, - double therm_integral[], gmx_rng_t rng) +void vrescale_tcoupl(t_inputrec *ir, gmx_int64_t step, + gmx_ekindata_t *ekind, real dt, + double therm_integral[]) { t_grpopts *opts; int i; @@ -1639,7 +1515,8 @@ void vrescale_tcoupl(t_inputrec *ir, gmx_ekindata_t *ekind, real dt, Ek_ref = Ek_ref1*opts->nrdf[i]; Ek_new = vrescale_resamplekin(Ek, Ek_ref, opts->nrdf[i], - opts->tau_t[i]/dt, rng); + opts->tau_t[i]/dt, + ir->ld_seed, step); /* Analytically Ek_new>=0, but we check for rounding errors */ if (Ek_new <= 0) diff --git a/src/gromacs/mdlib/domdec.c b/src/gromacs/mdlib/domdec.c index 864c054358..a344ba5bd7 100644 --- a/src/gromacs/mdlib/domdec.c +++ b/src/gromacs/mdlib/domdec.c @@ -1586,37 +1586,6 @@ void dd_collect_state(gmx_domdec_t *dd, case estCGP: dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p); break; - case estLD_RNG: - if (state->nrngi == 1) - { - if (DDMASTER(dd)) - { - for (i = 0; i < state_local->nrng; i++) - { - state->ld_rng[i] = state_local->ld_rng[i]; - } - } - } - else - { - dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]), - state_local->ld_rng, state->ld_rng); - } - break; - case estLD_RNGI: - if (state->nrngi == 1) - { - if (DDMASTER(dd)) - { - state->ld_rngi[0] = state_local->ld_rngi[0]; - } - } - else - { - dd_gather(dd, sizeof(state->ld_rngi[0]), - state_local->ld_rngi, state->ld_rngi); - } - break; case estDISRE_INITF: case estDISRE_RM3TAV: case estORIRE_INITF: @@ -1658,8 +1627,6 @@ static void dd_realloc_state(t_state *state, rvec **f, int nalloc) case estCGP: srenew(state->cg_p, state->nalloc); break; - case estLD_RNG: - case estLD_RNGI: case estDISRE_INITF: case estDISRE_RM3TAV: case estORIRE_INITF: @@ -1917,32 +1884,6 @@ static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs, case estCGP: dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p); break; - case estLD_RNG: - if (state->nrngi == 1) - { - dd_bcastc(dd, - state_local->nrng*sizeof(state_local->ld_rng[0]), - state->ld_rng, state_local->ld_rng); - } - else - { - dd_scatter(dd, - state_local->nrng*sizeof(state_local->ld_rng[0]), - state->ld_rng, state_local->ld_rng); - } - break; - case estLD_RNGI: - if (state->nrngi == 1) - { - dd_bcastc(dd, sizeof(state_local->ld_rngi[0]), - state->ld_rngi, state_local->ld_rngi); - } - else - { - dd_scatter(dd, sizeof(state_local->ld_rngi[0]), - state->ld_rngi, state_local->ld_rngi); - } - break; case estDISRE_INITF: case estDISRE_RM3TAV: case estORIRE_INITF: diff --git a/src/gromacs/mdlib/domdec_top.c b/src/gromacs/mdlib/domdec_top.c index 06ac1876e8..059528533b 100644 --- a/src/gromacs/mdlib/domdec_top.c +++ b/src/gromacs/mdlib/domdec_top.c @@ -47,7 +47,6 @@ #include "vec.h" #include "pbc.h" #include "chargegroup.h" -#include "gromacs/random/random.h" #include "gromacs/gmxlib/topsort.h" #include "mtop_util.h" #include "mshift.h" @@ -1959,29 +1958,6 @@ void dd_init_local_state(gmx_domdec_t *dd, init_state(state_local, 0, buf[1], buf[2], buf[3], buf[4]); state_local->flags = buf[0]; - - /* With Langevin Dynamics we need to make proper storage space - * in the global and local state for the random numbers. - */ - if (state_local->flags & (1<nrngi > 1) - { - state_global->nrng = dd->nnodes*gmx_rng_n(); - srenew(state_global->ld_rng, state_global->nrng); - } - state_local->nrng = gmx_rng_n(); - snew(state_local->ld_rng, state_local->nrng); - } - if (state_local->flags & (1<nrngi > 1) - { - state_global->nrngi = dd->nnodes; - srenew(state_global->ld_rngi, state_global->nrngi); - } - snew(state_local->ld_rngi, 1); - } } static void check_link(t_blocka *link, int cg_gl, int cg_gl_j) diff --git a/src/gromacs/mdlib/expanded.c b/src/gromacs/mdlib/expanded.c index ce2aea3003..283dbbbf0c 100644 --- a/src/gromacs/mdlib/expanded.c +++ b/src/gromacs/mdlib/expanded.c @@ -87,9 +87,8 @@ static void init_df_history_weights(df_history_t *dfhist, t_expanded *expand, in /* Eventually should contain all the functions needed to initialize expanded ensemble before the md loop starts */ -extern void init_expanded_ensemble(gmx_bool bStateFromCP, t_inputrec *ir, gmx_rng_t *mcrng, df_history_t *dfhist) +extern void init_expanded_ensemble(gmx_bool bStateFromCP, t_inputrec *ir, df_history_t *dfhist) { - *mcrng = gmx_rng_init(ir->expandedvals->lmc_seed); if (!bStateFromCP) { init_df_history_weights(dfhist, ir->expandedvals, ir->fepvals->n_lambda); @@ -736,7 +735,8 @@ static gmx_bool UpdateWeights(int nlim, t_expanded *expand, df_history_t *dfhist return FALSE; } -static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, int fep_state, real *weighted_lamee, double *p_k, gmx_rng_t rng) +static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, int fep_state, real *weighted_lamee, double *p_k, + gmx_int64_t seed, gmx_int64_t step) { /* Choose new lambda value, and update transition matrix */ @@ -782,6 +782,9 @@ static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, i for (i = 0; i < expand->lmc_repeats; i++) { + double rnd[2]; + + gmx_rng_cycle_2uniform(step, i, seed, RND_SEED_EXPANDED, rnd); for (ifep = 0; ifep < nlim; ifep++) { @@ -823,7 +826,7 @@ static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, i accept[ifep] = 1.0; } /* Gibbs sampling */ - r1 = gmx_rng_uniform_real(rng); + r1 = rnd[0]; for (lamnew = minfep; lamnew <= maxfep; lamnew++) { if (r1 <= p_k[lamnew]) @@ -864,7 +867,7 @@ static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, i } } - r1 = gmx_rng_uniform_real(rng); + r1 = rnd[0]; for (lamtrial = minfep; lamtrial <= maxfep; lamtrial++) { pnorm = p_k[lamtrial]/remainder[fep_state]; @@ -886,7 +889,7 @@ static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, i { tprob = trialprob; } - r2 = gmx_rng_uniform_real(rng); + r2 = rnd[1]; if (r2 < tprob) { lamnew = lamtrial; @@ -947,7 +950,7 @@ static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, i else if ((expand->elmcmove == elmcmoveMETROPOLIS) || (expand->elmcmove == elmcmoveBARKER)) { /* use the metropolis sampler with trial +/- 1 */ - r1 = gmx_rng_uniform_real(rng); + r1 = rnd[0]; if (r1 < 0.5) { if (fep_state == 0) @@ -996,7 +999,7 @@ static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, i accept[lamtrial] = 1.0; } - r2 = gmx_rng_uniform_real(rng); + r2 = rnd[1]; if (r2 < tprob) { lamnew = lamtrial; @@ -1196,19 +1199,9 @@ extern void PrintFreeEnergyInfoToFile(FILE *outfile, t_lambda *fep, t_expanded * } } -extern void get_mc_state(gmx_rng_t rng, t_state *state) -{ - gmx_rng_get_state(rng, state->mc_rng, state->mc_rngi); -} - -extern void set_mc_state(gmx_rng_t rng, t_state *state) -{ - gmx_rng_set_state(rng, state->mc_rng, state->mc_rngi[0]); -} - extern int ExpandedEnsembleDynamics(FILE *log, t_inputrec *ir, gmx_enerdata_t *enerd, t_state *state, t_extmass *MassQ, int fep_state, df_history_t *dfhist, - gmx_int64_t step, gmx_rng_t mcrng, + gmx_int64_t step, rvec *v, t_mdatoms *mdatoms) /* Note that the state variable is only needed for simulated tempering, not Hamiltonian expanded ensemble. May be able to remove it after integrator refactoring. */ @@ -1321,7 +1314,8 @@ extern int ExpandedEnsembleDynamics(FILE *log, t_inputrec *ir, gmx_enerdata_t *e } } - lamnew = ChooseNewLambda(nlim, expand, dfhist, fep_state, weighted_lamee, p_k, mcrng); + lamnew = ChooseNewLambda(nlim, expand, dfhist, fep_state, weighted_lamee, p_k, + ir->expandedvals->lmc_seed, step); /* if using simulated tempering, we need to adjust the temperatures */ if (ir->bSimTemp && (lamnew != fep_state)) /* only need to change the temperatures if we change the state */ { diff --git a/src/gromacs/mdlib/init.c b/src/gromacs/mdlib/init.c index 99105bd6b8..38bf4401f0 100644 --- a/src/gromacs/mdlib/init.c +++ b/src/gromacs/mdlib/init.c @@ -52,7 +52,6 @@ #include "mdrun.h" #include "names.h" #include "calcgrid.h" -#include "gromacs/random/random.h" #include "update.h" #include "mdebin.h" @@ -61,7 +60,7 @@ #define NOT_FINISHED(l1, l2) \ printf("not finished yet: lines %d .. %d in %s\n", l1, l2, __FILE__) -void set_state_entries(t_state *state, const t_inputrec *ir, int nnodes) +void set_state_entries(t_state *state, const t_inputrec *ir) { int nnhpres; @@ -109,31 +108,6 @@ void set_state_entries(t_state *state, const t_inputrec *ir, int nnodes) snew(state->cg_p, state->nalloc); } } - if (EI_SD(ir->eI) || ir->eI == eiBD || ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc)) - { - state->nrng = gmx_rng_n(); - state->nrngi = 1; - if (EI_SD(ir->eI) || ir->eI == eiBD || ETC_ANDERSEN(ir->etc)) - { - /* This will be correct later with DD */ - state->nrng *= nnodes; - state->nrngi *= nnodes; - } - state->flags |= ((1<ld_rng, state->nrng); - snew(state->ld_rngi, state->nrngi); - } - else - { - state->nrng = 0; - } - - if (ir->bExpanded) - { - state->nmcrng = gmx_rng_n(); - snew(state->mc_rng, state->nmcrng); - snew(state->mc_rngi, 1); - } state->nnhpres = 0; if (ir->ePBC != epbcNONE) @@ -190,10 +164,4 @@ void init_parallel(t_commrec *cr, t_inputrec *inputrec, gmx_mtop_t *mtop) { bcast_ir_mtop(cr, inputrec, mtop); - - if (inputrec->eI == eiBD || EI_SD(inputrec->eI) || ETC_ANDERSEN(inputrec->etc)) - { - /* Make sure the random seeds are different on each node */ - inputrec->ld_seed += cr->nodeid; - } } diff --git a/src/gromacs/mdlib/minimize.c b/src/gromacs/mdlib/minimize.c index 56aead1d94..26122e26d4 100644 --- a/src/gromacs/mdlib/minimize.c +++ b/src/gromacs/mdlib/minimize.c @@ -49,7 +49,6 @@ #include "main.h" #include "force.h" #include "macros.h" -#include "gromacs/random/random.h" #include "names.h" #include "gmx_fatal.h" #include "txtdump.h" diff --git a/src/gromacs/mdlib/sim_util.c b/src/gromacs/mdlib/sim_util.c index df0e1922c8..e154f2eba2 100644 --- a/src/gromacs/mdlib/sim_util.c +++ b/src/gromacs/mdlib/sim_util.c @@ -69,7 +69,6 @@ #include "constr.h" #include "xvgr.h" #include "copyrite.h" -#include "gromacs/random/random.h" #include "domdec.h" #include "genborn.h" #include "nbnxn_atomdata.h" diff --git a/src/gromacs/mdlib/tpi.c b/src/gromacs/mdlib/tpi.c index 3f944c9e4c..3d6d5a0894 100644 --- a/src/gromacs/mdlib/tpi.c +++ b/src/gromacs/mdlib/tpi.c @@ -142,14 +142,18 @@ double do_tpi(FILE *fplog, t_commrec *cr, double embU, sum_embU, *sum_UgembU, V, V_all, VembU_all; t_trxstatus *status; t_trxframe rerun_fr; - gmx_bool bDispCorr, bCharge, bRFExcl, bNotLastFrame, bStateChanged, bNS, bOurStep; + gmx_bool bDispCorr, bCharge, bRFExcl, bNotLastFrame, bStateChanged, bNS; tensor force_vir, shake_vir, vir, pres; int cg_tp, a_tp0, a_tp1, ngid, gid_tp, nener, e; rvec *x_mol; rvec mu_tot, x_init, dx, x_tp; - int nnodes, frame, nsteps, step; + int nnodes, frame; + gmx_int64_t frame_step_prev, frame_step; + gmx_int64_t nsteps, stepblocksize = 0, step; + gmx_int64_t rnd_count_stride, rnd_count; + gmx_int64_t seed; + double rnd[4]; int i, start, end; - gmx_rng_t tpi_rand; FILE *fp_tpi = NULL; char *ptr, *dump_pdb, **leg, str[STRLEN], str2[STRLEN]; double dbl, dump_ener; @@ -310,7 +314,7 @@ double do_tpi(FILE *fplog, t_commrec *cr, a_tp1-a_tp0, bCharge ? "with" : "without"); fprintf(fplog, "\nWill insert %d times in each frame of %s\n", - nsteps, opt2fn("-rerun", nfile, fnm)); + (int)nsteps, opt2fn("-rerun", nfile, fnm)); } if (!bCavity) @@ -356,8 +360,17 @@ double do_tpi(FILE *fplog, t_commrec *cr, } snew(sum_UgembU, nener); - /* Initialize random generator */ - tpi_rand = gmx_rng_init(inputrec->ld_seed); + /* Copy the random seed set by the user */ + seed = inputrec->ld_seed; + /* We use the frame step number as one random counter. + * The second counter use the insertion (step) count. But we + * need multiple random numbers per insertion. This number is + * not fixed, since we generate random locations in a sphere + * by putting locations in a cube and some of these fail. + * A count of 20 is already extremely unlikely, so 10000 is + * a safe margin for random numbers per insertion. + */ + rnd_count_stride = 10000; if (MASTER(cr)) { @@ -422,6 +435,9 @@ double do_tpi(FILE *fplog, t_commrec *cr, nbin = 10; snew(bin, nbin); + /* Avoid frame step numbers <= -1 */ + frame_step_prev = -1; + bNotLastFrame = read_first_frame(oenv, &status, opt2fn("-rerun", nfile, fnm), &rerun_fr, TRX_NEED_X); frame = 0; @@ -438,6 +454,18 @@ double do_tpi(FILE *fplog, t_commrec *cr, refvolshift = log(det(rerun_fr.box)); + switch (inputrec->eI) + { + case eiTPI: + stepblocksize = inputrec->nstlist; + break; + case eiTPIC: + stepblocksize = 1; + break; + default: + gmx_fatal(FARGS, "Unknown integrator %s", ei_names[inputrec->eI]); + } + #ifdef GMX_SIMD_X86_SSE2_OR_HIGHER /* Make sure we don't detect SSE overflow generated before this point */ gmx_mm_check_and_reset_overflow(); @@ -445,6 +473,18 @@ double do_tpi(FILE *fplog, t_commrec *cr, while (bNotLastFrame) { + frame_step = rerun_fr.step; + if (frame_step <= frame_step_prev) + { + /* We don't have step number in the trajectory file, + * or we have constant or decreasing step numbers. + * Ensure we have increasing step numbers, since we use + * the step numbers as a counter for random numbers. + */ + frame_step = frame_step_prev + 1; + } + frame_step_prev = frame_step; + lambda = rerun_fr.lambda; t = rerun_fr.time; @@ -466,11 +506,18 @@ double do_tpi(FILE *fplog, t_commrec *cr, bStateChanged = TRUE; bNS = TRUE; - for (step = 0; step < nsteps; step++) + + step = cr->nodeid*stepblocksize; + while (step < nsteps) { - /* In parallel all nodes generate all random configurations. - * In that way the result is identical to a single cpu tpi run. + /* Initialize the second counter for random numbers using + * the insertion step index. This ensures that we get + * the same random numbers independently of how many + * MPI ranks we use. Also for the same seed, we get + * the same initial random sequence for different nsteps. */ + rnd_count = step*rnd_count_stride; + if (!bCavity) { /* Random insertion in the whole volume */ @@ -478,9 +525,12 @@ double do_tpi(FILE *fplog, t_commrec *cr, if (bNS) { /* Generate a random position in the box */ - x_init[XX] = gmx_rng_uniform_real(tpi_rand)*state->box[XX][XX]; - x_init[YY] = gmx_rng_uniform_real(tpi_rand)*state->box[YY][YY]; - x_init[ZZ] = gmx_rng_uniform_real(tpi_rand)*state->box[ZZ][ZZ]; + gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd); + gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2); + for (d = 0; d < DIM; d++) + { + x_init[d] = rnd[d]*state->box[d][d]; + } } if (inputrec->nstlist == 1) { @@ -491,9 +541,12 @@ double do_tpi(FILE *fplog, t_commrec *cr, /* Generate coordinates within |dx|=drmax of x_init */ do { - dx[XX] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax; - dx[YY] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax; - dx[ZZ] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax; + gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd); + gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2); + for (d = 0; d < DIM; d++) + { + dx[d] = (2*rnd[d] - 1)*drmax; + } } while (norm2(dx) > drmax*drmax); rvec_add(x_init, dx, x_tp); @@ -534,9 +587,12 @@ double do_tpi(FILE *fplog, t_commrec *cr, /* Generate coordinates within |dx|=drmax of x_init */ do { - dx[XX] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax; - dx[YY] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax; - dx[ZZ] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax; + gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd); + gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2); + for (d = 0; d < DIM; d++) + { + dx[d] = (2*rnd[d] - 1)*drmax; + } } while (norm2(dx) > drmax*drmax); rvec_add(x_init, dx, x_tp); @@ -555,10 +611,12 @@ double do_tpi(FILE *fplog, t_commrec *cr, copy_rvec(x_mol[i-a_tp0], state->x[i]); } /* Rotate the molecule randomly */ + gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd); + gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2); rotate_conf(a_tp1-a_tp0, state->x+a_tp0, NULL, - 2*M_PI*gmx_rng_uniform_real(tpi_rand), - 2*M_PI*gmx_rng_uniform_real(tpi_rand), - 2*M_PI*gmx_rng_uniform_real(tpi_rand)); + 2*M_PI*rnd[0], + 2*M_PI*rnd[1], + 2*M_PI*rnd[2]); /* Shift to the insertion location */ for (i = a_tp0; i < a_tp1; i++) { @@ -566,188 +624,176 @@ double do_tpi(FILE *fplog, t_commrec *cr, } } - /* Check if this insertion belongs to this node */ - bOurStep = TRUE; - if (PAR(cr)) + /* Clear some matrix variables */ + clear_mat(force_vir); + clear_mat(shake_vir); + clear_mat(vir); + clear_mat(pres); + + /* Set the charge group center of mass of the test particle */ + copy_rvec(x_init, fr->cg_cm[top->cgs.nr-1]); + + /* Calc energy (no forces) on new positions. + * Since we only need the intermolecular energy + * and the RF exclusion terms of the inserted molecule occur + * within a single charge group we can pass NULL for the graph. + * This also avoids shifts that would move charge groups + * out of the box. + * + * Some checks above ensure than we can not have + * twin-range interactions together with nstlist > 1, + * therefore we do not need to remember the LR energies. + */ + /* Make do_force do a single node force calculation */ + cr->nnodes = 1; + do_force(fplog, cr, inputrec, + step, nrnb, wcycle, top, &top_global->groups, + state->box, state->x, &state->hist, + f, force_vir, mdatoms, enerd, fcd, + state->lambda, + NULL, fr, NULL, mu_tot, t, NULL, NULL, FALSE, + GMX_FORCE_NONBONDED | GMX_FORCE_ENERGY | + (bNS ? GMX_FORCE_DYNAMICBOX | GMX_FORCE_NS | GMX_FORCE_DO_LR : 0) | + (bStateChanged ? GMX_FORCE_STATECHANGED : 0)); + cr->nnodes = nnodes; + bStateChanged = FALSE; + bNS = FALSE; + + /* Calculate long range corrections to pressure and energy */ + calc_dispcorr(fplog, inputrec, fr, step, top_global->natoms, state->box, + lambda, pres, vir, &prescorr, &enercorr, &dvdlcorr); + /* figure out how to rearrange the next 4 lines MRS 8/4/2009 */ + enerd->term[F_DISPCORR] = enercorr; + enerd->term[F_EPOT] += enercorr; + enerd->term[F_PRES] += prescorr; + enerd->term[F_DVDL_VDW] += dvdlcorr; + + epot = enerd->term[F_EPOT]; + bEnergyOutOfBounds = FALSE; +#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER + /* With SSE the energy can overflow, check for this */ + if (gmx_mm_check_and_reset_overflow()) { - switch (inputrec->eI) + if (debug) { - case eiTPI: - bOurStep = ((step / inputrec->nstlist) % nnodes == cr->nodeid); - break; - case eiTPIC: - bOurStep = (step % nnodes == cr->nodeid); - break; - default: - gmx_fatal(FARGS, "Unknown integrator %s", ei_names[inputrec->eI]); + fprintf(debug, "Found an SSE overflow, assuming the energy is out of bounds\n"); } + bEnergyOutOfBounds = TRUE; } - if (bOurStep) - { - /* Clear some matrix variables */ - clear_mat(force_vir); - clear_mat(shake_vir); - clear_mat(vir); - clear_mat(pres); - - /* Set the charge group center of mass of the test particle */ - copy_rvec(x_init, fr->cg_cm[top->cgs.nr-1]); - - /* Calc energy (no forces) on new positions. - * Since we only need the intermolecular energy - * and the RF exclusion terms of the inserted molecule occur - * within a single charge group we can pass NULL for the graph. - * This also avoids shifts that would move charge groups - * out of the box. - * - * Some checks above ensure than we can not have - * twin-range interactions together with nstlist > 1, - * therefore we do not need to remember the LR energies. - */ - /* Make do_force do a single node force calculation */ - cr->nnodes = 1; - do_force(fplog, cr, inputrec, - step, nrnb, wcycle, top, &top_global->groups, - state->box, state->x, &state->hist, - f, force_vir, mdatoms, enerd, fcd, - state->lambda, - NULL, fr, NULL, mu_tot, t, NULL, NULL, FALSE, - GMX_FORCE_NONBONDED | GMX_FORCE_ENERGY | - (bNS ? GMX_FORCE_DYNAMICBOX | GMX_FORCE_NS | GMX_FORCE_DO_LR : 0) | - (bStateChanged ? GMX_FORCE_STATECHANGED : 0)); - cr->nnodes = nnodes; - bStateChanged = FALSE; - bNS = FALSE; - - /* Calculate long range corrections to pressure and energy */ - calc_dispcorr(fplog, inputrec, fr, step, top_global->natoms, state->box, - lambda, pres, vir, &prescorr, &enercorr, &dvdlcorr); - /* figure out how to rearrange the next 4 lines MRS 8/4/2009 */ - enerd->term[F_DISPCORR] = enercorr; - enerd->term[F_EPOT] += enercorr; - enerd->term[F_PRES] += prescorr; - enerd->term[F_DVDL_VDW] += dvdlcorr; - - epot = enerd->term[F_EPOT]; - bEnergyOutOfBounds = FALSE; -#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER - /* With SSE the energy can overflow, check for this */ - if (gmx_mm_check_and_reset_overflow()) - { - if (debug) - { - fprintf(debug, "Found an SSE overflow, assuming the energy is out of bounds\n"); - } - bEnergyOutOfBounds = TRUE; - } #endif - /* If the compiler doesn't optimize this check away - * we catch the NAN energies. - * The epot>GMX_REAL_MAX check catches inf values, - * which should nicely result in embU=0 through the exp below, - * but it does not hurt to check anyhow. - */ - /* Non-bonded Interaction usually diverge at r=0. - * With tabulated interaction functions the first few entries - * should be capped in a consistent fashion between - * repulsion, dispersion and Coulomb to avoid accidental - * negative values in the total energy. - * The table generation code in tables.c does this. - * With user tbales the user should take care of this. - */ - if (epot != epot || epot > GMX_REAL_MAX) + /* If the compiler doesn't optimize this check away + * we catch the NAN energies. + * The epot>GMX_REAL_MAX check catches inf values, + * which should nicely result in embU=0 through the exp below, + * but it does not hurt to check anyhow. + */ + /* Non-bonded Interaction usually diverge at r=0. + * With tabulated interaction functions the first few entries + * should be capped in a consistent fashion between + * repulsion, dispersion and Coulomb to avoid accidental + * negative values in the total energy. + * The table generation code in tables.c does this. + * With user tbales the user should take care of this. + */ + if (epot != epot || epot > GMX_REAL_MAX) + { + bEnergyOutOfBounds = TRUE; + } + if (bEnergyOutOfBounds) + { + if (debug) { - bEnergyOutOfBounds = TRUE; + fprintf(debug, "\n time %.3f, step %d: non-finite energy %f, using exp(-bU)=0\n", t, (int)step, epot); } - if (bEnergyOutOfBounds) + embU = 0; + } + else + { + embU = exp(-beta*epot); + sum_embU += embU; + /* Determine the weighted energy contributions of each energy group */ + e = 0; + sum_UgembU[e++] += epot*embU; + if (fr->bBHAM) { - if (debug) + for (i = 0; i < ngid; i++) { - fprintf(debug, "\n time %.3f, step %d: non-finite energy %f, using exp(-bU)=0\n", t, step, epot); + sum_UgembU[e++] += + (enerd->grpp.ener[egBHAMSR][GID(i, gid_tp, ngid)] + + enerd->grpp.ener[egBHAMLR][GID(i, gid_tp, ngid)])*embU; } - embU = 0; } else { - embU = exp(-beta*epot); - sum_embU += embU; - /* Determine the weighted energy contributions of each energy group */ - e = 0; - sum_UgembU[e++] += epot*embU; - if (fr->bBHAM) - { - for (i = 0; i < ngid; i++) - { - sum_UgembU[e++] += - (enerd->grpp.ener[egBHAMSR][GID(i, gid_tp, ngid)] + - enerd->grpp.ener[egBHAMLR][GID(i, gid_tp, ngid)])*embU; - } - } - else - { - for (i = 0; i < ngid; i++) - { - sum_UgembU[e++] += - (enerd->grpp.ener[egLJSR][GID(i, gid_tp, ngid)] + - enerd->grpp.ener[egLJLR][GID(i, gid_tp, ngid)])*embU; - } - } - if (bDispCorr) + for (i = 0; i < ngid; i++) { - sum_UgembU[e++] += enerd->term[F_DISPCORR]*embU; - } - if (bCharge) - { - for (i = 0; i < ngid; i++) - { - sum_UgembU[e++] += - (enerd->grpp.ener[egCOULSR][GID(i, gid_tp, ngid)] + - enerd->grpp.ener[egCOULLR][GID(i, gid_tp, ngid)])*embU; - } - if (bRFExcl) - { - sum_UgembU[e++] += enerd->term[F_RF_EXCL]*embU; - } - if (EEL_FULL(fr->eeltype)) - { - sum_UgembU[e++] += enerd->term[F_COUL_RECIP]*embU; - } + sum_UgembU[e++] += + (enerd->grpp.ener[egLJSR][GID(i, gid_tp, ngid)] + + enerd->grpp.ener[egLJLR][GID(i, gid_tp, ngid)])*embU; } } - - if (embU == 0 || beta*epot > bU_bin_limit) + if (bDispCorr) { - bin[0]++; + sum_UgembU[e++] += enerd->term[F_DISPCORR]*embU; } - else + if (bCharge) { - i = (int)((bU_logV_bin_limit - - (beta*epot - logV + refvolshift))*invbinw - + 0.5); - if (i < 0) + for (i = 0; i < ngid; i++) + { + sum_UgembU[e++] += + (enerd->grpp.ener[egCOULSR][GID(i, gid_tp, ngid)] + + enerd->grpp.ener[egCOULLR][GID(i, gid_tp, ngid)])*embU; + } + if (bRFExcl) { - i = 0; + sum_UgembU[e++] += enerd->term[F_RF_EXCL]*embU; } - if (i >= nbin) + if (EEL_FULL(fr->eeltype)) { - realloc_bins(&bin, &nbin, i+10); + sum_UgembU[e++] += enerd->term[F_COUL_RECIP]*embU; } - bin[i]++; } + } - if (debug) + if (embU == 0 || beta*epot > bU_bin_limit) + { + bin[0]++; + } + else + { + i = (int)((bU_logV_bin_limit + - (beta*epot - logV + refvolshift))*invbinw + + 0.5); + if (i < 0) { - fprintf(debug, "TPI %7d %12.5e %12.5f %12.5f %12.5f\n", - step, epot, x_tp[XX], x_tp[YY], x_tp[ZZ]); + i = 0; } - - if (dump_pdb && epot <= dump_ener) + if (i >= nbin) { - sprintf(str, "t%g_step%d.pdb", t, step); - sprintf(str2, "t: %f step %d ener: %f", t, step, epot); - write_sto_conf_mtop(str, str2, top_global, state->x, state->v, - inputrec->ePBC, state->box); + realloc_bins(&bin, &nbin, i+10); } + bin[i]++; + } + + if (debug) + { + fprintf(debug, "TPI %7d %12.5e %12.5f %12.5f %12.5f\n", + (int)step, epot, x_tp[XX], x_tp[YY], x_tp[ZZ]); + } + + if (dump_pdb && epot <= dump_ener) + { + sprintf(str, "t%g_step%d.pdb", t, (int)step); + sprintf(str2, "t: %f step %d ener: %f", t, (int)step, epot); + write_sto_conf_mtop(str, str2, top_global, state->x, state->v, + inputrec->ePBC, state->box); + } + + step++; + if ((step/stepblocksize) % cr->nnodes != cr->nodeid) + { + /* Skip all steps assigned to the other MPI ranks */ + step += (cr->nnodes - 1)*stepblocksize; } } diff --git a/src/gromacs/mdlib/update.c b/src/gromacs/mdlib/update.c index 2d904cba90..07dff61a03 100644 --- a/src/gromacs/mdlib/update.c +++ b/src/gromacs/mdlib/update.c @@ -91,12 +91,6 @@ typedef struct { } gmx_sd_sigma_t; typedef struct { - /* The random state for ngaussrand threads. - * Normal thermostats need just 1 random number generator, - * but SD and BD with OpenMP parallelization need 1 for each thread. - */ - int ngaussrand; - gmx_rng_t *gaussrand; /* BD stuff */ real *bd_rf; /* SD stuff */ @@ -116,11 +110,6 @@ typedef struct gmx_update rvec *xp; int xp_nalloc; - /* variable size arrays for andersen */ - gmx_bool *randatom; - int *randatom_list; - gmx_bool randatom_list_init; - /* Variables for the deform algorithm */ gmx_int64_t deformref_step; matrix deformref_box; @@ -476,43 +465,7 @@ static void do_update_visc(int start, int nrend, double dt, } } -/* Allocates and initializes sd->gaussrand[i] for i=1, ingaussrand, - * Using seeds generated from sd->gaussrand[0]. - */ -static void init_multiple_gaussrand(gmx_stochd_t *sd) -{ - int ngr, i; - unsigned int *seed; - - ngr = sd->ngaussrand; - snew(seed, ngr); - - for (i = 1; i < ngr; i++) - { - seed[i] = gmx_rng_uniform_uint32(sd->gaussrand[0]); - } - - if (ngr != gmx_omp_nthreads_get(emntUpdate)) - { - gmx_incons("The number of Gaussian number generators should be equal to gmx_omp_nthreads_get(emntUpdate)"); - } - -#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate)) - { - int th; - - th = gmx_omp_get_thread_num(); - if (th > 0) - { - /* Initialize on each thread to get memory allocated thread-local */ - sd->gaussrand[th] = gmx_rng_init(seed[th]); - } - } - - sfree(seed); -} - -static gmx_stochd_t *init_stochd(t_inputrec *ir, int nthreads) +static gmx_stochd_t *init_stochd(t_inputrec *ir) { gmx_stochd_t *sd; gmx_sd_const_t *sdc; @@ -521,30 +474,6 @@ static gmx_stochd_t *init_stochd(t_inputrec *ir, int nthreads) snew(sd, 1); - /* Initiate random number generator for langevin type dynamics, - * for BD, SD or velocity rescaling temperature coupling. - */ - if (ir->eI == eiBD || EI_SD(ir->eI)) - { - sd->ngaussrand = nthreads; - } - else - { - sd->ngaussrand = 1; - } - snew(sd->gaussrand, sd->ngaussrand); - - /* Initialize the first random generator */ - sd->gaussrand[0] = gmx_rng_init(ir->ld_seed); - - if (sd->ngaussrand > 1) - { - /* Initialize the rest of the random number generators, - * using the first one to generate seeds. - */ - init_multiple_gaussrand(sd); - } - ngtc = ir->opts.ngtc; if (ir->eI == eiBD) @@ -628,42 +557,6 @@ static gmx_stochd_t *init_stochd(t_inputrec *ir, int nthreads) return sd; } -void get_stochd_state(gmx_update_t upd, t_state *state) -{ - /* Note that we only get the state of the first random generator, - * even if there are multiple. This avoids repetition. - */ - gmx_rng_get_state(upd->sd->gaussrand[0], state->ld_rng, state->ld_rngi); -} - -void set_stochd_state(gmx_update_t upd, t_state *state) -{ - gmx_stochd_t *sd; - int i; - - sd = upd->sd; - - gmx_rng_set_state(sd->gaussrand[0], state->ld_rng, state->ld_rngi[0]); - - if (sd->ngaussrand > 1) - { - /* We only end up here with SD or BD with OpenMP. - * Destroy and reinitialize the rest of the random number generators, - * using seeds generated from the first one. - * Although this doesn't recover the previous state, - * it at least avoids repetition, which is most important. - * Exaclty restoring states with all MPI+OpenMP setups is difficult - * and as the integrator is random to start with, doesn't gain us much. - */ - for (i = 1; i < sd->ngaussrand; i++) - { - gmx_rng_destroy(sd->gaussrand[i]); - } - - init_multiple_gaussrand(sd); - } -} - gmx_update_t init_update(t_inputrec *ir) { t_gmx_update *upd; @@ -672,27 +565,24 @@ gmx_update_t init_update(t_inputrec *ir) if (ir->eI == eiBD || EI_SD(ir->eI) || ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc)) { - upd->sd = init_stochd(ir, gmx_omp_nthreads_get(emntUpdate)); + upd->sd = init_stochd(ir); } - upd->xp = NULL; - upd->xp_nalloc = 0; - upd->randatom = NULL; - upd->randatom_list = NULL; - upd->randatom_list_init = FALSE; /* we have not yet cleared the data structure at this point */ + upd->xp = NULL; + upd->xp_nalloc = 0; return upd; } static void do_update_sd1(gmx_stochd_t *sd, - gmx_rng_t gaussrand, int start, int nrend, double dt, rvec accel[], ivec nFreeze[], real invmass[], unsigned short ptype[], unsigned short cFREEZE[], unsigned short cACC[], unsigned short cTC[], rvec x[], rvec xprime[], rvec v[], rvec f[], - int ngtc, real tau_t[], real ref_t[]) + int ngtc, real tau_t[], real ref_t[], + gmx_int64_t step, int seed, int* gatindex) { gmx_sd_const_t *sdc; gmx_sd_sigma_t *sig; @@ -713,6 +603,8 @@ static void do_update_sd1(gmx_stochd_t *sd, for (n = start; n < nrend; n++) { + real rnd[3]; + int ng = gatindex ? gatindex[n] : n; ism = sqrt(invmass[n]); if (cFREEZE) { @@ -727,11 +619,12 @@ static void do_update_sd1(gmx_stochd_t *sd, gt = cTC[n]; } + gmx_rng_cycle_3gaussian_table(step, ng, seed, RND_SEED_UPDATE, rnd); for (d = 0; d < DIM; d++) { if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d]) { - sd_V = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand); + sd_V = ism*sig[gt].V*rnd[d]; v[n][d] = v[n][d]*sdc[gt].em + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em) @@ -783,7 +676,6 @@ static void do_update_sd2_Tconsts(gmx_stochd_t *sd, } static void do_update_sd2(gmx_stochd_t *sd, - gmx_rng_t gaussrand, gmx_bool bInitStep, int start, int nrend, rvec accel[], ivec nFreeze[], @@ -793,7 +685,8 @@ static void do_update_sd2(gmx_stochd_t *sd, rvec x[], rvec xprime[], rvec v[], rvec f[], rvec sd_X[], const real tau_t[], - gmx_bool bFirstHalf) + gmx_bool bFirstHalf, gmx_int64_t step, int seed, + int* gatindex) { gmx_sd_const_t *sdc; gmx_sd_sigma_t *sig; @@ -805,7 +698,7 @@ static void do_update_sd2(gmx_stochd_t *sd, int gf = 0, ga = 0, gt = 0; real vn = 0, Vmh, Xmh; real ism; - int n, d; + int n, d, ng; sdc = sd->sdc; sig = sd->sdsig; @@ -813,6 +706,8 @@ static void do_update_sd2(gmx_stochd_t *sd, for (n = start; n < nrend; n++) { + real rnd[6], rndi[3]; + ng = gatindex ? gatindex[n] : n; ism = sqrt(invmass[n]); if (cFREEZE) { @@ -827,6 +722,11 @@ static void do_update_sd2(gmx_stochd_t *sd, gt = cTC[n]; } + gmx_rng_cycle_6gaussian_table(step*2+(bFirstHalf ? 1 : 2), ng, seed, RND_SEED_UPDATE, rnd); + if (bInitStep) + { + gmx_rng_cycle_3gaussian_table(step*2, ng, seed, RND_SEED_UPDATE, rndi); + } for (d = 0; d < DIM; d++) { if (bFirstHalf) @@ -839,11 +739,11 @@ static void do_update_sd2(gmx_stochd_t *sd, { if (bInitStep) { - sd_X[n][d] = ism*sig[gt].X*gmx_rng_gaussian_table(gaussrand); + sd_X[n][d] = ism*sig[gt].X*rndi[d]; } Vmh = sd_X[n][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c) - + ism*sig[gt].Yv*gmx_rng_gaussian_table(gaussrand); - sd_V[n][d] = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand); + + ism*sig[gt].Yv*rnd[d*2]; + sd_V[n][d] = ism*sig[gt].V*rnd[d*2+1]; v[n][d] = vn*sdc[gt].em + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em) @@ -853,7 +753,6 @@ static void do_update_sd2(gmx_stochd_t *sd, } else { - /* Correct the velocities for the constraints. * This operation introduces some inaccuracy, * since the velocity is determined from differences in coordinates. @@ -862,8 +761,8 @@ static void do_update_sd2(gmx_stochd_t *sd, (xprime[n][d] - x[n][d])/(tau_t[gt]*(sdc[gt].eph - sdc[gt].emh)); Xmh = sd_V[n][d]*tau_t[gt]*sdc[gt].d/(sdc[gt].em-1) - + ism*sig[gt].Yx*gmx_rng_gaussian_table(gaussrand); - sd_X[n][d] = ism*sig[gt].X*gmx_rng_gaussian_table(gaussrand); + + ism*sig[gt].Yx*rnd[d*2]; + sd_X[n][d] = ism*sig[gt].X*rnd[d*2+1]; xprime[n][d] += sd_X[n][d] - Xmh; @@ -910,7 +809,8 @@ static void do_update_bd(int start, int nrend, double dt, unsigned short cFREEZE[], unsigned short cTC[], rvec x[], rvec xprime[], rvec v[], rvec f[], real friction_coefficient, - real *rf, gmx_rng_t gaussrand) + real *rf, gmx_int64_t step, int seed, + int* gatindex) { /* note -- these appear to be full step velocities . . . */ int gf = 0, gt = 0; @@ -925,6 +825,9 @@ static void do_update_bd(int start, int nrend, double dt, for (n = start; (n < nrend); n++) { + real rnd[3]; + int ng = gatindex ? gatindex[n] : n; + if (cFREEZE) { gf = cFREEZE[n]; @@ -933,19 +836,20 @@ static void do_update_bd(int start, int nrend, double dt, { gt = cTC[n]; } + gmx_rng_cycle_3gaussian_table(step, ng, seed, RND_SEED_UPDATE, rnd); for (d = 0; (d < DIM); d++) { if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d]) { if (friction_coefficient != 0) { - vn = invfr*f[n][d] + rf[gt]*gmx_rng_gaussian_table(gaussrand); + vn = invfr*f[n][d] + rf[gt]*rnd[d]; } else { /* NOTE: invmass = 2/(mass*friction_constant*dt) */ vn = 0.5*invmass[n]*f[n][d]*dt - + sqrt(0.5*invmass[n])*rf[gt]*gmx_rng_gaussian_table(gaussrand); + + sqrt(0.5*invmass[n])*rf[gt]*rnd[d]; } v[n][d] = vn; @@ -1376,7 +1280,6 @@ void update_tcouple(gmx_int64_t step, t_inputrec *inputrec, t_state *state, gmx_ekindata_t *ekind, - gmx_update_t upd, t_extmass *MassQ, t_mdatoms *md) @@ -1421,8 +1324,8 @@ void update_tcouple(gmx_int64_t step, state->nosehoover_xi, state->nosehoover_vxi, MassQ); break; case etcVRESCALE: - vrescale_tcoupl(inputrec, ekind, dttc, - state->therm_integral, upd->sd->gaussrand[0]); + vrescale_tcoupl(inputrec, step, ekind, dttc, + state->therm_integral); break; } /* rescale in place here */ @@ -1652,14 +1555,15 @@ void update_constraints(FILE *fplog, end_th = start + ((nrend-start)*(th+1))/nth; /* The second part of the SD integration */ - do_update_sd2(upd->sd, upd->sd->gaussrand[th], + do_update_sd2(upd->sd, FALSE, start_th, end_th, inputrec->opts.acc, inputrec->opts.nFreeze, md->invmass, md->ptype, md->cFREEZE, md->cACC, md->cTC, state->x, xprime, state->v, force, state->sd_X, inputrec->opts.tau_t, - FALSE); + FALSE, step, inputrec->ld_seed, + DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL); } inc_nrnb(nrnb, eNR_UPDATE, homenr); @@ -1951,26 +1855,28 @@ void update_coords(FILE *fplog, } break; case (eiSD1): - do_update_sd1(upd->sd, upd->sd->gaussrand[th], + do_update_sd1(upd->sd, start_th, end_th, dt, inputrec->opts.acc, inputrec->opts.nFreeze, md->invmass, md->ptype, md->cFREEZE, md->cACC, md->cTC, state->x, xprime, state->v, force, - inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t); + inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t, + step, inputrec->ld_seed, DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL); break; case (eiSD2): /* The SD update is done in 2 parts, because an extra constraint step * is needed */ - do_update_sd2(upd->sd, upd->sd->gaussrand[th], + do_update_sd2(upd->sd, bInitStep, start_th, end_th, inputrec->opts.acc, inputrec->opts.nFreeze, md->invmass, md->ptype, md->cFREEZE, md->cACC, md->cTC, state->x, xprime, state->v, force, state->sd_X, inputrec->opts.tau_t, - TRUE); + TRUE, step, inputrec->ld_seed, + DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL); break; case (eiBD): do_update_bd(start_th, end_th, dt, @@ -1978,7 +1884,8 @@ void update_coords(FILE *fplog, md->cFREEZE, md->cTC, state->x, xprime, state->v, force, inputrec->bd_fric, - upd->sd->bd_rf, upd->sd->gaussrand[th]); + upd->sd->bd_rf, + step, inputrec->ld_seed, DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL); break; case (eiVV): case (eiVVAK): @@ -2066,31 +1973,23 @@ void correct_ekin(FILE *log, int start, int end, rvec v[], rvec vcm, real mass[] mv[XX], mv[YY], mv[ZZ]); } -extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_int64_t step, t_mdatoms *md, t_state *state, gmx_update_t upd, t_idef *idef, gmx_constr_t constr) +extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_int64_t step, const t_commrec *cr, + t_mdatoms *md, t_state *state, gmx_update_t upd, gmx_constr_t constr) { int i; real rate = (ir->delta_t)/ir->opts.tau_t[0]; + + if (ir->etc == etcANDERSEN && constr != NULL) + { + gmx_fatal(FARGS, "Normal Andersen is currently not supported with constraints, use massive Andersen instead"); + } + /* proceed with andersen if 1) it's fixed probability per particle andersen or 2) it's massive andersen and it's tau_t/dt */ if ((ir->etc == etcANDERSEN) || do_per_step(step, (int)(1.0/rate))) { - srenew(upd->randatom, state->nalloc); - srenew(upd->randatom_list, state->nalloc); - if (upd->randatom_list_init == FALSE) - { - for (i = 0; i < state->nalloc; i++) - { - upd->randatom[i] = FALSE; - upd->randatom_list[i] = 0; - } - upd->randatom_list_init = TRUE; - } - andersen_tcoupl(ir, md, state, upd->sd->gaussrand[0], rate, - (ir->etc == etcANDERSEN) ? idef : NULL, - constr ? get_nblocks(constr) : 0, - constr ? get_sblock(constr) : NULL, - upd->randatom, upd->randatom_list, + andersen_tcoupl(ir, step, cr, md, state, rate, upd->sd->randomize_group, upd->sd->boltzfac); return TRUE; } diff --git a/src/gromacs/random/random.c b/src/gromacs/random/random.c index ab103a9d7a..b70fddfcde 100644 --- a/src/gromacs/random/random.c +++ b/src/gromacs/random/random.c @@ -51,6 +51,8 @@ #include #endif +#include "external/Random123-1.08/include/Random123/threefry.h" + #include "gromacs/math/utilities.h" #include "random_gausstable.h" @@ -60,15 +62,15 @@ #define RNG_UPPER_MASK 0x80000000UL /* most significant w-r bits */ #define RNG_LOWER_MASK 0x7fffffffUL /* least significant r bits */ -/* Note that if you change the size of the Gaussian table you will also - * have to generate new initialization data for the table in +/* Note that if you change the size of the Gaussian table you will + * also have to generate new initialization data for the table in * gmx_random_gausstable.h * * A routine print_gaussian_table() is in contrib/random.c * for convenience - use it if you need a different size of the table. */ #define GAUSS_TABLE 14 /* the size of the gauss table is 2^GAUSS_TABLE */ -#define GAUSS_SHIFT (32 - GAUSS_TABLE) +#define GAUSS_MASK ((1 << GAUSS_TABLE) - 1) struct gmx_rng { @@ -387,10 +389,8 @@ gmx_rng_uniform_real(gmx_rng_t rng) * we are limited to an accuracy of 1e-7. */ } - - - real + gmx_rng_gaussian_table(gmx_rng_t rng) { unsigned int i; @@ -398,5 +398,52 @@ gmx_rng_gaussian_table(gmx_rng_t rng) i = gmx_rng_uniform_uint32(rng); /* The Gaussian table is a static constant in this file */ - return gaussian_table[i >> GAUSS_SHIFT]; + return gaussian_table[i >> (32 - GAUSS_TABLE)]; +} + +void +gmx_rng_cycle_2uniform(gmx_int64_t ctr1, gmx_int64_t ctr2, + gmx_int64_t key1, gmx_int64_t key2, + double* rnd) +{ + const gmx_int64_t mask_53bits = 0x1FFFFFFFFFFFFF; + const double two_power_min53 = 1.0/9007199254740992.0; + + threefry2x64_ctr_t ctr = {{ctr1, ctr2}}; + threefry2x64_key_t key = {{key1, key2}}; + threefry2x64_ctr_t rand = threefry2x64(ctr, key); + + rnd[0] = (rand.v[0] & mask_53bits)*two_power_min53; + rnd[1] = (rand.v[1] & mask_53bits)*two_power_min53; +} + +void +gmx_rng_cycle_3gaussian_table(gmx_int64_t ctr1, gmx_int64_t ctr2, + gmx_int64_t key1, gmx_int64_t key2, + real* rnd) +{ + threefry2x64_ctr_t ctr = {{ctr1, ctr2}}; + threefry2x64_key_t key = {{key1, key2}}; + threefry2x64_ctr_t rand = threefry2x64(ctr, key); + + rnd[0] = gaussian_table[(rand.v[0] >> 48) & GAUSS_MASK]; + rnd[1] = gaussian_table[(rand.v[0] >> 32) & GAUSS_MASK]; + rnd[2] = gaussian_table[(rand.v[0] >> 16) & GAUSS_MASK]; +} + +void +gmx_rng_cycle_6gaussian_table(gmx_int64_t ctr1, gmx_int64_t ctr2, + gmx_int64_t key1, gmx_int64_t key2, + real* rnd) +{ + threefry2x64_ctr_t ctr = {{ctr1, ctr2}}; + threefry2x64_key_t key = {{key1, key2}}; + threefry2x64_ctr_t rand = threefry2x64(ctr, key); + + rnd[0] = gaussian_table[(rand.v[0] >> 48) & GAUSS_MASK]; + rnd[1] = gaussian_table[(rand.v[0] >> 32) & GAUSS_MASK]; + rnd[2] = gaussian_table[(rand.v[0] >> 16) & GAUSS_MASK]; + rnd[3] = gaussian_table[(rand.v[1] >> 48) & GAUSS_MASK]; + rnd[4] = gaussian_table[(rand.v[1] >> 32) & GAUSS_MASK]; + rnd[5] = gaussian_table[(rand.v[1] >> 16) & GAUSS_MASK]; } diff --git a/src/gromacs/random/random.h b/src/gromacs/random/random.h index 739aaf71c4..88dad71fcf 100644 --- a/src/gromacs/random/random.h +++ b/src/gromacs/random/random.h @@ -45,6 +45,14 @@ extern "C" { #endif +/*! Fixed random number seeds for different types of RNG */ +#define RND_SEED_UPDATE 1 /*!< For coordinate update (sd, bd, ..) */ +#define RND_SEED_REPLEX 2 /*!< For replica exchange */ +#define RND_SEED_VRESCALE 3 /*!< For V-rescale thermostat */ +#define RND_SEED_ANDERSEN 4 /*!< For Andersen thermostat */ +#define RND_SEED_TPI 5 /*!< For test particle insertion */ +#define RND_SEED_EXPANDED 6 /*!< For expanded emseble methods */ + /*! \brief Abstract datatype for a random number generator * * This is a handle to the full state of a random number generator. @@ -260,6 +268,54 @@ gmx_rng_gaussian_real(gmx_rng_t rng); real gmx_rng_gaussian_table(gmx_rng_t rng); + +/* The stateless cycle based random number generators below, + * which all use threefry2x64, take the following arguments: + * + * ctr1: In mdrun the step counter, in tools the frame(-step) + * counter, so we can ensure reproducible results, even + * we starting at different steps/frames. Might need to be + * multiplied by a constant if we need more random numbers. + * ctr2: A local counter, in mdrun often a global atom index. + * If any algorithm needs a variable number of random numbers, + * the second counter is usually a function of the local + * counter. + * key1: A user provided random seed. + * key2: A fixed seed which is particular for the algorithm, + * as defined at the top of this file, to ensure different + * random sequences when the same user seed is used for + * different algorithms. + */ + +/* Return two uniform random numbers with 2^53 equally + * probable values between 0 and 1 - 2^-53. + * It uses a stateless counter based random number generator + * (threefry2x64). + */ +void +gmx_rng_cycle_2uniform(gmx_int64_t ctr1, gmx_int64_t ctr2, + gmx_int64_t key1, gmx_int64_t key2, + double* rnd); + +/* Return three Gaussian random numbers with expectation value + * 0.0 and standard deviation 1.0. This routine uses a table + * lookup for maximum speed. It uses a stateless counter + * based random number generator (threefry2x64). See + * gmx_rng_gaussian_table for a warning about accuracy of the table. + * + * threadsafe: yes + */ +void +gmx_rng_cycle_3gaussian_table(gmx_int64_t ctr1, gmx_int64_t ctr2, + gmx_int64_t key1, gmx_int64_t key2, + real* rnd); + +/* As gmx_rng_3gaussian_table, but returns 6 Gaussian numbers. */ +void +gmx_rng_cycle_6gaussian_table(gmx_int64_t ctr1, gmx_int64_t ctr2, + gmx_int64_t key1, gmx_int64_t key2, + real* rnd); + #ifdef __cplusplus } #endif diff --git a/src/gromacs/tools/convert_tpr.c b/src/gromacs/tools/convert_tpr.c index 423639082e..c3297ed7cc 100644 --- a/src/gromacs/tools/convert_tpr.c +++ b/src/gromacs/tools/convert_tpr.c @@ -51,7 +51,6 @@ #include "gromacs/commandline/pargs.h" #include "vec.h" #include "mtop_util.h" -#include "gromacs/random/random.h" #include "checkpoint.h" #include "gromacs/fileio/tpxio.h" #include "gromacs/fileio/trnio.h" @@ -464,13 +463,6 @@ int gmx_convert_tpr(int argc, char *argv[]) "If you want that, supply a checkpoint file to mdrun\n\n"); } - if (EI_SD(ir->eI) || ir->eI == eiBD) - { - fprintf(stderr, "\nChanging ld-seed from %d ", ir->ld_seed); - ir->ld_seed = (int)gmx_rng_make_seed(); - fprintf(stderr, "to %d\n\n", ir->ld_seed); - } - frame_fn = ftp2fn(efTRN, NFILE, fnm); if (fn2ftp(frame_fn) == efCPT) diff --git a/src/programs/mdrun/md.c b/src/programs/mdrun/md.c index 04a4f7bb63..3ae70f5038 100644 --- a/src/programs/mdrun/md.c +++ b/src/programs/mdrun/md.c @@ -182,7 +182,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], gmx_update_t upd = NULL; t_graph *graph = NULL; globsig_t gs; - gmx_rng_t mcrng = NULL; gmx_groups_t *groups; gmx_ekindata_t *ekind, *ekind_save; gmx_shellfc_t shellfc; @@ -413,7 +412,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], if (ir->bExpanded) { - init_expanded_ensemble(bStateFromCP, ir, &mcrng, &state->dfhist); + init_expanded_ensemble(bStateFromCP, ir, &state->dfhist); } if (MASTER(cr)) @@ -438,17 +437,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], update_energyhistory(&state_global->enerhist, mdebin); } - if ((state->flags & (1<flags & (1<fep_state, &state->dfhist, step, mcrng, state->v, mdatoms); + lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, &state->dfhist, step, state->v, mdatoms); /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */ copy_df_history(&state_global->dfhist, &state->dfhist); } @@ -1297,9 +1285,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], * the update. */ do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, - ir, state, state_global, top_global, fr, upd, + ir, state, state_global, top_global, fr, outf, mdebin, ekind, f, f_global, - wcycle, mcrng, &nchkpt, + wcycle, &nchkpt, bCPT, bRerunMD, bLastStep, (Flags & MD_CONFOUT), bSumEkinhOld); @@ -1408,12 +1396,12 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], { if (!bInitStep) { - update_tcouple(step, ir, state, ekind, upd, &MassQ, mdatoms); + update_tcouple(step, ir, state, ekind, &MassQ, mdatoms); } if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */ { gmx_bool bIfRandomize; - bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr); + bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state, upd, constr); /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */ if (constr && bIfRandomize) { @@ -1486,7 +1474,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], } else { - update_tcouple(step, ir, state, ekind, upd, &MassQ, mdatoms); + update_tcouple(step, ir, state, ekind, &MassQ, mdatoms); update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep); } diff --git a/src/programs/mdrun/repl_ex.c b/src/programs/mdrun/repl_ex.c index 409f525ab0..21a5426c09 100644 --- a/src/programs/mdrun/repl_ex.c +++ b/src/programs/mdrun/repl_ex.c @@ -49,6 +49,7 @@ #include "vec.h" #include "names.h" #include "domdec.h" +#include "gromacs/random/random.h" #define PROBABILITYCUTOFF 100 /* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */ @@ -974,18 +975,22 @@ test_for_replica_exchange(FILE *fplog, if (bMultiEx) { /* multiple random switch exchange */ - for (i = 0; i < re->nex; i++) + int nself = 0; + for (i = 0; i < re->nex + nself; i++) { + double rnd[2]; + + gmx_rng_cycle_2uniform(step, i*2, re->seed, RND_SEED_REPLEX, rnd); /* randomly select a pair */ /* in theory, could reduce this by identifying only which switches had a nonneglibible probability of occurring (log p > -100) and only operate on those switches */ /* find out which state it is from, and what label that state currently has. Likely more work that useful. */ - i0 = (int)(re->nrepl*gmx_rng_uniform_real(re->rng)); - i1 = (int)(re->nrepl*gmx_rng_uniform_real(re->rng)); + i0 = (int)(re->nrepl*rnd[0]); + i1 = (int)(re->nrepl*rnd[1]); if (i0 == i1) { - i--; + nself++; continue; /* self-exchange, back up and do it again */ } @@ -1021,7 +1026,8 @@ test_for_replica_exchange(FILE *fplog, prob[0] = exp(-delta); } /* roll a number to determine if accepted */ - bEx[0] = (gmx_rng_uniform_real(re->rng) < prob[0]); + gmx_rng_cycle_2uniform(step, i*2+1, re->seed, RND_SEED_REPLEX, rnd); + bEx[0] = rnd[0] < prob[0]; } re->prob_sum[0] += prob[0]; @@ -1039,6 +1045,7 @@ test_for_replica_exchange(FILE *fplog, else { /* standard nearest neighbor replica exchange */ + m = (step / re->nst) % 2; for (i = 1; i < re->nrepl; i++) { @@ -1057,6 +1064,8 @@ test_for_replica_exchange(FILE *fplog, } else { + double rnd[2]; + if (delta > PROBABILITYCUTOFF) { prob[i] = 0; @@ -1066,7 +1075,8 @@ test_for_replica_exchange(FILE *fplog, prob[i] = exp(-delta); } /* roll a number to determine if accepted */ - bEx[i] = (gmx_rng_uniform_real(re->rng) < prob[i]); + gmx_rng_cycle_2uniform(step, i, re->seed, RND_SEED_REPLEX, rnd); + bEx[i] = rnd[0] < prob[i]; } re->prob_sum[i] += prob[i]; diff --git a/src/programs/mdrun/runner.c b/src/programs/mdrun/runner.c index 9d6211b036..767d0bb875 100644 --- a/src/programs/mdrun/runner.c +++ b/src/programs/mdrun/runner.c @@ -1097,7 +1097,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt, int i, m, nChargePerturbed = -1, nTypePerturbed = 0, status, nalloc; char *gro; gmx_wallcycle_t wcycle; - gmx_bool bReadRNG, bReadEkin; + gmx_bool bReadEkin; int list; gmx_walltime_accounting_t walltime_accounting = NULL; int rc; @@ -1299,7 +1299,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt, } /* now make sure the state is initialized and propagated */ - set_state_entries(state, inputrec, cr->nnodes); + set_state_entries(state, inputrec); /* A parallel command line option consistency check that we can only do after any threads have started. */ @@ -1397,14 +1397,10 @@ int mdrunner(gmx_hw_opt_t *hw_opt, { load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog, cr, ddxyz, - inputrec, state, &bReadRNG, &bReadEkin, + inputrec, state, &bReadEkin, (Flags & MD_APPENDFILES), (Flags & MD_APPENDFILESSET)); - if (bReadRNG) - { - Flags |= MD_READ_RNG; - } if (bReadEkin) { Flags |= MD_READ_EKIN; diff --git a/tests/CppCheck.cmake b/tests/CppCheck.cmake index dbb33e2e82..918b6304d4 100644 --- a/tests/CppCheck.cmake +++ b/tests/CppCheck.cmake @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2013, by the GROMACS development team, led by +# Copyright (c) 2013,2014, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -84,6 +84,7 @@ if (CPPCHECK_EXECUTABLE AND UNIX) --suppress=invalidscanf --suppress=sizeofCalculation --suppress=missingInclude:src/programs/mdrun/gmx_gpu_utils/gmx_gpu_utils.cu + --suppress=*:src/external/Random123-1.08/include/Random123/features/compilerfeatures.h --inline-suppr) set(_cxx_flags -D__cplusplus -- 2.22.0