--- /dev/null
+/** @page LICENSE
+Copyright 2010-2012, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
--- /dev/null
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _r123array_dot_h__
+#define _r123array_dot_h__
+#include "features/compilerfeatures.h"
+#include "features/sse.h"
+
+#ifndef __cplusplus
+#define CXXMETHODS(_N, W, T)
+#define CXXOVERLOADS(_N, W, T)
+#else
+
+#include <stddef.h>
+#include <algorithm>
+#include <stdexcept>
+#include <iterator>
+#include <limits>
+#include <iostream>
+
+/** @defgroup arrayNxW The r123arrayNxW classes
+
+ Each of the r123arrayNxW is a fixed size array of N W-bit unsigned integers.
+ It is functionally equivalent to the C++0x std::array<N, uintW_t>,
+ but does not require C++0x features or libraries.
+
+ In addition to meeting most of the requirements of a Container,
+ it also has a member function, incr(), which increments the zero-th
+ element and carrys overflows into higher indexed elements. Thus,
+ by using incr(), sequences of up to 2^(N*W) distinct values
+ can be produced.
+
+ If SSE is supported by the compiler, then the class
+ r123array1xm128i is also defined, in which the data member is an
+ array of one r123128i object.
+
+ @cond HIDDEN_FROM_DOXYGEN
+*/
+
+template <typename value_type>
+inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){
+ value_type v=0;
+ for(size_t i=0; i<(3+sizeof(value_type))/4; ++i)
+ v |= ((value_type)(*p32++)) << (32*i);
+ return v;
+}
+
+// Work-alike methods and typedefs modeled on std::array:
+#define CXXMETHODS(_N, W, T) \
+ typedef T value_type; \
+ typedef T* iterator; \
+ typedef const T* const_iterator; \
+ typedef value_type& reference; \
+ typedef const value_type& const_reference; \
+ typedef size_t size_type; \
+ typedef ptrdiff_t difference_type; \
+ typedef T* pointer; \
+ typedef const T* const_pointer; \
+ typedef std::reverse_iterator<iterator> reverse_iterator; \
+ typedef std::reverse_iterator<const_iterator> const_reverse_iterator; \
+ /* Boost.array has static_size. C++11 specializes tuple_size */ \
+ enum {static_size = _N}; \
+ R123_CUDA_DEVICE reference operator[](size_type i){return v[i];} \
+ R123_CUDA_DEVICE const_reference operator[](size_type i) const {return v[i];} \
+ R123_CUDA_DEVICE reference at(size_type i){ if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
+ R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
+ R123_CUDA_DEVICE size_type size() const { return _N; } \
+ R123_CUDA_DEVICE size_type max_size() const { return _N; } \
+ R123_CUDA_DEVICE bool empty() const { return _N==0; }; \
+ R123_CUDA_DEVICE iterator begin() { return &v[0]; } \
+ R123_CUDA_DEVICE iterator end() { return &v[_N]; } \
+ R123_CUDA_DEVICE const_iterator begin() const { return &v[0]; } \
+ R123_CUDA_DEVICE const_iterator end() const { return &v[_N]; } \
+ R123_CUDA_DEVICE const_iterator cbegin() const { return &v[0]; } \
+ R123_CUDA_DEVICE const_iterator cend() const { return &v[_N]; } \
+ R123_CUDA_DEVICE reverse_iterator rbegin(){ return reverse_iterator(end()); } \
+ R123_CUDA_DEVICE const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } \
+ R123_CUDA_DEVICE reverse_iterator rend(){ return reverse_iterator(begin()); } \
+ R123_CUDA_DEVICE const_reverse_iterator rend() const{ return const_reverse_iterator(begin()); } \
+ R123_CUDA_DEVICE const_reverse_iterator crbegin() const{ return const_reverse_iterator(cend()); } \
+ R123_CUDA_DEVICE const_reverse_iterator crend() const{ return const_reverse_iterator(cbegin()); } \
+ R123_CUDA_DEVICE pointer data(){ return &v[0]; } \
+ R123_CUDA_DEVICE const_pointer data() const{ return &v[0]; } \
+ R123_CUDA_DEVICE reference front(){ return v[0]; } \
+ R123_CUDA_DEVICE const_reference front() const{ return v[0]; } \
+ R123_CUDA_DEVICE reference back(){ return v[_N-1]; } \
+ R123_CUDA_DEVICE const_reference back() const{ return v[_N-1]; } \
+ R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const{ \
+ /* CUDA3 does not have std::equal */ \
+ for (size_t i = 0; i < _N; ++i) \
+ if (v[i] != rhs.v[i]) return false; \
+ return true; \
+ } \
+ R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const{ return !(*this == rhs); } \
+ /* CUDA3 does not have std::fill_n */ \
+ R123_CUDA_DEVICE void fill(const value_type& val){ for (size_t i = 0; i < _N; ++i) v[i] = val; } \
+ R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs){ \
+ /* CUDA3 does not have std::swap_ranges */ \
+ for (size_t i = 0; i < _N; ++i) { \
+ T tmp = v[i]; \
+ v[i] = rhs.v[i]; \
+ rhs.v[i] = tmp; \
+ } \
+ } \
+ R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n=1){ \
+ /* This test is tricky because we're trying to avoid spurious \
+ complaints about illegal shifts, yet still be compile-time \
+ evaulated. */ \
+ if(sizeof(T)<sizeof(n) && n>>((sizeof(T)<sizeof(n))?8*sizeof(T):0) ) \
+ return incr_carefully(n); \
+ if(n==1){ \
+ ++v[0]; \
+ if(_N==1 || R123_BUILTIN_EXPECT(!!v[0], 1)) return *this; \
+ }else{ \
+ v[0] += n; \
+ if(_N==1 || R123_BUILTIN_EXPECT(n<=v[0], 1)) return *this; \
+ } \
+ /* We expect that the N==?? tests will be \
+ constant-folded/optimized away by the compiler, so only the \
+ overflow tests (!!v[i]) remain to be done at runtime. For \
+ small values of N, it would be better to do this as an \
+ uncondtional sequence of adc. An experiment/optimization \
+ for another day... \
+ N.B. The weird subscripting: v[_N>3?3:0] is to silence \
+ a spurious error from icpc \
+ */ \
+ ++v[_N>1?1:0]; \
+ if(_N==2 || R123_BUILTIN_EXPECT(!!v[_N>1?1:0], 1)) return *this; \
+ ++v[_N>2?2:0]; \
+ if(_N==3 || R123_BUILTIN_EXPECT(!!v[_N>2?2:0], 1)) return *this; \
+ ++v[_N>3?3:0]; \
+ for(size_t i=4; i<_N; ++i){ \
+ if( R123_BUILTIN_EXPECT(!!v[i-1], 1) ) return *this; \
+ ++v[i]; \
+ } \
+ return *this; \
+ } \
+ /* seed(SeedSeq) would be a constructor if having a constructor */ \
+ /* didn't cause headaches with defaults */ \
+ template <typename SeedSeq> \
+ R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq &ss){ \
+ r123array##_N##x##W ret; \
+ const size_t Ngen = _N*((3+sizeof(value_type))/4); \
+ uint32_t u32[Ngen]; \
+ uint32_t *p32 = &u32[0]; \
+ ss.generate(&u32[0], &u32[Ngen]); \
+ for(size_t i=0; i<_N; ++i){ \
+ ret.v[i] = assemble_from_u32<value_type>(p32); \
+ p32 += (3+sizeof(value_type))/4; \
+ } \
+ return ret; \
+ } \
+protected: \
+ R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n){ \
+ /* n may be greater than the maximum value of a single value_type */ \
+ value_type vtn; \
+ vtn = n; \
+ v[0] += n; \
+ const unsigned rshift = 8* ((sizeof(n)>sizeof(value_type))? sizeof(value_type) : 0); \
+ for(size_t i=1; i<_N; ++i){ \
+ if(rshift){ \
+ n >>= rshift; \
+ }else{ \
+ n=0; \
+ } \
+ if( v[i-1] < vtn ) \
+ ++n; \
+ if( n==0 ) break; \
+ vtn = n; \
+ v[i] += n; \
+ } \
+ return *this; \
+ } \
+
+
+// There are several tricky considerations for the insertion and extraction
+// operators:
+// - we would like to be able to print r123array16x8 as a sequence of 16 integers,
+// not as 16 bytes.
+// - we would like to be able to print r123array1xm128i.
+// - we do not want an int conversion operator in r123m128i because it causes
+// lots of ambiguity problems with automatic promotions.
+// Solution: r123arrayinsertable and r123arrayextractable
+
+template<typename T>
+struct r123arrayinsertable{
+ const T& v;
+ r123arrayinsertable(const T& t_) : v(t_) {}
+ friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable<T>& t){
+ return os << t.v;
+ }
+};
+
+template<>
+struct r123arrayinsertable<uint8_t>{
+ const uint8_t& v;
+ r123arrayinsertable(const uint8_t& t_) : v(t_) {}
+ friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable<uint8_t>& t){
+ return os << (int)t.v;
+ }
+};
+
+template<typename T>
+struct r123arrayextractable{
+ T& v;
+ r123arrayextractable(T& t_) : v(t_) {}
+ friend std::istream& operator>>(std::istream& is, r123arrayextractable<T>& t){
+ return is >> t.v;
+ }
+};
+
+template<>
+struct r123arrayextractable<uint8_t>{
+ uint8_t& v;
+ r123arrayextractable(uint8_t& t_) : v(t_) {}
+ friend std::istream& operator>>(std::istream& is, r123arrayextractable<uint8_t>& t){
+ int i;
+ is >> i;
+ t.v = i;
+ return is;
+ }
+};
+
+#define CXXOVERLOADS(_N, W, T) \
+ \
+inline std::ostream& operator<<(std::ostream& os, const r123array##_N##x##W& a){ \
+ os << r123arrayinsertable<T>(a.v[0]); \
+ for(size_t i=1; i<_N; ++i) \
+ os << " " << r123arrayinsertable<T>(a.v[i]); \
+ return os; \
+} \
+ \
+inline std::istream& operator>>(std::istream& is, r123array##_N##x##W& a){ \
+ for(size_t i=0; i<_N; ++i){ \
+ r123arrayextractable<T> x(a.v[i]); \
+ is >> x; \
+ } \
+ return is; \
+} \
+ \
+namespace r123{ \
+ typedef r123array##_N##x##W Array##_N##x##W; \
+}
+
+#endif /* __cplusplus */
+
+/* _r123array_tpl expands to a declaration of struct r123arrayNxW.
+
+ In C, it's nothing more than a struct containing an array of N
+ objects of type T.
+
+ In C++ it's the same, but endowed with an assortment of member
+ functions, typedefs and friends. In C++, r123arrayNxW looks a lot
+ like std::array<T,N>, has most of the capabilities of a container,
+ and satisfies the requirements outlined in compat/Engine.hpp for
+ counter and key types. ArrayNxW, in the r123 namespace is
+ a typedef equivalent to r123arrayNxW.
+*/
+
+#define _r123array_tpl(_N, W, T) \
+ /** @ingroup arrayNxW */ \
+ /** @see arrayNxW */ \
+struct r123array##_N##x##W{ \
+ T v[_N]; \
+ CXXMETHODS(_N, W, T) \
+}; \
+ \
+CXXOVERLOADS(_N, W, T)
+
+/** @endcond */
+
+_r123array_tpl(1, 32, uint32_t) /* r123array1x32 */
+_r123array_tpl(2, 32, uint32_t) /* r123array2x32 */
+_r123array_tpl(4, 32, uint32_t) /* r123array4x32 */
+_r123array_tpl(8, 32, uint32_t) /* r123array8x32 */
+
+_r123array_tpl(1, 64, uint64_t) /* r123array1x64 */
+_r123array_tpl(2, 64, uint64_t) /* r123array2x64 */
+_r123array_tpl(4, 64, uint64_t) /* r123array4x64 */
+
+_r123array_tpl(16, 8, uint8_t) /* r123array16x8 for ARSsw, AESsw */
+
+#if R123_USE_SSE
+_r123array_tpl(1, m128i, r123m128i) /* r123array1x128i for ARSni, AESni */
+#endif
+
+/* In C++, it's natural to use sizeof(a::value_type), but in C it's
+ pretty convoluted to figure out the width of the value_type of an
+ r123arrayNxW:
+*/
+#define R123_W(a) (8*sizeof(((a *)0)->v[0]))
+
+/** @namespace r123
+ Most of the Random123 C++ API is contained in the r123 namespace.
+*/
+
+#endif
+
--- /dev/null
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __clangfeatures_dot_hpp
+#define __clangfeatures_dot_hpp
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H ((defined(__x86_64__)||defined(__i386__)))
+#endif
+
+#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
+#define R123_USE_CXX11_UNRESTRICTED_UNIONS __has_feature(cxx_unrestricted_unions)
+#endif
+
+#ifndef R123_USE_CXX11_STATIC_ASSERT
+#define R123_USE_CXX11_STATIC_ASSERT __has_feature(cxx_static_assert)
+#endif
+
+#ifndef R123_USE_CXX11_CONSTEXPR
+#define R123_USE_CXX11_CONSTEXPR __has_feature(cxx_constexpr)
+#endif
+
+#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
+#define R123_USE_CXX11_EXPLICIT_CONVERSIONS __has_feature(cxx_explicit_conversions)
+#endif
+
+// With clang-3.0, the apparently simpler:
+// #define R123_USE_CXX11_RANDOM __has_include(<random>)
+// dumps core.
+#ifndef R123_USE_CXX11_RANDOM
+#if __cplusplus>=201103L && __has_include(<random>)
+#define R123_USE_CXX11_RANDOM 1
+#else
+#define R123_USE_CXX11_RANDOM 0
+#endif
+#endif
+
+#ifndef R123_USE_CXX11_TYPE_TRAITS
+#if __cplusplus>=201103L && __has_include(<type_traits>)
+#define R123_USE_CXX11_TYPE_TRAITS 1
+#else
+#define R123_USE_CXX11_TYPE_TRAITS 0
+#endif
+#endif
+
+#include "gccfeatures.h"
+
+#endif
--- /dev/null
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/**
+
+@page porting Preprocessor symbols for porting Random123 to different platforms.
+
+The Random123 library is portable across C, C++, CUDA, OpenCL environments,
+and multiple operating systems (Linux, Windows 7, Mac OS X, FreeBSD, Solaris).
+This level of portability requires the abstraction of some features
+and idioms that are either not standardized (e.g., asm statments), or for which
+different vendors have their own standards (e.g., SSE intrinsics) or for
+which vendors simply refuse to conform to well-established standards (e.g., <inttypes.h>).
+
+Random123/features/compilerfeatures.h
+conditionally includes a compiler-or-OS-specific Random123/featires/XXXfeatures.h file which
+defines appropriate values for the preprocessor symbols which can be used with
+a specific compiler or OS. Those symbols will then
+be used by other header files and source files in the Random123
+library (and may be used by applications) to control what actually
+gets presented to the compiler.
+
+Most of the symbols are boolean valued. In general, they will
+\b always be defined with value either 1 or 0, so do
+\b NOT use \#ifdef. Use \#if R123_USE_SOMETHING instead.
+
+Library users can override any value by defining the pp-symbol with a compiler option,
+e.g.,
+
+ cc -DR123_USE_MULHILO64_C99
+
+will use a strictly c99 version of the full-width 64x64->128-bit multiplication
+function, even if it would be disabled by default.
+
+All boolean-valued pre-processor symbols in Random123/features/compilerfeatures.h start with the prefix R123_USE_
+@verbatim
+ AES_NI
+ AES_OPENSSL
+ SSE4_2
+ SSE4_1
+ SSE
+
+ STD_RANDOM
+
+ GNU_UINT128
+ ASM_GNU
+ ASM_MSASM
+
+ CPUID_MSVC
+
+ CXX11_RANDOM
+ CXX11_TYPE_TRAITS
+ CXX11_STATIC_ASSERT
+ CXX11_CONSTEXPR
+ CXX11_UNRESTRICTED_UNIONS
+ CXX11_EXPLICIT_CONVERSIONS
+ CXX11_LONG_LONG
+ CXX11
+
+ X86INTRIN_H
+ IA32INTRIN_H
+ XMMINTRIN_H
+ EMMINTRIN_H
+ SMMINTRIN_H
+ WMMINTRIN_H
+ INTRIN_H
+
+ MULHILO32_ASM
+ MULHILO64_ASM
+ MULHILO64_MSVC_INTRIN
+ MULHILO64_CUDA_INTRIN
+ MULHILO64_OPENCL_INTRIN
+ MULHILO64_C99
+
+ U01_DOUBLE
+
+@endverbatim
+Most have obvious meanings. Some non-obvious ones:
+
+AES_NI and AES_OPENSSL are not mutually exclusive. You can have one,
+both or neither.
+
+GNU_UINT128 says that it's safe to use __uint128_t, but it
+does not require its use. In particular, it should be
+used in mulhilo<uint64_t> only if MULHILO64_ASM is unset.
+
+If the XXXINTRIN_H macros are true, then one should
+@code
+#include <xxxintrin.h>
+@endcode
+to gain accesss to compiler intrinsics.
+
+The CXX11_SOME_FEATURE macros allow the code to use specific
+features of the C++11 language and library. The catchall
+In the absence of a specific CXX11_SOME_FEATURE, the feature
+is controlled by the catch-all R123_USE_CXX11 macro.
+
+U01_DOUBLE defaults on, and can be turned off (set to 0)
+if one does not want the utility functions that convert to double
+(i.e. u01_*_53()), e.g. on OpenCL without the cl_khr_fp64 extension.
+
+There are a number of invariants that are always true. Application code may
+choose to rely on these:
+
+<ul>
+<li>ASM_GNU and ASM_MASM are mutually exclusive
+<li>The "higher" SSE values imply the lower ones.
+</ul>
+
+There are also non-boolean valued symbols:
+
+<ul>
+<li>R123_STATIC_INLINE -
+ According to both C99 and GNU99, the 'static inline' declaration allows
+ the compiler to not emit code if the function is not used.
+ Note that the semantics of 'inline', 'static' and 'extern' in
+ gcc have changed over time and are subject to modification by
+ command line options, e.g., -std=gnu89, -fgnu-inline.
+ Nevertheless, it appears that the meaning of 'static inline'
+ has not changed over time and (with a little luck) the use of 'static inline'
+ here will be portable between versions of gcc and to other C99
+ compilers.
+ See: http://gcc.gnu.org/onlinedocs/gcc/Inline.html
+ http://www.greenend.org.uk/rjk/2003/03/inline.html
+
+<li>R123_FORCE_INLINE(decl) -
+ which expands to 'decl', adorned with the compiler-specific
+ embellishments to strongly encourage that the declared function be
+ inlined. If there is no such compiler-specific magic, it should
+ expand to decl, unadorned.
+
+<li>R123_CUDA_DEVICE - which expands to __device__ (or something else with
+ sufficiently similar semantics) when CUDA is in use, and expands
+ to nothing in other cases.
+
+<li>R123_ASSERT(x) - which expands to assert(x), or maybe to nothing at
+ all if we're in an environment so feature-poor that you can't even
+ call assert (I'm looking at you, CUDA and OpenCL), or even include
+ assert.h safely (OpenCL).
+
+<li>R123_STATIC_ASSERT(expr,msg) - which expands to
+ static_assert(expr,msg), or to an expression that
+ will raise a compile-time exception if expr is not true.
+
+<li>R123_ULONG_LONG - which expands to a declaration of the longest available
+ unsigned integer.
+
+<li>R123_64BIT(x) - expands to something equivalent to
+ UINT64_C(x) from <stdint.h>, even in environments where <stdint.h>
+ is not available, e.g., MSVC and OpenCL.
+
+<li>R123_BUILTIN_EXPECT(expr,likely_value) - expands to something with
+ the semantics of gcc's __builtin_expect(expr,likely_value). If
+ the environment has nothing like __builtin_expect, it should expand
+ to just expr.
+</ul>
+
+
+\cond HIDDEN_FROM_DOXYGEN
+*/
+
+/*
+N.B. When something is added to the list of features, it should be
+added to each of the *features.h files, AND to examples/ut_features.cpp.
+*/
+
+/* N.B. most other compilers (icc, nvcc, open64, llvm) will also define __GNUC__, so order matters. */
+#if defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ > 0
+#include "openclfeatures.h"
+#elif defined(__CUDACC__)
+#include "nvccfeatures.h"
+#elif defined(__ICC)
+#include "iccfeatures.h"
+#elif defined(__xlC__)
+#include "xlcfeatures.h"
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+#include "sunprofeatures.h"
+#elif defined(__OPEN64__)
+#include "open64features.h"
+#elif defined(__clang__)
+#include "clangfeatures.h"
+#elif defined(__GNUC__)
+#include "gccfeatures.h"
+#elif defined(__PGI)
+#include "pgccfeatures.h"
+#elif defined(_MSC_FULL_VER)
+#include "msvcfeatures.h"
+#else
+#error "Can't identify compiler. You'll need to add a new xxfeatures.hpp"
+{ /* maybe an unbalanced brace will terminate the compilation */
+#endif
+
+#ifndef R123_USE_CXX11
+#define R123_USE_CXX11 (__cplusplus >= 201103L)
+#endif
+
+#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
+#define R123_USE_CXX11_UNRESTRICTED_UNIONS R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_STATIC_ASSERT
+#define R123_USE_CXX11_STATIC_ASSERT R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_CONSTEXPR
+#define R123_USE_CXX11_CONSTEXPR R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
+#define R123_USE_CXX11_EXPLICIT_CONVERSIONS R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_RANDOM
+#define R123_USE_CXX11_RANDOM R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_TYPE_TRAITS
+#define R123_USE_CXX11_TYPE_TRAITS R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_LONG_LONG
+#define R123_USE_CXX11_LONG_LONG R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_MULHILO64_C99
+#define R123_USE_MULHILO64_C99 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MULHI_INTRIN
+#define R123_USE_MULHILO64_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO32_MULHI_INTRIN
+#define R123_USE_MULHILO32_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_STATIC_ASSERT
+#if R123_USE_CXX11_STATIC_ASSERT
+#define R123_STATIC_ASSERT(expr, msg) static_assert(expr, msg)
+#else
+ /* if msg always_looked_like_this, we could paste it into the name. Worth it? */
+#define R123_STATIC_ASSERT(expr, msg) typedef char static_assertion[(!!(expr))*2-1]
+#endif
+#endif
+
+#ifndef R123_CONSTEXPR
+#if R123_USE_CXX11_CONSTEXPR
+#define R123_CONSTEXPR constexpr
+#else
+#define R123_CONSTEXPR
+#endif
+#endif
+
+#ifndef R123_USE_PHILOX_64BIT
+#define R123_USE_PHILOX_64BIT (R123_USE_MULHILO64_ASM || R123_USE_MULHILO64_MSVC_INTRIN || R123_USE_MULHILO64_CUDA_INTRIN || R123_USE_GNU_UINT128 || R123_USE_MULHILO64_C99 || R123_USE_MULHILO64_OPENCL_INTRIN || R123_USE_MULHILO64_MULHI_INTRIN)
+#endif
+
+#ifndef R123_ULONG_LONG
+#if defined(__cplusplus) && !R123_USE_CXX11_LONG_LONG
+/* C++98 doesn't have long long. It doesn't have uint64_t either, but
+ we will have typedef'ed uint64_t to something in the xxxfeatures.h.
+ With luck, it won't elicit complaints from -pedantic. Cross your
+ fingers... */
+#define R123_ULONG_LONG uint64_t
+#else
+#define R123_ULONG_LONG unsigned long long
+#endif
+#endif
+
+/* UINT64_C should have been #defined by XXXfeatures.h, either by
+ #include <stdint.h> or through compiler-dependent hacks */
+#ifndef R123_64BIT
+#define R123_64BIT(x) UINT64_C(x)
+#endif
+
+#ifndef R123_THROW
+#define R123_THROW(x) throw (x)
+#endif
+
+/*
+ * Windows.h (and perhaps other "well-meaning" code define min and
+ * max, so there's a high chance that our definition of min, max
+ * methods or use of std::numeric_limits min and max will cause
+ * complaints in any program that happened to include Windows.h or
+ * suchlike first. We use the null macro below in our own header
+ * files definition or use of min, max to defensively preclude
+ * this problem. It may not be enough; one might need to #define
+ * NOMINMAX before including Windows.h or compile with -DNOMINMAX.
+ */
+#define R123_NO_MACRO_SUBST
+
+/** \endcond */
--- /dev/null
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __gccfeatures_dot_hpp
+#define __gccfeatures_dot_hpp
+
+#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+
+#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__)
+# error "This code has only been tested on x86 and powerpc platforms."
+#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
+{ /* maybe an unbalanced brace will terminate the compilation */
+ /* Feel free to try the Random123 library on other architectures by changing
+ the conditions that reach this error, but you should consider it a
+ porting exercise and expect to encounter bugs and deficiencies.
+ Please let the authors know of any successes (or failures). */
+#endif
+
+#ifdef __powerpc__
+#include <ppu_intrinsics.h>
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static __inline__
+#endif
+
+#ifndef R123_FORCE_INLINE
+#if R123_GNUC_VERSION >= 40000
+#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
+#else
+#define R123_FORCE_INLINE(decl) decl
+#endif
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
+#endif
+
+/* According to the C++0x standard, we should be able to test the numeric
+ value of __cplusplus == 199701L for C++98, __cplusplus == 201103L for C++0x
+ But gcc has had an open bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=1773
+ since early 2001, which was finally fixed in 4.7 (early 2012). For
+ earlier versions, the only way to detect whether --std=c++0x was requested
+ on the command line is to look at the __GCC_EXPERIMENTAL_CXX0X__ pp-symbol.
+*/
+#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && defined(__GCC_EXPERIMENTAL_CXX0X__) ))
+
+#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
+#define R123_USE_CXX11_UNRESTRICTED_UNIONS ((R123_GNUC_VERSION >= 40600) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_STATIC_ASSERT
+#define R123_USE_CXX11_STATIC_ASSERT ((R123_GNUC_VERSION >= 40300) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_CONSTEXPR
+#define R123_USE_CXX11_CONSTEXPR ((R123_GNUC_VERSION >= 40600) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
+#define R123_USE_CXX11_EXPLICIT_CONVERSIONS ((R123_GNUC_VERSION >= 40500) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_RANDOM
+#define R123_USE_CXX11_RANDOM ((R123_GNUC_VERSION>=40500) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_TYPE_TRAITS
+#define R123_USE_CXX11_TYPE_TRAITS ((R123_GNUC_VERSION>=40400) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_AES_NI
+#ifdef __AES__
+#define R123_USE_AES_NI 1
+#else
+#define R123_USE_AES_NI 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_2
+#ifdef __SSE4_2__
+#define R123_USE_SSE4_2 1
+#else
+#define R123_USE_SSE4_2 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_1
+#ifdef __SSE4_1__
+#define R123_USE_SSE4_1 1
+#else
+#define R123_USE_SSE4_1 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE
+/* There's no point in trying to compile SSE code in Random123
+ unless SSE2 is available. */
+#ifdef __SSE2__
+#define R123_USE_SSE 1
+#else
+#define R123_USE_SSE 0
+#endif
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+/* There isn't really a good way to tell at compile time whether
+ openssl is available. Without a pre-compilation configure-like
+ tool, it's less error-prone to guess that it isn't available. Add
+ -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
+ play with openssl */
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#ifdef __x86_64__
+#define R123_USE_GNU_UINT128 1
+#else
+#define R123_USE_GNU_UINT128 0
+#endif
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU (defined(__x86_64__)||defined(__i386__))
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H ((defined(__x86_64__)||defined(__i386__)) && R123_GNUC_VERSION >= 40402)
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+/* gcc -m64 on Solaris 10 defines __SSE2__ but doesn't have
+ emmintrin.h in the include search path. This is
+ so broken that I refuse to try to work around it. If this
+ affects you, figure out where your emmintrin.h lives and
+ add an appropriate -I to your CPPFLAGS. Or add -DR123_USE_SSE=0. */
+#define R123_USE_EMMINTRIN_H (R123_USE_SSE && (R123_GNUC_VERSION < 40402))
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H ((R123_USE_SSE4_1 || R123_USE_SSE4_2) && (R123_GNUC_VERSION < 40402))
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#define R123_USE_INTRIN_H 0
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MULHI_INTRIN
+#define R123_USE_MULHILO64_MULHI_INTRIN (defined(__powerpc64__))
+#endif
+
+#ifndef R123_MULHILO64_MULHI_INTRIN
+#define R123_MULHILO64_MULHI_INTRIN __mulhdu
+#endif
+
+#ifndef R123_USE_MULHILO32_MULHI_INTRIN
+#define R123_USE_MULHILO32_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_MULHILO32_MULHI_INTRIN
+#define R123_MULHILO32_MULHI_INTRIN __mulhwu
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+/* If you add something, it must go in all the other XXfeatures.hpp
+ and in ../ut_features.cpp */
+#endif
--- /dev/null
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __icpcfeatures_dot_hpp
+#define __icpcfeatures_dot_hpp
+
+// icc relies on gcc libraries and other toolchain components.
+#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+
+#if !defined(__x86_64__) && !defined(__i386__)
+# error "This code has only been tested on x86 platforms."
+{ // maybe an unbalanced brace will terminate the compilation
+// You are invited to try Easy123 on other architectures, by changing
+// the conditions that reach this error, but you should consider it a
+// porting exercise and expect to encounter bugs and deficiencies.
+// Please let the authors know of any successes (or failures).
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
+#endif
+
+// The basic idiom is:
+// #ifndef R123_SOMETHING
+// #if some condition
+// #define R123_SOMETHING 1
+// #else
+// #define R123_SOMETHING 0
+// #endif
+// #endif
+// This idiom allows an external user to override any decision
+// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
+
+// An alternative idiom is:
+// #ifndef R123_SOMETHING
+// #define R123_SOMETHING (some boolean expression)
+// #endif
+// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
+// pp-symbols.
+
+#ifndef R123_USE_SSE4_2
+#ifdef __SSE4_2__
+#define R123_USE_SSE4_2 1
+#else
+#define R123_USE_SSE4_2 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_1
+#ifdef __SSE4_1__
+#define R123_USE_SSE4_1 1
+#else
+#define R123_USE_SSE4_1 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE
+#ifdef __SSE2__
+#define R123_USE_SSE 1
+#else
+#define R123_USE_SSE 0
+#endif
+#endif
+
+#ifndef R123_USE_AES_NI
+// Unlike gcc, icc (version 12) does not pre-define an __AES__
+// pp-symbol when -maes or -xHost is on the command line. This feels
+// like a defect in icc (it defines __SSE4_2__ in analogous
+// circumstances), but until Intel fixes it, we're better off erring
+// on the side of caution and not generating instructions that are
+// going to raise SIGILL when executed. To get the AES-NI
+// instructions with icc, the caller must puts something like
+// -DR123_USE_AES_NI=1 or -D__AES__ on the command line. FWIW, the
+// AES-NI Whitepaper by Gueron says that icc has supported AES-NI from
+// 11.1 onwards.
+//
+#define R123_USE_AES_NI ((__ICC>=1101) && defined(__AES__))
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+/* There isn't really a good way to tell at compile time whether
+ openssl is available. Without a pre-compilation configure-like
+ tool, it's less error-prone to guess that it isn't available. Add
+ -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
+ play with openssl */
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 1
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 1
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#define R123_USE_INTRIN_H 0
+#endif
+
+#ifndef R123_USE_MULHILO16_ASM
+#define R123_USE_MULHILO16_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 1
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+// If you add something, it must go in all the other XXfeatures.hpp
+// and in ../ut_features.cpp
+#endif
--- /dev/null
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __msvcfeatures_dot_hpp
+#define __msvcfeatures_dot_hpp
+
+//#if _MSVC_FULL_VER <= 15
+//#error "We've only tested MSVC_FULL_VER==15."
+//#endif
+
+#if !defined(_M_IX86) && !defined(_M_X64)
+# error "This code has only been tested on x86 platforms."
+{ // maybe an unbalanced brace will terminate the compilation
+// You are invited to try Random123 on other architectures, by changing
+// the conditions that reach this error, but you should consider it a
+// porting exercise and expect to encounter bugs and deficiencies.
+// Please let the authors know of any successes (or failures).
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static __inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) _forceinline decl
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) expr
+#endif
+
+// The basic idiom is:
+// #ifndef R123_SOMETHING
+// #if some condition
+// #define R123_SOMETHING 1
+// #else
+// #define R123_SOMETHING 0
+// #endif
+// #endif
+// This idiom allows an external user to override any decision
+// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
+
+// An alternative idiom is:
+// #ifndef R123_SOMETHING
+// #define R123_SOMETHING (some boolean expression)
+// #endif
+// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
+// pp-symbols.
+
+#ifndef R123_USE_AES_NI
+#if defined(_M_X64)
+#define R123_USE_AES_NI 1
+#else
+#define R123_USE_AES_NI 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_2
+#if defined(_M_X64)
+#define R123_USE_SSE4_2 1
+#else
+#define R123_USE_SSE4_2 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_1
+#if defined(_M_X64)
+#define R123_USE_SSE4_1 1
+#else
+#define R123_USE_SSE4_1 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE
+#define R123_USE_SSE 1
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 0
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 1
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#define R123_USE_INTRIN_H 1
+#endif
+
+#ifndef R123_USE_MULHILO16_ASM
+#define R123_USE_MULHILO16_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#if defined(_M_X64)
+#define R123_USE_MULHILO64_MSVC_INTRIN 1
+#else
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+#pragma warning(disable:4244)
+#pragma warning(disable:4996)
+
+// If you add something, it must go in all the other XXfeatures.hpp
+// and in ../ut_features.cpp
+#endif
--- /dev/null
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __r123_nvcc_features_dot_h__
+#define __r123_nvcc_features_dot_h__
+
+#if !defined(CUDART_VERSION)
+#error "why are we in nvccfeatures.h if CUDART_VERSION is not defined"
+#endif
+
+#if CUDART_VERSION < 4010
+#error "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces. Random123 isunsupported. See comments in nvccfeatures.h"
+// This test was added in Random123-1.08 (August, 2013) because we
+// discovered that Ftype(maxTvalue<T>()) with Ftype=double and
+// T=uint64_t in examples/uniform.hpp produces -1 for CUDA4.0 and
+// earlier. We can't be sure this bug doesn't also affect invocations
+// of other templated functions, e.g., essentially all of Random123.
+// Thus, we no longer trust CUDA versions earlier than 4.1 even though
+// we had previously tested and timed Random123 with CUDA 3.x and 4.0.
+// If you feel lucky or desperate, you can change #error to #warning, but
+// please take extra care to be sure that you are getting correct
+// results.
+#endif
+
+// nvcc falls through to gcc or msvc. So first define
+// a couple of things and then include either gccfeatures.h
+// or msvcfeatures.h
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE __device__
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 1
+#endif
+
+#ifndef R123_ASSERT
+#define R123_ASSERT(x) if((x)) ; else asm("trap;")
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) expr
+#endif
+
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+#ifndef R123_USE_SSE4_2
+#define R123_USE_SSE4_2 0
+#endif
+
+#ifndef R123_USE_SSE4_1
+#define R123_USE_SSE4_1 0
+#endif
+
+#ifndef R123_USE_SSE
+#define R123_USE_SSE 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_ULONG_LONG
+// uint64_t, which is what we'd get without this, is
+// not the same as unsigned long long
+#define R123_ULONG_LONG unsigned long long
+#endif
+
+#ifndef R123_THROW
+// No exceptions in CUDA, at least upto 4.0
+#define R123_THROW(x) R123_ASSERT(0)
+#endif
+
+#if defined(__GNUC__)
+#include "gccfeatures.h"
+#elif defined(_MSC_FULL_VER)
+#include "msvcfeatures.h"
+#endif
+
+#endif
--- /dev/null
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __open64features_dot_hpp
+#define __open64features_dot_hpp
+
+/* The gcc features are mostly right. We just override a few and then include gccfeatures.h */
+
+/* Open64 4.2.3 and 4.2.4 accept the __uint128_t code without complaint
+ but produce incorrect code for 64-bit philox. The MULHILO64_ASM
+ seems to work fine */
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 1
+#endif
+
+#include "gccfeatures.h"
+
+#endif
--- /dev/null
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __openclfeatures_dot_hpp
+#define __openclfeatures_dot_hpp
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#define R123_ASSERT(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) expr
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 1
+#endif
+
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+// XXX ATI APP SDK 2.4 clBuildProgram SEGVs if one uses uint64_t instead of
+// ulong to mul_hi. And gets lots of complaints from stdint.h
+// on some machines.
+// But these typedefs mean we cannot include stdint.h with
+// these headers? Do we need R123_64T, R123_32T, R123_8T?
+typedef ulong uint64_t;
+typedef uint uint32_t;
+typedef uchar uint8_t;
+#define UINT64_C(x) ((ulong)(x##UL))
+
+#endif
--- /dev/null
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Copyright (c) 2013, Los Alamos National Security, LLC
+All rights reserved.
+
+Copyright 2013. Los Alamos National Security, LLC. This software was produced
+under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
+the U.S. Department of Energy. The U.S. Government has rights to use,
+reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+to produce derivative works, such modified software should be clearly marked,
+so as not to confuse it with the version available from LANL.
+*/
+#ifndef __pgccfeatures_dot_hpp
+#define __pgccfeatures_dot_hpp
+
+#if !defined(__x86_64__) && !defined(__i386__)
+# error "This code has only been tested on x86 platforms."
+#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
+{ /* maybe an unbalanced brace will terminate the compilation */
+ /* Feel free to try the Random123 library on other architectures by changing
+ the conditions that reach this error, but you should consider it a
+ porting exercise and expect to encounter bugs and deficiencies.
+ Please let the authors know of any successes (or failures). */
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static inline
+#endif
+
+/* Found this example in PGI's emmintrin.h. */
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) (expr)
+#endif
+
+/* PGI through 13.2 doesn't appear to support AES-NI. */
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+/* PGI through 13.2 appears to support MMX, SSE, SSE3, SSE3, SSSE3, SSE4a, and
+ ABM, but not SSE4.1 or SSE4.2. */
+#ifndef R123_USE_SSE4_2
+#define R123_USE_SSE4_2 0
+#endif
+
+#ifndef R123_USE_SSE4_1
+#define R123_USE_SSE4_1 0
+#endif
+
+#ifndef R123_USE_SSE
+/* There's no point in trying to compile SSE code in Random123
+ unless SSE2 is available. */
+#ifdef __SSE2__
+#define R123_USE_SSE 1
+#else
+#define R123_USE_SSE 0
+#endif
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+/* There isn't really a good way to tell at compile time whether
+ openssl is available. Without a pre-compilation configure-like
+ tool, it's less error-prone to guess that it isn't available. Add
+ -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
+ play with openssl */
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 1
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+/* emmintrin.h from PGI #includes xmmintrin.h but then complains at link time
+ about undefined references to _mm_castsi128_ps(__m128i). Why? */
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#ifdef __ABM__
+#define R123_USE_INTRIN_H 1
+#else
+#define R123_USE_INTRIN_H 0
+#endif
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MULHI_INTRIN
+#define R123_USE_MULHILO64_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 1
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+/* If you add something, it must go in all the other XXfeatures.hpp
+ and in ../ut_features.cpp */
+#endif
--- /dev/null
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _Random123_sse_dot_h__
+#define _Random123_sse_dot_h__
+
+#if R123_USE_SSE
+
+#if R123_USE_X86INTRIN_H
+#include <x86intrin.h>
+#endif
+#if R123_USE_IA32INTRIN_H
+#include <ia32intrin.h>
+#endif
+#if R123_USE_XMMINTRIN_H
+#include <xmmintrin.h>
+#endif
+#if R123_USE_EMMINTRIN_H
+#include <emmintrin.h>
+#endif
+#if R123_USE_SMMINTRIN_H
+#include <smmintrin.h>
+#endif
+#if R123_USE_WMMINTRIN_H
+#include <wmmintrin.h>
+#endif
+#if R123_USE_INTRIN_H
+#include <intrin.h>
+#endif
+#ifdef __cplusplus
+#include <iostream>
+#include <limits>
+#include <stdexcept>
+#endif
+
+#if R123_USE_ASM_GNU
+
+/* bit25 of CX tells us whether AES is enabled. */
+R123_STATIC_INLINE int haveAESNI(){
+ unsigned int eax, ebx, ecx, edx;
+ __asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) :
+ "a" (1));
+ return (ecx>>25) & 1;
+}
+#elif R123_USE_CPUID_MSVC
+R123_STATIC_INLINE int haveAESNI(){
+ int CPUInfo[4];
+ __cpuid(CPUInfo, 1);
+ return (CPUInfo[2]>>25)&1;
+}
+#else /* R123_USE_CPUID_??? */
+#warning "No R123_USE_CPUID_XXX method chosen. haveAESNI will always return false"
+R123_STATIC_INLINE int haveAESNI(){
+ return 0;
+}
+#endif /* R123_USE_ASM_GNU || R123_USE_CPUID_MSVC */
+
+// There is a lot of annoying and inexplicable variation in the
+// SSE intrinsics available in different compilation environments.
+// The details seem to depend on the compiler, the version and
+// the target architecture. Rather than insisting on
+// R123_USE_feature tests for each of these in each of the
+// compilerfeatures.h files we just keep the complexity localized
+// to here...
+#if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64))
+/* Is there an intrinsic to assemble an __m128i from two 64-bit words?
+ If not, use the 4x32-bit intrisic instead. N.B. It looks like Intel
+ added _mm_set_epi64x to icc version 12.1 in Jan 2012.
+*/
+R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){
+ union{
+ uint64_t u64;
+ uint32_t u32[2];
+ } u1, u0;
+ u1.u64 = v1;
+ u0.u64 = v0;
+ return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]);
+}
+#endif
+/* _mm_extract_lo64 abstracts the task of extracting the low 64-bit
+ word from an __m128i. The _mm_cvtsi128_si64 intrinsic does the job
+ on 64-bit platforms. Unfortunately, both MSVC and Open64 fail
+ assertions in ut_M128.cpp and ut_carray.cpp when we use the
+ _mm_cvtsi128_si64 intrinsic. (See
+ https://bugs.open64.net/show_bug.cgi?id=873 for the Open64 bug).
+ On 32-bit platforms, there's no MOVQ, so there's no intrinsic.
+ Finally, even if the intrinsic exists, it may be spelled with or
+ without the 'x'.
+*/
+#if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
+R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
+ union{
+ uint64_t u64[2];
+ __m128i m;
+ }u;
+ _mm_store_si128(&u.m, si);
+ return u.u64[0];
+}
+#elif defined(__llvm__) || defined(__ICC)
+R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
+ return (uint64_t)_mm_cvtsi128_si64(si);
+}
+#else /* GNUC, others */
+/* FWIW, gcc's emmintrin.h has had the 'x' spelling
+ since at least gcc-3.4.4. The no-'x' spelling showed up
+ around 4.2. */
+R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
+ return (uint64_t)_mm_cvtsi128_si64x(si);
+}
+#endif
+#if defined(__GNUC__) && __GNUC__ < 4
+/* the cast builtins showed up in gcc4. */
+R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){
+ return (__m128)si;
+}
+#endif
+
+#ifdef __cplusplus
+
+struct r123m128i{
+ __m128i m;
+#if R123_USE_CXX11_UNRESTRICTED_UNIONS
+ // C++98 forbids a union member from having *any* constructors.
+ // C++11 relaxes this, and allows union members to have constructors
+ // as long as there is a "trivial" default construtor. So in C++11
+ // we can provide a r123m128i constructor with an __m128i argument, and still
+ // have the default (and hence trivial) default constructor.
+ r123m128i() = default;
+ r123m128i(__m128i _m): m(_m){}
+#endif
+ r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;}
+ r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;}
+#if R123_USE_CXX11_EXPLICIT_CONVERSIONS
+ // With C++0x we can attach explicit to the bool conversion operator
+ // to disambiguate undesired promotions. For g++, this works
+ // only in 4.5 and above.
+ explicit operator bool() const {return _bool();}
+#else
+ // Pre-C++0x, we have to do something else. Google for the "safe bool"
+ // idiom for other ideas...
+ operator const void*() const{return _bool()?this:0;}
+#endif
+ operator __m128i() const {return m;}
+
+private:
+#if R123_USE_SSE4_1
+ bool _bool() const{ return !_mm_testz_si128(m,m); }
+#else
+ bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
+#endif
+};
+
+R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){
+ __m128i& c = v.m;
+ __m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1));
+ c = _mm_add_epi64(c, zeroone);
+ //return c;
+#if R123_USE_SSE4_1
+ __m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0)));
+ if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){
+ __m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0));
+ c = _mm_add_epi64(c, onezero);
+ }
+#else
+ unsigned mask = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128())));
+ // The low two bits of mask are 11 iff the low 64 bits of
+ // c are zero.
+ if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){
+ __m128i onezero = _mm_set_epi64x(1,0);
+ c = _mm_add_epi64(c, onezero);
+ }
+#endif
+ return v;
+}
+
+R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){
+ __m128i c = lhs.m;
+ __m128i incr128 = _mm_set_epi64x(0, n);
+ c = _mm_add_epi64(c, incr128);
+ // return c; // NO CARRY!
+
+ int64_t lo64 = _mm_extract_lo64(c);
+ if((uint64_t)lo64 < n)
+ c = _mm_add_epi64(c, _mm_set_epi64x(1,0));
+ lhs.m = c;
+ return lhs;
+}
+
+// We need this one because it's present, but never used in r123array1xm128i::incr
+R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i &){
+ throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");}
+
+// The comparisons aren't implemented, but if we leave them out, and
+// somebody writes, e.g., M1 < M2, the compiler will do an implicit
+// conversion through void*. Sigh...
+R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&){
+ throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");}
+R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&){
+ throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");}
+R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&){
+ throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");}
+R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&){
+ throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");}
+
+R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){
+ return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); }
+R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){
+ return !(lhs==rhs);}
+R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){
+ r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; }
+R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){
+ return !(lhs==rhs);}
+R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){
+ union{
+ uint64_t u64[2];
+ __m128i m;
+ }u;
+ _mm_storeu_si128(&u.m, m.m);
+ return os << u.u64[0] << " " << u.u64[1];
+}
+
+R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){
+ uint64_t u64[2];
+ is >> u64[0] >> u64[1];
+ m.m = _mm_set_epi64x(u64[1], u64[0]);
+ return is;
+}
+
+template<typename T> inline T assemble_from_u32(uint32_t *p32); // forward declaration
+
+template <>
+inline r123m128i assemble_from_u32<r123m128i>(uint32_t *p32){
+ r123m128i ret;
+ ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]);
+ return ret;
+}
+
+#else
+
+typedef struct {
+ __m128i m;
+} r123m128i;
+
+#endif /* __cplusplus */
+
+#else /* !R123_USE_SSE */
+R123_STATIC_INLINE int haveAESNI(){
+ return 0;
+}
+#endif /* R123_USE_SSE */
+
+#endif /* _Random123_sse_dot_h__ */
--- /dev/null
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __sunprofeatures_dot_hpp
+#define __sunprofeatures_dot_hpp
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) expr
+#endif
+
+// The basic idiom is:
+// #ifndef R123_SOMETHING
+// #if some condition
+// #define R123_SOMETHING 1
+// #else
+// #define R123_SOMETHING 0
+// #endif
+// #endif
+// This idiom allows an external user to override any decision
+// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
+
+// An alternative idiom is:
+// #ifndef R123_SOMETHING
+// #define R123_SOMETHING (some boolean expression)
+// #endif
+// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
+// pp-symbols.
+
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+#ifndef R123_USE_SSE4_2
+#define R123_USE_SSE4_2 0
+#endif
+
+#ifndef R123_USE_SSE4_1
+#define R123_USE_SSE4_1 0
+#endif
+
+#ifndef R123_USE_SSE
+#define R123_USE_SSE 0
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 0
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#define R123_USE_INTRIN_H 0
+#endif
+
+#ifndef R123_USE_MULHILO16_ASM
+#define R123_USE_MULHILO16_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef R123_USE_PHILOX_64BIT
+#define R123_USE_PHILOX_64BIT 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+// If you add something, it must go in all the other XXfeatures.hpp
+// and in ../ut_features.cpp
+#endif
--- /dev/null
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Copyright (c) 2013, Los Alamos National Security, LLC
+All rights reserved.
+
+Copyright 2013. Los Alamos National Security, LLC. This software was produced
+under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
+the U.S. Department of Energy. The U.S. Government has rights to use,
+reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+to produce derivative works, such modified software should be clearly marked,
+so as not to confuse it with the version available from LANL.
+*/
+#ifndef __xlcfeatures_dot_hpp
+#define __xlcfeatures_dot_hpp
+
+#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__)
+# error "This code has only been tested on x86 and PowerPC platforms."
+#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
+{ /* maybe an unbalanced brace will terminate the compilation */
+ /* Feel free to try the Random123 library on other architectures by changing
+ the conditions that reach this error, but you should consider it a
+ porting exercise and expect to encounter bugs and deficiencies.
+ Please let the authors know of any successes (or failures). */
+#endif
+
+#ifdef __cplusplus
+/* builtins are automatically available to xlc. To use them with xlc++,
+ one must include builtins.h. c.f
+ http://publib.boulder.ibm.com/infocenter/cellcomp/v101v121/index.jsp?topic=/com.ibm.xlcpp101.cell.doc/compiler_ref/compiler_builtins.html
+*/
+#include <builtins.h>
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
+#endif
+
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+#ifndef R123_USE_SSE4_2
+#define R123_USE_SSE4_2 0
+#endif
+
+#ifndef R123_USE_SSE4_1
+#define R123_USE_SSE4_1 0
+#endif
+
+#ifndef R123_USE_SSE
+#define R123_USE_SSE 0
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+/* There isn't really a good way to tell at compile time whether
+ openssl is available. Without a pre-compilation configure-like
+ tool, it's less error-prone to guess that it isn't available. Add
+ -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
+ play with openssl */
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 1
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#ifdef __ABM__
+#define R123_USE_INTRIN_H 1
+#else
+#define R123_USE_INTRIN_H 0
+#endif
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MULHI_INTRIN
+#define R123_USE_MULHILO64_MULHI_INTRIN (defined(__powerpc64__))
+#endif
+
+#ifndef R123_MULHILO64_MULHI_INTRIN
+#define R123_MULHILO64_MULHI_INTRIN __mulhdu
+#endif
+
+#ifndef R123_USE_MULHILO32_MULHI_INTRIN
+#define R123_USE_MULHILO32_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_MULHILO32_MULHI_INTRIN
+#define R123_MULHILO32_MULHI_INTRIN __mulhwu
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM (defined(__powerpc64__) && !(R123_USE_MULHILO64_MULHI_INTRIN))
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+/* If you add something, it must go in all the other XXfeatures.hpp
+ and in ../ut_features.cpp */
+#endif
--- /dev/null
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _threefry_dot_h_
+#define _threefry_dot_h_
+#include "features/compilerfeatures.h"
+#include "array.h"
+
+/** \cond HIDDEN_FROM_DOXYGEN */
+/* Significant parts of this file were copied from
+ from:
+ Skein_FinalRnd/ReferenceImplementation/skein.h
+ Skein_FinalRnd/ReferenceImplementation/skein_block.c
+
+ in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip
+
+ This file has been modified so that it may no longer perform its originally
+ intended function. If you're looking for a Skein or Threefish source code,
+ please consult the original file.
+
+ The original file had the following header:
+**************************************************************************
+**
+** Interface declarations and internal definitions for Skein hashing.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+***************************************************************************
+
+*/
+
+/* See comment at the top of philox.h for the macro pre-process
+ strategy. */
+
+/* Rotation constants: */
+enum r123_enum_threefry64x4 {
+ /* These are the R_256 constants from the Threefish reference sources
+ with names changed to R_64x4... */
+ R_64x4_0_0=14, R_64x4_0_1=16,
+ R_64x4_1_0=52, R_64x4_1_1=57,
+ R_64x4_2_0=23, R_64x4_2_1=40,
+ R_64x4_3_0= 5, R_64x4_3_1=37,
+ R_64x4_4_0=25, R_64x4_4_1=33,
+ R_64x4_5_0=46, R_64x4_5_1=12,
+ R_64x4_6_0=58, R_64x4_6_1=22,
+ R_64x4_7_0=32, R_64x4_7_1=32
+};
+
+enum r123_enum_threefry64x2 {
+ /*
+ // Output from skein_rot_search: (srs64_B64-X1000)
+ // Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
+ // Start: Tue Mar 1 10:07:48 2011
+ // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
+ */
+ R_64x2_0_0=16,
+ R_64x2_1_0=42,
+ R_64x2_2_0=12,
+ R_64x2_3_0=31,
+ R_64x2_4_0=16,
+ R_64x2_5_0=32,
+ R_64x2_6_0=24,
+ R_64x2_7_0=21
+ /* 4 rounds: minHW = 4 [ 4 4 4 4 ]
+ // 5 rounds: minHW = 8 [ 8 8 8 8 ]
+ // 6 rounds: minHW = 16 [ 16 16 16 16 ]
+ // 7 rounds: minHW = 32 [ 32 32 32 32 ]
+ // 8 rounds: minHW = 64 [ 64 64 64 64 ]
+ // 9 rounds: minHW = 64 [ 64 64 64 64 ]
+ //10 rounds: minHW = 64 [ 64 64 64 64 ]
+ //11 rounds: minHW = 64 [ 64 64 64 64 ] */
+};
+
+enum r123_enum_threefry32x4 {
+ /* Output from skein_rot_search: (srs-B128-X5000.out)
+ // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
+ // Start: Mon Aug 24 22:41:36 2009
+ // ...
+ // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */
+ R_32x4_0_0=10, R_32x4_0_1=26,
+ R_32x4_1_0=11, R_32x4_1_1=21,
+ R_32x4_2_0=13, R_32x4_2_1=27,
+ R_32x4_3_0=23, R_32x4_3_1= 5,
+ R_32x4_4_0= 6, R_32x4_4_1=20,
+ R_32x4_5_0=17, R_32x4_5_1=11,
+ R_32x4_6_0=25, R_32x4_6_1=10,
+ R_32x4_7_0=18, R_32x4_7_1=20
+
+ /* 4 rounds: minHW = 3 [ 3 3 3 3 ]
+ // 5 rounds: minHW = 7 [ 7 7 7 7 ]
+ // 6 rounds: minHW = 12 [ 13 12 13 12 ]
+ // 7 rounds: minHW = 22 [ 22 23 22 23 ]
+ // 8 rounds: minHW = 31 [ 31 31 31 31 ]
+ // 9 rounds: minHW = 32 [ 32 32 32 32 ]
+ //10 rounds: minHW = 32 [ 32 32 32 32 ]
+ //11 rounds: minHW = 32 [ 32 32 32 32 ] */
+
+};
+
+enum r123_enum_threefry32x2 {
+ /* Output from skein_rot_search (srs32x2-X5000.out)
+ // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
+ // Start: Tue Jul 12 11:11:33 2011
+ // rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format */
+ R_32x2_0_0=13,
+ R_32x2_1_0=15,
+ R_32x2_2_0=26,
+ R_32x2_3_0= 6,
+ R_32x2_4_0=17,
+ R_32x2_5_0=29,
+ R_32x2_6_0=16,
+ R_32x2_7_0=24
+
+ /* 4 rounds: minHW = 4 [ 4 4 4 4 ]
+ // 5 rounds: minHW = 6 [ 6 8 6 8 ]
+ // 6 rounds: minHW = 9 [ 9 12 9 12 ]
+ // 7 rounds: minHW = 16 [ 16 24 16 24 ]
+ // 8 rounds: minHW = 32 [ 32 32 32 32 ]
+ // 9 rounds: minHW = 32 [ 32 32 32 32 ]
+ //10 rounds: minHW = 32 [ 32 32 32 32 ]
+ //11 rounds: minHW = 32 [ 32 32 32 32 ] */
+ };
+
+enum r123_enum_threefry_wcnt {
+ WCNT2=2,
+ WCNT4=4
+};
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N));
+R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N)
+{
+ return (x << (N & 63)) | (x >> ((64-N) & 63));
+}
+
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N));
+R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N)
+{
+ return (x << (N & 31)) | (x >> ((32-N) & 31));
+}
+
+#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
+#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
+#define SKEIN_KS_PARITY32 0x1BD11BDA
+
+#ifndef THREEFRY2x32_DEFAULT_ROUNDS
+#define THREEFRY2x32_DEFAULT_ROUNDS 20
+#endif
+
+#ifndef THREEFRY2x64_DEFAULT_ROUNDS
+#define THREEFRY2x64_DEFAULT_ROUNDS 20
+#endif
+
+#ifndef THREEFRY4x32_DEFAULT_ROUNDS
+#define THREEFRY4x32_DEFAULT_ROUNDS 20
+#endif
+
+#ifndef THREEFRY4x64_DEFAULT_ROUNDS
+#define THREEFRY4x64_DEFAULT_ROUNDS 20
+#endif
+
+#define _threefry2x_tpl(W) \
+typedef struct r123array2x##W threefry2x##W##_ctr_t; \
+typedef struct r123array2x##W threefry2x##W##_key_t; \
+typedef struct r123array2x##W threefry2x##W##_ukey_t; \
+R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
+R123_CUDA_DEVICE R123_STATIC_INLINE \
+threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
+ threefry2x##W##_ctr_t X; \
+ uint##W##_t ks[2+1]; \
+ int i; /* avoid size_t to avoid need for stddef.h */ \
+ R123_ASSERT(Nrounds<=32); \
+ ks[2] = SKEIN_KS_PARITY##W; \
+ for (i=0;i < 2; i++) \
+ { \
+ ks[i] = k.v[i]; \
+ X.v[i] = in.v[i]; \
+ ks[2] ^= k.v[i]; \
+ } \
+ \
+ /* Insert initial key before round 0 */ \
+ X.v[0] += ks[0]; X.v[1] += ks[1]; \
+ \
+ if(Nrounds>0){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>1){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>2){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>3){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>3){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[1]; X.v[1] += ks[2]; \
+ X.v[1] += 1; /* X.v[2-1] += r */ \
+ } \
+ if(Nrounds>4){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>5){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>6){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>7){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>7){ \
+ /* InjectKey(r=2) */ \
+ X.v[0] += ks[2]; X.v[1] += ks[0]; \
+ X.v[1] += 2; \
+ } \
+ if(Nrounds>8){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>9){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>10){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>11){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>11){ \
+ /* InjectKey(r=3) */ \
+ X.v[0] += ks[0]; X.v[1] += ks[1]; \
+ X.v[1] += 3; \
+ } \
+ if(Nrounds>12){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>13){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>14){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>15){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>15){ \
+ /* InjectKey(r=4) */ \
+ X.v[0] += ks[1]; X.v[1] += ks[2]; \
+ X.v[1] += 4; \
+ } \
+ if(Nrounds>16){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>17){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>18){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>19){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>19){ \
+ /* InjectKey(r=5) */ \
+ X.v[0] += ks[2]; X.v[1] += ks[0]; \
+ X.v[1] += 5; \
+ } \
+ if(Nrounds>20){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>21){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>22){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>23){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>23){ \
+ /* InjectKey(r=6) */ \
+ X.v[0] += ks[0]; X.v[1] += ks[1]; \
+ X.v[1] += 6; \
+ } \
+ if(Nrounds>24){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>25){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>26){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>27){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>27){ \
+ /* InjectKey(r=7) */ \
+ X.v[0] += ks[1]; X.v[1] += ks[2]; \
+ X.v[1] += 7; \
+ } \
+ if(Nrounds>28){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>29){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>30){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>31){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
+ if(Nrounds>31){ \
+ /* InjectKey(r=8) */ \
+ X.v[0] += ks[2]; X.v[1] += ks[0]; \
+ X.v[1] += 8; \
+ } \
+ return X; \
+} \
+ /** @ingroup ThreefryNxW */ \
+enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
+R123_CUDA_DEVICE R123_STATIC_INLINE \
+threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
+ return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \
+}
+
+
+#define _threefry4x_tpl(W) \
+typedef struct r123array4x##W threefry4x##W##_ctr_t; \
+typedef struct r123array4x##W threefry4x##W##_key_t; \
+typedef struct r123array4x##W threefry4x##W##_ukey_t; \
+R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
+R123_CUDA_DEVICE R123_STATIC_INLINE \
+threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
+ threefry4x##W##_ctr_t X; \
+ uint##W##_t ks[4+1]; \
+ int i; /* avoid size_t to avoid need for stddef.h */ \
+ R123_ASSERT(Nrounds<=72); \
+ ks[4] = SKEIN_KS_PARITY##W; \
+ for (i=0;i < 4; i++) \
+ { \
+ ks[i] = k.v[i]; \
+ X.v[i] = in.v[i]; \
+ ks[4] ^= k.v[i]; \
+ } \
+ \
+ /* Insert initial key before round 0 */ \
+ X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
+ \
+ if(Nrounds>0){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>1){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>2){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>3){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>3){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
+ X.v[4-1] += 1; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>4){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>5){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>6){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>7){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>7){ \
+ /* InjectKey(r=2) */ \
+ X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
+ X.v[4-1] += 2; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>8){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>9){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>10){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>11){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>11){ \
+ /* InjectKey(r=3) */ \
+ X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
+ X.v[4-1] += 3; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>12){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>13){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>14){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>15){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>15){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
+ X.v[4-1] += 4; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>16){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>17){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>18){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>19){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>19){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
+ X.v[4-1] += 5; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>20){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>21){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>22){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>23){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>23){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
+ X.v[4-1] += 6; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>24){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>25){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>26){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>27){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>27){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
+ X.v[4-1] += 7; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>28){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>29){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>30){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>31){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>31){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
+ X.v[4-1] += 8; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>32){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>33){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>34){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>35){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>35){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
+ X.v[4-1] += 9; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>36){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>37){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>38){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>39){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>39){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
+ X.v[4-1] += 10; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>40){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>41){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>42){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>43){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>43){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
+ X.v[4-1] += 11; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>44){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>45){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>46){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>47){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>47){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
+ X.v[4-1] += 12; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>48){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>49){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>50){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>51){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>51){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
+ X.v[4-1] += 13; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>52){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>53){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>54){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>55){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>55){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
+ X.v[4-1] += 14; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>56){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>57){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>58){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>59){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>59){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
+ X.v[4-1] += 15; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>60){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>61){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>62){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>63){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>63){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
+ X.v[4-1] += 16; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>64){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>65){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>66){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>67){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>67){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
+ X.v[4-1] += 17; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ if(Nrounds>68){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>69){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>70){ \
+ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+ X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+ } \
+ if(Nrounds>71){ \
+ X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+ X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+ } \
+ if(Nrounds>71){ \
+ /* InjectKey(r=1) */ \
+ X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
+ X.v[4-1] += 18; /* X.v[WCNT4-1] += r */ \
+ } \
+ \
+ return X; \
+} \
+ /** @ingroup ThreefryNxW */ \
+enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
+R123_CUDA_DEVICE R123_STATIC_INLINE \
+threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
+ return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \
+}
+/** \endcond */
+
+_threefry2x_tpl(64)
+_threefry2x_tpl(32)
+_threefry4x_tpl(64)
+_threefry4x_tpl(32)
+
+/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better
+ than a static inline function. Why? */
+#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k)
+#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k)
+#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k)
+#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k)
+
+#ifdef __cplusplus
+/** \cond HIDDEN_FROM_DOXYGEN */
+#define _threefryNxWclass_tpl(NxW) \
+namespace r123{ \
+template<unsigned int R> \
+ struct Threefry##NxW##_R{ \
+ typedef threefry##NxW##_ctr_t ctr_type; \
+ typedef threefry##NxW##_key_t key_type; \
+ typedef threefry##NxW##_key_t ukey_type; \
+ static const unsigned int rounds=R; \
+ inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \
+ R123_STATIC_ASSERT(R<=72, "threefry is only unrolled up to 72 rounds\n"); \
+ return threefry##NxW##_R(R, ctr, key); \
+ } \
+}; \
+ typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \
+} // namespace r123
+
+/** \endcond */
+
+_threefryNxWclass_tpl(2x32)
+_threefryNxWclass_tpl(4x32)
+_threefryNxWclass_tpl(2x64)
+_threefryNxWclass_tpl(4x64)
+
+/* The _tpl macros don't quite work to do string-pasting inside comments.
+ so we just write out the boilerplate documentation four times... */
+
+/**
+@defgroup ThreefryNxW Threefry Classes and Typedefs
+
+The ThreefryNxW classes export the member functions, typedefs and
+operator overloads required by a @ref CBRNG "CBRNG" class.
+
+As described in
+<a href="http://dl.acm.org/citation.cfm?doid=2063405"><i>Parallel Random Numbers: As Easy as 1, 2, 3</i> </a>,
+the Threefry family is closely related to the Threefish block cipher from
+<a href="http://www.skein-hash.info/"> Skein Hash Function</a>.
+Threefry is \b not suitable for cryptographic use.
+
+Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output.
+
+@class r123::Threefry2x32_R
+@ingroup ThreefryNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Threefry round
+function will be applied.
+
+As of September 2011, the authors know of no statistical flaws with
+ROUNDS=13 or more for Threefry2x32.
+
+@typedef r123::Threefry2x32
+@ingroup ThreefryNxW
+ Threefry2x32 is equivalent to Threefry2x32_R<20>. With 20 rounds,
+ Threefry2x32 has a considerable safety margin over the minimum number
+ of rounds with no known statistical flaws, but still has excellent
+ performance.
+
+@class r123::Threefry2x64_R
+@ingroup ThreefryNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Threefry round
+function will be applied.
+
+In November 2011, the authors discovered that 13 rounds of
+Threefry2x64 sequenced by strided, interleaved key and counter
+increments failed a very long (longer than the default BigCrush
+length) WeightDistrub test. At the same time, it was confirmed that
+14 rounds passes much longer tests (up to 5x10^12 samples) of a
+similar nature. The authors know of no statistical flaws with
+ROUNDS=14 or more for Threefry2x64.
+
+@typedef r123::Threefry2x64
+@ingroup ThreefryNxW
+ Threefry2x64 is equivalent to Threefry2x64_R<20>. With 20 rounds,
+ Threefry2x64 has a considerable safety margin over the minimum number
+ of rounds with no known statistical flaws, but still has excellent
+ performance.
+
+
+
+@class r123::Threefry4x32_R
+@ingroup ThreefryNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Threefry round
+function will be applied.
+
+As of September 2011, the authors know of no statistical flaws with
+ROUNDS=12 or more for Threefry4x32.
+
+@typedef r123::Threefry4x32
+@ingroup ThreefryNxW
+ Threefry4x32 is equivalent to Threefry4x32_R<20>. With 20 rounds,
+ Threefry4x32 has a considerable safety margin over the minimum number
+ of rounds with no known statistical flaws, but still has excellent
+ performance.
+
+
+
+@class r123::Threefry4x64_R
+@ingroup ThreefryNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Threefry round
+function will be applied.
+
+As of September 2011, the authors know of no statistical flaws with
+ROUNDS=12 or more for Threefry4x64.
+
+@typedef r123::Threefry4x64
+@ingroup ThreefryNxW
+ Threefry4x64 is equivalent to Threefry4x64_R<20>. With 20 rounds,
+ Threefry4x64 has a considerable safety margin over the minimum number
+ of rounds with no known statistical flaws, but still has excellent
+ performance.
+*/
+
+#endif
+
+#endif
t_state *state_global,
gmx_mtop_t *top_global,
t_forcerec *fr,
- gmx_update_t upd,
gmx_mdoutf_t outf,
t_mdebin *mdebin,
gmx_ekindata_t *ekind,
rvec *f,
rvec *f_global,
gmx_wallcycle_t wcycle,
- gmx_rng_t mcrng,
int *nchkpt,
gmx_bool bCPT,
gmx_bool bRerunMD,
wallcycle_start(wcycle, ewcTRAJ);
if (bCPT)
{
- if (state->flags & (1<<estLD_RNG))
- {
- get_stochd_state(upd, state);
- }
- if (state->flags & (1<<estMC_RNG))
- {
- get_mc_state(mcrng, state);
- }
if (MASTER(cr))
{
if (bSumEkinhOld)
#include "mdoutf.h"
#include "../legacyheaders/types/simple.h"
#include "../legacyheaders/types/commrec.h"
-#include "../legacyheaders/update.h"
#include "../legacyheaders/mdebin.h"
/*! \brief Wrapper routine for writing trajectories during mdrun
t_state *state_global,
gmx_mtop_t *top_global,
t_forcerec *fr,
- gmx_update_t upd,
gmx_mdoutf_t outf,
t_mdebin *mdebin,
gmx_ekindata_t *ekind,
rvec *f,
rvec *f_global,
gmx_wallcycle_t wcycle,
- gmx_rng_t mcrng,
int *nchkpt,
gmx_bool bCPT,
gmx_bool bRerunMD,
#include "txtdump.h"
#include "vec.h"
#include "network.h"
-#include "gromacs/random/random.h"
#include "checkpoint.h"
#include "main.h"
#include "string2.h"
static int do_cpt_state(XDR *xd, gmx_bool bRead,
int fflags, t_state *state,
- gmx_bool bReadRNG, FILE *list)
+ FILE *list)
{
int sflags;
- int **rng_p, **rngi_p;
int i;
int ret;
int nnht, nnhtp;
nnht = state->nhchainlength*state->ngtc;
nnhtp = state->nhchainlength*state->nnhpres;
- if (bReadRNG)
- {
- rng_p = (int **)&state->ld_rng;
- rngi_p = &state->ld_rngi;
- }
- else
- {
- /* Do not read the RNG data */
- rng_p = NULL;
- rngi_p = NULL;
- }
-
if (bRead) /* we need to allocate space for dfhist if we are reading */
{
init_df_history(&state->dfhist, state->dfhist.nlambda);
}
- /* We want the MC_RNG the same across all the notes for now -- lambda MC is global */
-
sflags = state->flags;
for (i = 0; (i < estNR && ret == 0); i++)
{
case estX: ret = do_cpte_rvecs(xd, cptpEST, i, sflags, state->natoms, &state->x, list); break;
case estV: ret = do_cpte_rvecs(xd, cptpEST, i, sflags, state->natoms, &state->v, list); break;
case estSDX: ret = do_cpte_rvecs(xd, cptpEST, i, sflags, state->natoms, &state->sd_X, list); break;
- case estLD_RNG: ret = do_cpte_ints(xd, cptpEST, i, sflags, state->nrng, rng_p, list); break;
- case estLD_RNGI: ret = do_cpte_ints(xd, cptpEST, i, sflags, state->nrngi, rngi_p, list); break;
- case estMC_RNG: ret = do_cpte_ints(xd, cptpEST, i, sflags, state->nmcrng, (int **)&state->mc_rng, list); break;
- case estMC_RNGI: ret = do_cpte_ints(xd, cptpEST, i, sflags, 1, &state->mc_rngi, list); break;
+ /* The RNG entries are no longer written,
+ * the next 4 lines are only for reading old files.
+ */
+ case estLD_RNG: ret = do_cpte_ints(xd, cptpEST, i, sflags, 0, NULL, list); break;
+ case estLD_RNGI: ret = do_cpte_ints(xd, cptpEST, i, sflags, 0, NULL, list); break;
+ case estMC_RNG: ret = do_cpte_ints(xd, cptpEST, i, sflags, 0, NULL, list); break;
+ case estMC_RNGI: ret = do_cpte_ints(xd, cptpEST, i, sflags, 0, NULL, list); break;
case estDISRE_INITF: ret = do_cpte_real (xd, cptpEST, i, sflags, &state->hist.disre_initf, list); break;
case estDISRE_RM3TAV: ret = do_cpte_n_reals(xd, cptpEST, i, sflags, &state->hist.ndisrepairs, &state->hist.disre_rm3tav, list); break;
case estORIRE_INITF: ret = do_cpte_real (xd, cptpEST, i, sflags, &state->hist.orire_initf, list); break;
sfree(bhost);
sfree(fprog);
- if ((do_cpt_state(gmx_fio_getxdr(fp), FALSE, state->flags, state, TRUE, NULL) < 0) ||
+ if ((do_cpt_state(gmx_fio_getxdr(fp), FALSE, state->flags, state, NULL) < 0) ||
(do_cpt_ekinstate(gmx_fio_getxdr(fp), flags_eks, &state->ekinstate, NULL) < 0) ||
(do_cpt_enerhist(gmx_fio_getxdr(fp), FALSE, flags_enh, &state->enerhist, NULL) < 0) ||
(do_cpt_df_hist(gmx_fio_getxdr(fp), flags_dfh, &state->dfhist, NULL) < 0) ||
static void read_checkpoint(const char *fn, FILE **pfplog,
t_commrec *cr, ivec dd_nc,
int eIntegrator, int *init_fep_state, gmx_int64_t *step, double *t,
- t_state *state, gmx_bool *bReadRNG, gmx_bool *bReadEkin,
+ t_state *state, gmx_bool *bReadEkin,
int *simulation_part,
gmx_bool bAppendOutputFiles, gmx_bool bForceAppend)
{
const char *int_warn =
"WARNING: The checkpoint file was generated with integrator %s,\n"
" while the simulation uses integrator %s\n\n";
- const char *sd_note =
- "NOTE: The checkpoint file was for %d nodes doing SD or BD,\n"
- " while the simulation uses %d SD or BD nodes,\n"
- " continuation will be exact, except for the random state\n\n";
#if !defined __native_client__ && !defined GMX_NATIVE_WINDOWS
fl.l_type = F_WRLCK;
nppnodes = -1;
}
- if ((EI_SD(eIntegrator) || eIntegrator == eiBD) && nppnodes > 0)
- {
- /* Correct the RNG state size for the number of PP nodes.
- * Such assignments should all be moved to one central function.
- */
- state->nrng = nppnodes*gmx_rng_n();
- state->nrngi = nppnodes;
- }
-
- *bReadRNG = TRUE;
if (fflags != state->flags)
{
}
else
{
- if ((EI_SD(eIntegrator) || eIntegrator == eiBD) &&
- nppnodes != nppnodes_f)
- {
- *bReadRNG = FALSE;
- if (MASTER(cr))
- {
- fprintf(stderr, sd_note, nppnodes_f, nppnodes);
- }
- if (fplog)
- {
- fprintf(fplog, sd_note, nppnodes_f, nppnodes);
- }
- }
if (MASTER(cr))
{
check_match(fplog, version, btime, buser, bhost, double_prec, fprog,
cr, nppnodes_f, npmenodes_f, dd_nc, dd_nc_f);
}
}
- ret = do_cpt_state(gmx_fio_getxdr(fp), TRUE, fflags, state, *bReadRNG, NULL);
+ ret = do_cpt_state(gmx_fio_getxdr(fp), TRUE, fflags, state, NULL);
*init_fep_state = state->fep_state; /* there should be a better way to do this than setting it here.
Investigate for 5.0. */
if (ret)
void load_checkpoint(const char *fn, FILE **fplog,
t_commrec *cr, ivec dd_nc,
t_inputrec *ir, t_state *state,
- gmx_bool *bReadRNG, gmx_bool *bReadEkin,
+ gmx_bool *bReadEkin,
gmx_bool bAppend, gmx_bool bForceAppend)
{
gmx_int64_t step;
/* Read the state from the checkpoint file */
read_checkpoint(fn, fplog,
cr, dd_nc,
- ir->eI, &(ir->fepvals->init_fep_state), &step, &t, state, bReadRNG, bReadEkin,
+ ir->eI, &(ir->fepvals->init_fep_state), &step, &t, state, bReadEkin,
&ir->simulation_part, bAppend, bForceAppend);
}
if (PAR(cr))
gmx_bcast(sizeof(cr->npmenodes), &cr->npmenodes, cr);
gmx_bcast(DIM*sizeof(dd_nc[0]), dd_nc, cr);
gmx_bcast(sizeof(step), &step, cr);
- gmx_bcast(sizeof(*bReadRNG), bReadRNG, cr);
gmx_bcast(sizeof(*bReadEkin), bReadEkin, cr);
}
ir->bContinuation = TRUE;
static void read_checkpoint_data(t_fileio *fp, int *simulation_part,
gmx_int64_t *step, double *t, t_state *state,
- gmx_bool bReadRNG,
int *nfiles, gmx_file_position_t **outputfiles)
{
int file_version;
&(state->dfhist.nlambda), &state->flags, &flags_eks, &flags_enh, &flags_dfh,
&state->edsamstate.nED, &state->swapstate.eSwapCoords, NULL);
ret =
- do_cpt_state(gmx_fio_getxdr(fp), TRUE, state->flags, state, bReadRNG, NULL);
+ do_cpt_state(gmx_fio_getxdr(fp), TRUE, state->flags, state, NULL);
if (ret)
{
cp_error();
t_fileio *fp;
fp = gmx_fio_open(fn, "r");
- read_checkpoint_data(fp, simulation_part, step, t, state, FALSE, NULL, NULL);
+ read_checkpoint_data(fp, simulation_part, step, t, state, NULL, NULL);
if (gmx_fio_close(fp) != 0)
{
gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
init_state(&state, 0, 0, 0, 0, 0);
- read_checkpoint_data(fp, &simulation_part, &step, &t, &state, FALSE, NULL, NULL);
+ read_checkpoint_data(fp, &simulation_part, &step, &t, &state, NULL, NULL);
fr->natoms = state.natoms;
fr->bTitle = FALSE;
&(state.dfhist.nlambda), &state.flags,
&flags_eks, &flags_enh, &flags_dfh, &state.edsamstate.nED,
&state.swapstate.eSwapCoords, out);
- ret = do_cpt_state(gmx_fio_getxdr(fp), TRUE, state.flags, &state, TRUE, out);
+ ret = do_cpt_state(gmx_fio_getxdr(fp), TRUE, state.flags, &state, out);
if (ret)
{
cp_error();
{
init_state(&state, 0, 0, 0, 0, 0);
- read_checkpoint_data(fp, simulation_part, &step, &t, &state, FALSE,
+ read_checkpoint_data(fp, simulation_part, &step, &t, &state,
&nfiles, &outputfiles);
if (gmx_fio_close(fp) != 0)
{
block_bc(cr, state->ngtc);
block_bc(cr, state->nnhpres);
block_bc(cr, state->nhchainlength);
- block_bc(cr, state->nrng);
- block_bc(cr, state->nrngi);
block_bc(cr, state->flags);
if (state->lambda == NULL)
{
case estV: nblock_abc(cr, state->natoms, state->v); break;
case estSDX: nblock_abc(cr, state->natoms, state->sd_X); break;
case estCGP: nblock_abc(cr, state->natoms, state->cg_p); break;
- case estLD_RNG: if (state->nrngi == 1)
- {
- nblock_abc(cr, state->nrng, state->ld_rng);
- }
- break;
- case estLD_RNGI: if (state->nrngi == 1)
- {
- nblock_abc(cr, state->nrngi, state->ld_rngi);
- }
- break;
case estDISRE_INITF: block_bc(cr, state->hist.disre_initf); break;
case estDISRE_RM3TAV:
block_bc(cr, state->hist.ndisrepairs);
int i;
state->natoms = natoms;
- state->nrng = 0;
state->flags = 0;
state->lambda = 0;
snew(state->lambda, efptNR);
{
state_local->lambda[i] = state_global->lambda[i];
}
- if (state_global->nrngi > 1)
- {
- /* With stochastic dynamics we need local storage for the random state */
- if (state_local->flags & (1<<estLD_RNG))
- {
- state_local->nrng = gmx_rng_n();
- snew(state_local->ld_rng, state_local->nrng);
- }
- if (state_local->flags & (1<<estLD_RNGI))
- {
- snew(state_local->ld_rngi, 1);
- }
- }
return state_local;
}
*/
void load_checkpoint(const char *fn, FILE **fplog,
t_commrec *cr, ivec dd_nc,
- t_inputrec *ir, t_state *state, gmx_bool *bReadRNG,
- gmx_bool *bReadEkin, gmx_bool bAppend, gmx_bool bForceAppend);
+ t_inputrec *ir, t_state *state,
+ gmx_bool *bReadEkin,
+ gmx_bool bAppend, gmx_bool bForceAppend);
/* Read the state from checkpoint file.
* Arrays in state that are NULL are allocated.
#define MD_DDBONDCOMM (1<<11)
#define MD_CONFOUT (1<<12)
#define MD_REPRODUCIBLE (1<<13)
-#define MD_READ_RNG (1<<14)
#define MD_APPENDFILES (1<<15)
#define MD_APPENDFILESSET (1<<21)
#define MD_KEEPANDNUMCPT (1<<16)
void init_npt_masses(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bInit);
-void init_expanded_ensemble(gmx_bool bStateFromCP, t_inputrec *ir, gmx_rng_t *mcrng, df_history_t *dfhist);
+void init_expanded_ensemble(gmx_bool bStateFromCP, t_inputrec *ir, df_history_t *dfhist);
int ExpandedEnsembleDynamics(FILE *log, t_inputrec *ir, gmx_enerdata_t *enerd,
t_state *state, t_extmass *MassQ, int fep_state, df_history_t *dfhist,
- gmx_int64_t step, gmx_rng_t mcrng,
+ gmx_int64_t step,
rvec *v, t_mdatoms *mdatoms);
void PrintFreeEnergyInfoToFile(FILE *outfile, t_lambda *fep, t_expanded *expand, t_simtemp *simtemp, df_history_t *dfhist,
int fep_state, int frequency, gmx_int64_t step);
-void get_mc_state(gmx_rng_t rng, t_state *state);
-
-void set_mc_state(gmx_rng_t rng, t_state *state);
-
/* check the version */
void check_ir_old_tpx_versions(t_commrec *cr, FILE *fplog,
t_inputrec *ir, gmx_mtop_t *mtop);
/* Allocate and initialize node-local state entries. */
-void set_state_entries(t_state *state, const t_inputrec *ir, int nnodes);
+void set_state_entries(t_state *state, const t_inputrec *ir);
-/* Broadcast the data for a simulation, and allocate node-specific settings
- such as rng generators. */
+/* Broadcast the data for a simulation, and allocate node-specific settings */
void init_parallel(t_commrec *cr, t_inputrec *inputrec,
gmx_mtop_t *mtop);
int natoms;
int ngtc;
int nnhpres;
- int nhchainlength; /* number of nose-hoover chains */
- int nrng;
- int nrngi;
+ int nhchainlength; /* number of nose-hoover chains */
int flags; /* Flags telling which entries are present */
int fep_state; /* indicates which of the alchemical states we are in */
real *lambda; /* lambda vector */
rvec *sd_X; /* random part of the x update for stoch. dyn. */
rvec *cg_p; /* p vector for conjugate gradient minimization */
- unsigned int *ld_rng; /* RNG random state */
- int *ld_rngi; /* RNG index */
-
- int nmcrng; /* number of RNG states */
- unsigned int *mc_rng; /* lambda MC RNG random state */
- int *mc_rngi; /* lambda MC RNG index */
-
history_t hist; /* Time history for restraints */
ekinstate_t ekinstate; /* The state of the kinetic energy data */
#include "mshift.h"
#include "tgroup.h"
#include "network.h"
-#include "gromacs/random/random.h"
#include "vec.h"
t_inputrec *inputrec,
t_state *state,
gmx_ekindata_t *ekind,
- gmx_update_t upd,
t_extmass *MassQ,
t_mdatoms *md
);
/* Return TRUE if OK, FALSE in case of Shake Error */
-extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_int64_t step, t_mdatoms *md, t_state *state, gmx_update_t upd, t_idef *idef, gmx_constr_t constr);
+extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_int64_t step, const t_commrec *cr, t_mdatoms *md, t_state *state, gmx_update_t upd, gmx_constr_t constr);
void update_constraints(FILE *fplog,
gmx_int64_t step,
void berendsen_tcoupl(t_inputrec *ir, gmx_ekindata_t *ekind, real dt);
-void andersen_tcoupl(t_inputrec *ir, t_mdatoms *md, t_state *state, gmx_rng_t rng, real rate, t_idef *idef, int nblocks, int *sblock, gmx_bool *randatom, int *randatom_list, gmx_bool *randomize, real *boltzfac);
+void andersen_tcoupl(t_inputrec *ir, gmx_int64_t step,
+ const t_commrec *cr, const t_mdatoms *md, t_state *state, real rate, const gmx_bool *randomize, const real *boltzfac);
void nosehoover_tcoupl(t_grpopts *opts, gmx_ekindata_t *ekind, real dt,
double xi[], double vxi[], t_extmass *MassQ);
void NBaroT_trotter(t_grpopts *opts, real dt,
double xi[], double vxi[], real *veta, t_extmass *MassQ);
-void vrescale_tcoupl(t_inputrec *ir, gmx_ekindata_t *ekind, real dt,
- double therm_integral[],
- gmx_rng_t rng);
+void vrescale_tcoupl(t_inputrec *ir, gmx_int64_t step,
+ gmx_ekindata_t *ekind, real dt,
+ double therm_integral[]);
/* Compute temperature scaling. For V-rescale it is done in update. */
real vrescale_energy(t_grpopts *opts, double therm_integral[]);
}
}
-static int poisson_variate(real lambda, gmx_rng_t rng)
+void andersen_tcoupl(t_inputrec *ir, gmx_int64_t step,
+ const t_commrec *cr, const t_mdatoms *md, t_state *state, real rate, const gmx_bool *randomize, const real *boltzfac)
{
+ const int *gatindex = (DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
+ int i;
+ int gc = 0;
- real L;
- int k = 0;
- real p = 1.0;
-
- L = exp(-lambda);
-
- do
- {
- k = k+1;
- p *= gmx_rng_uniform_real(rng);
- }
- while (p > L);
-
- return k-1;
-}
-
-void andersen_tcoupl(t_inputrec *ir, t_mdatoms *md, t_state *state, gmx_rng_t rng, real rate, t_idef *idef, int nblocks, int *sblock, gmx_bool *randatom, int *randatom_list, gmx_bool *randomize, real *boltzfac)
-{
- t_grpopts *opts;
- int i, j, k, d, len, n, ngtc, gc = 0;
- int nshake, nsettle, nrandom, nrand_group;
- real boltz, scal, reft, prand;
- t_iatom *iatoms;
-
- /* convenience variables */
- opts = &ir->opts;
- ngtc = opts->ngtc;
+ /* randomize the velocities of the selected particles */
- /* idef is only passed in if it's chance-per-particle andersen, so
- it essentially serves as a boolean to determine which type of
- andersen is being used */
- if (idef)
+ for (i = 0; i < md->homenr; i++) /* now loop over the list of atoms */
{
+ int ng = gatindex ? gatindex[i] : i;
+ gmx_bool bRandomize;
- /* randomly atoms to randomize. However, all constraint
- groups have to have either all of the atoms or none of the
- atoms randomize.
-
- Algorithm:
- 1. Select whether or not to randomize each atom to get the correct probability.
- 2. Cycle through the constraint groups.
- 2a. for each constraint group, determine the fraction f of that constraint group that are
- chosen to be randomized.
- 2b. all atoms in the constraint group are randomized with probability f.
- */
-
- nrandom = 0;
- if ((rate < 0.05) && (md->homenr > 50))
- {
- /* if the rate is relatively high, use a standard method, if low rate,
- * use poisson */
- /* poisson distributions approxmation, more efficient for
- * low rates, fewer random numbers required */
- nrandom = poisson_variate(md->homenr*rate, rng); /* how many do we randomize? Use Poisson. */
- /* now we know how many, choose them randomly. No worries about repeats, at this rate, it's negligible.
- worst thing that happens, it lowers the true rate an negligible amount */
- for (i = 0; i < nrandom; i++)
- {
- randatom[(int)(gmx_rng_uniform_real(rng)*md->homenr)] = TRUE;
- }
- }
- else
+ if (md->cTC)
{
- for (i = 0; i < md->homenr; i++)
- {
- if (gmx_rng_uniform_real(rng) < rate)
- {
- randatom[i] = TRUE;
- nrandom++;
- }
- }
+ gc = md->cTC[i]; /* assign the atom to a temperature group if there are more than one */
}
-
- /* instead of looping over the constraint groups, if we had a
- list of which atoms were in which constraint groups, we
- could then loop over only the groups that are randomized
- now. But that is not available now. Create later after
- determining whether there actually is any slowing. */
-
- /* first, loop through the settles to make sure all groups either entirely randomized, or not randomized. */
-
- nsettle = idef->il[F_SETTLE].nr/2;
- for (i = 0; i < nsettle; i++)
+ if (randomize[gc])
{
- iatoms = idef->il[F_SETTLE].iatoms;
- nrand_group = 0;
- for (k = 0; k < 3; k++) /* settles are always 3 atoms, hardcoded */
+ if (ir->etc == etcANDERSEN)
{
- if (randatom[iatoms[2*i+1]+k])
- {
- nrand_group++; /* count the number of atoms to be shaken in the settles group */
- randatom[iatoms[2*i+1]+k] = FALSE;
- nrandom--;
- }
+ bRandomize = TRUE;
}
- if (nrand_group > 0)
+ else
{
- prand = (nrand_group)/3.0; /* use this fraction to compute the probability the
- whole group is randomized */
- if (gmx_rng_uniform_real(rng) < prand)
- {
- for (k = 0; k < 3; k++)
- {
- randatom[iatoms[2*i+1]+k] = TRUE; /* mark them all to be randomized */
- }
- nrandom += 3;
- }
- }
- }
+ double uniform[2];
- /* now loop through the shake groups */
- nshake = nblocks;
- for (i = 0; i < nshake; i++)
- {
- iatoms = &(idef->il[F_CONSTR].iatoms[sblock[i]]);
- len = sblock[i+1]-sblock[i];
- nrand_group = 0;
- for (k = 0; k < len; k++)
- {
- if (k%3 != 0)
- { /* only 2/3 of the sblock items are atoms, the others are labels */
- if (randatom[iatoms[k]])
- {
- nrand_group++;
- randatom[iatoms[k]] = FALSE; /* need to mark it false here in case the atom is in more than
- one group in the shake block */
- nrandom--;
- }
- }
- }
- if (nrand_group > 0)
- {
- prand = (nrand_group)/(1.0*(2*len/3));
- if (gmx_rng_uniform_real(rng) < prand)
- {
- for (k = 0; k < len; k++)
- {
- if (k%3 != 0)
- { /* only 2/3 of the sblock items are atoms, the others are labels */
- randatom[iatoms[k]] = TRUE; /* randomize all of them */
- nrandom++;
- }
- }
- }
+ gmx_rng_cycle_2uniform(step*2, ng, ir->andersen_seed, RND_SEED_ANDERSEN, uniform);
+ bRandomize = (uniform[0] < rate);
}
- }
- if (nrandom > 0)
- {
- n = 0;
- for (i = 0; i < md->homenr; i++) /* now loop over the list of atoms */
+ if (bRandomize)
{
- if (randatom[i])
+ real scal, gauss[3];
+ int d;
+
+ scal = sqrt(boltzfac[gc]*md->invmass[i]);
+ gmx_rng_cycle_3gaussian_table(step*2+1, ng, ir->andersen_seed, RND_SEED_ANDERSEN, gauss);
+ for (d = 0; d < DIM; d++)
{
- randatom_list[n] = i;
- n++;
+ state->v[i][d] = scal*gauss[d];
}
}
- nrandom = n; /* there are some values of nrandom for which
- this algorithm won't work; for example all
- water molecules and nrandom =/= 3. Better to
- recount and use this number (which we
- calculate anyway: it will not affect
- the average number of atoms accepted.
- */
- }
- }
- else
- {
- /* if it's andersen-massive, then randomize all the atoms */
- nrandom = md->homenr;
- for (i = 0; i < nrandom; i++)
- {
- randatom_list[i] = i;
- }
- }
-
- /* randomize the velocities of the selected particles */
-
- for (i = 0; i < nrandom; i++) /* now loop over the list of atoms */
- {
- n = randatom_list[i];
- if (md->cTC)
- {
- gc = md->cTC[n]; /* assign the atom to a temperature group if there are more than one */
- }
- if (randomize[gc])
- {
- scal = sqrt(boltzfac[gc]*md->invmass[n]);
- for (d = 0; d < DIM; d++)
- {
- state->v[n][d] = scal*gmx_rng_gaussian_table(rng);
- }
}
- randatom[n] = FALSE; /* unmark this atom for randomization */
}
}
return ener_npt;
}
-static real vrescale_gamdev(int ia, gmx_rng_t rng)
+static real vrescale_gamdev(int ia,
+ gmx_int64_t step, gmx_int64_t *count,
+ gmx_int64_t seed1, gmx_int64_t seed2)
/* Gamma distribution, adapted from numerical recipes */
{
- int j;
- real am, e, s, v1, v2, x, y;
+ int j;
+ real am, e, s, v1, v2, x, y;
+ double rnd[2];
if (ia < 6)
{
x = 1.0;
for (j = 1; j <= ia; j++)
{
- x *= gmx_rng_uniform_real(rng);
+ gmx_rng_cycle_2uniform(step, (*count)++, seed1, seed2, rnd);
+ x *= rnd[0];
}
}
while (x == 0);
{
do
{
- v1 = gmx_rng_uniform_real(rng);
- v2 = 2.0*gmx_rng_uniform_real(rng)-1.0;
+ gmx_rng_cycle_2uniform(step, (*count)++, seed1, seed2, rnd);
+ v1 = rnd[0];
+ v2 = 2.0*rnd[1] - 1.0;
}
while (v1*v1 + v2*v2 > 1.0 ||
v1*v1*GMX_REAL_MAX < 3.0*ia);
}
while (x <= 0.0);
e = (1.0 + y*y)*exp(am*log(x/am) - s*y);
+
+ gmx_rng_cycle_2uniform(step, (*count)++, seed1, seed2, rnd);
}
- while (gmx_rng_uniform_real(rng) > e);
+ while (rnd[0] > e);
}
return x;
}
-static real vrescale_sumnoises(int nn, gmx_rng_t rng)
+static real gaussian_count(gmx_int64_t step, gmx_int64_t *count,
+ gmx_int64_t seed1, gmx_int64_t seed2)
+{
+ double rnd[2], x, y, r;
+
+ do
+ {
+ gmx_rng_cycle_2uniform(step, (*count)++, seed1, seed2, rnd);
+ x = 2.0*rnd[0] - 1.0;
+ y = 2.0*rnd[1] - 1.0;
+ r = x*x + y*y;
+ }
+ while (r > 1.0 || r == 0.0);
+
+ r = sqrt(-2.0*log(r)/r);
+
+ return x*r;
+}
+
+static real vrescale_sumnoises(int nn,
+ gmx_int64_t step, gmx_int64_t *count,
+ gmx_int64_t seed1, gmx_int64_t seed2)
{
/*
* Returns the sum of n independent gaussian noises squared
* (i.e. equivalent to summing the square of the return values
* of nn calls to gmx_rng_gaussian_real).xs
*/
- real rr;
+ real r, gauss;
- if (nn == 0)
- {
- return 0.0;
- }
- else if (nn == 1)
- {
- rr = gmx_rng_gaussian_real(rng);
- return rr*rr;
- }
- else if (nn % 2 == 0)
- {
- return 2.0*vrescale_gamdev(nn/2, rng);
- }
- else
+ r = 2.0*vrescale_gamdev(nn/2, step, count, seed1, seed2);
+
+ if (nn % 2 == 1)
{
- rr = gmx_rng_gaussian_real(rng);
- return 2.0*vrescale_gamdev((nn-1)/2, rng) + rr*rr;
+ gauss = gaussian_count(step, count, seed1, seed2);
+
+ r += gauss*gauss;
}
+
+ return r;
}
static real vrescale_resamplekin(real kk, real sigma, int ndeg, real taut,
- gmx_rng_t rng)
+ gmx_int64_t step, gmx_int64_t seed)
{
/*
* Generates a new value for the kinetic energy,
* ndeg: number of degrees of freedom of the atoms to be thermalized
* taut: relaxation time of the thermostat, in units of 'how often this routine is called'
*/
- real factor, rr;
+ /* rnd_count tracks the step-local state for the cycle random
+ * number generator.
+ */
+ gmx_int64_t rnd_count = 0;
+ real factor, rr, ekin_new;
if (taut > 0.1)
{
{
factor = 0.0;
}
- rr = gmx_rng_gaussian_real(rng);
- return
+
+ rr = gaussian_count(step, &rnd_count, seed, RND_SEED_VRESCALE);
+
+ ekin_new =
kk +
- (1.0 - factor)*(sigma*(vrescale_sumnoises(ndeg-1, rng) + rr*rr)/ndeg - kk) +
+ (1.0 - factor)*(sigma*(vrescale_sumnoises(ndeg-1, step, &rnd_count, seed, RND_SEED_VRESCALE) + rr*rr)/ndeg - kk) +
2.0*rr*sqrt(kk*sigma/ndeg*(1.0 - factor)*factor);
+
+ return ekin_new;
}
-void vrescale_tcoupl(t_inputrec *ir, gmx_ekindata_t *ekind, real dt,
- double therm_integral[], gmx_rng_t rng)
+void vrescale_tcoupl(t_inputrec *ir, gmx_int64_t step,
+ gmx_ekindata_t *ekind, real dt,
+ double therm_integral[])
{
t_grpopts *opts;
int i;
Ek_ref = Ek_ref1*opts->nrdf[i];
Ek_new = vrescale_resamplekin(Ek, Ek_ref, opts->nrdf[i],
- opts->tau_t[i]/dt, rng);
+ opts->tau_t[i]/dt,
+ ir->ld_seed, step);
/* Analytically Ek_new>=0, but we check for rounding errors */
if (Ek_new <= 0)
case estCGP:
dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
break;
- case estLD_RNG:
- if (state->nrngi == 1)
- {
- if (DDMASTER(dd))
- {
- for (i = 0; i < state_local->nrng; i++)
- {
- state->ld_rng[i] = state_local->ld_rng[i];
- }
- }
- }
- else
- {
- dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]),
- state_local->ld_rng, state->ld_rng);
- }
- break;
- case estLD_RNGI:
- if (state->nrngi == 1)
- {
- if (DDMASTER(dd))
- {
- state->ld_rngi[0] = state_local->ld_rngi[0];
- }
- }
- else
- {
- dd_gather(dd, sizeof(state->ld_rngi[0]),
- state_local->ld_rngi, state->ld_rngi);
- }
- break;
case estDISRE_INITF:
case estDISRE_RM3TAV:
case estORIRE_INITF:
case estCGP:
srenew(state->cg_p, state->nalloc);
break;
- case estLD_RNG:
- case estLD_RNGI:
case estDISRE_INITF:
case estDISRE_RM3TAV:
case estORIRE_INITF:
case estCGP:
dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
break;
- case estLD_RNG:
- if (state->nrngi == 1)
- {
- dd_bcastc(dd,
- state_local->nrng*sizeof(state_local->ld_rng[0]),
- state->ld_rng, state_local->ld_rng);
- }
- else
- {
- dd_scatter(dd,
- state_local->nrng*sizeof(state_local->ld_rng[0]),
- state->ld_rng, state_local->ld_rng);
- }
- break;
- case estLD_RNGI:
- if (state->nrngi == 1)
- {
- dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
- state->ld_rngi, state_local->ld_rngi);
- }
- else
- {
- dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
- state->ld_rngi, state_local->ld_rngi);
- }
- break;
case estDISRE_INITF:
case estDISRE_RM3TAV:
case estORIRE_INITF:
#include "vec.h"
#include "pbc.h"
#include "chargegroup.h"
-#include "gromacs/random/random.h"
#include "gromacs/gmxlib/topsort.h"
#include "mtop_util.h"
#include "mshift.h"
init_state(state_local, 0, buf[1], buf[2], buf[3], buf[4]);
state_local->flags = buf[0];
-
- /* With Langevin Dynamics we need to make proper storage space
- * in the global and local state for the random numbers.
- */
- if (state_local->flags & (1<<estLD_RNG))
- {
- if (DDMASTER(dd) && state_global->nrngi > 1)
- {
- state_global->nrng = dd->nnodes*gmx_rng_n();
- srenew(state_global->ld_rng, state_global->nrng);
- }
- state_local->nrng = gmx_rng_n();
- snew(state_local->ld_rng, state_local->nrng);
- }
- if (state_local->flags & (1<<estLD_RNGI))
- {
- if (DDMASTER(dd) && state_global->nrngi > 1)
- {
- state_global->nrngi = dd->nnodes;
- srenew(state_global->ld_rngi, state_global->nrngi);
- }
- snew(state_local->ld_rngi, 1);
- }
}
static void check_link(t_blocka *link, int cg_gl, int cg_gl_j)
/* Eventually should contain all the functions needed to initialize expanded ensemble
before the md loop starts */
-extern void init_expanded_ensemble(gmx_bool bStateFromCP, t_inputrec *ir, gmx_rng_t *mcrng, df_history_t *dfhist)
+extern void init_expanded_ensemble(gmx_bool bStateFromCP, t_inputrec *ir, df_history_t *dfhist)
{
- *mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
if (!bStateFromCP)
{
init_df_history_weights(dfhist, ir->expandedvals, ir->fepvals->n_lambda);
return FALSE;
}
-static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, int fep_state, real *weighted_lamee, double *p_k, gmx_rng_t rng)
+static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, int fep_state, real *weighted_lamee, double *p_k,
+ gmx_int64_t seed, gmx_int64_t step)
{
/* Choose new lambda value, and update transition matrix */
for (i = 0; i < expand->lmc_repeats; i++)
{
+ double rnd[2];
+
+ gmx_rng_cycle_2uniform(step, i, seed, RND_SEED_EXPANDED, rnd);
for (ifep = 0; ifep < nlim; ifep++)
{
accept[ifep] = 1.0;
}
/* Gibbs sampling */
- r1 = gmx_rng_uniform_real(rng);
+ r1 = rnd[0];
for (lamnew = minfep; lamnew <= maxfep; lamnew++)
{
if (r1 <= p_k[lamnew])
}
}
- r1 = gmx_rng_uniform_real(rng);
+ r1 = rnd[0];
for (lamtrial = minfep; lamtrial <= maxfep; lamtrial++)
{
pnorm = p_k[lamtrial]/remainder[fep_state];
{
tprob = trialprob;
}
- r2 = gmx_rng_uniform_real(rng);
+ r2 = rnd[1];
if (r2 < tprob)
{
lamnew = lamtrial;
else if ((expand->elmcmove == elmcmoveMETROPOLIS) || (expand->elmcmove == elmcmoveBARKER))
{
/* use the metropolis sampler with trial +/- 1 */
- r1 = gmx_rng_uniform_real(rng);
+ r1 = rnd[0];
if (r1 < 0.5)
{
if (fep_state == 0)
accept[lamtrial] = 1.0;
}
- r2 = gmx_rng_uniform_real(rng);
+ r2 = rnd[1];
if (r2 < tprob)
{
lamnew = lamtrial;
}
}
-extern void get_mc_state(gmx_rng_t rng, t_state *state)
-{
- gmx_rng_get_state(rng, state->mc_rng, state->mc_rngi);
-}
-
-extern void set_mc_state(gmx_rng_t rng, t_state *state)
-{
- gmx_rng_set_state(rng, state->mc_rng, state->mc_rngi[0]);
-}
-
extern int ExpandedEnsembleDynamics(FILE *log, t_inputrec *ir, gmx_enerdata_t *enerd,
t_state *state, t_extmass *MassQ, int fep_state, df_history_t *dfhist,
- gmx_int64_t step, gmx_rng_t mcrng,
+ gmx_int64_t step,
rvec *v, t_mdatoms *mdatoms)
/* Note that the state variable is only needed for simulated tempering, not
Hamiltonian expanded ensemble. May be able to remove it after integrator refactoring. */
}
}
- lamnew = ChooseNewLambda(nlim, expand, dfhist, fep_state, weighted_lamee, p_k, mcrng);
+ lamnew = ChooseNewLambda(nlim, expand, dfhist, fep_state, weighted_lamee, p_k,
+ ir->expandedvals->lmc_seed, step);
/* if using simulated tempering, we need to adjust the temperatures */
if (ir->bSimTemp && (lamnew != fep_state)) /* only need to change the temperatures if we change the state */
{
#include "mdrun.h"
#include "names.h"
#include "calcgrid.h"
-#include "gromacs/random/random.h"
#include "update.h"
#include "mdebin.h"
#define NOT_FINISHED(l1, l2) \
printf("not finished yet: lines %d .. %d in %s\n", l1, l2, __FILE__)
-void set_state_entries(t_state *state, const t_inputrec *ir, int nnodes)
+void set_state_entries(t_state *state, const t_inputrec *ir)
{
int nnhpres;
snew(state->cg_p, state->nalloc);
}
}
- if (EI_SD(ir->eI) || ir->eI == eiBD || ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc))
- {
- state->nrng = gmx_rng_n();
- state->nrngi = 1;
- if (EI_SD(ir->eI) || ir->eI == eiBD || ETC_ANDERSEN(ir->etc))
- {
- /* This will be correct later with DD */
- state->nrng *= nnodes;
- state->nrngi *= nnodes;
- }
- state->flags |= ((1<<estLD_RNG) | (1<<estLD_RNGI));
- snew(state->ld_rng, state->nrng);
- snew(state->ld_rngi, state->nrngi);
- }
- else
- {
- state->nrng = 0;
- }
-
- if (ir->bExpanded)
- {
- state->nmcrng = gmx_rng_n();
- snew(state->mc_rng, state->nmcrng);
- snew(state->mc_rngi, 1);
- }
state->nnhpres = 0;
if (ir->ePBC != epbcNONE)
gmx_mtop_t *mtop)
{
bcast_ir_mtop(cr, inputrec, mtop);
-
- if (inputrec->eI == eiBD || EI_SD(inputrec->eI) || ETC_ANDERSEN(inputrec->etc))
- {
- /* Make sure the random seeds are different on each node */
- inputrec->ld_seed += cr->nodeid;
- }
}
#include "main.h"
#include "force.h"
#include "macros.h"
-#include "gromacs/random/random.h"
#include "names.h"
#include "gmx_fatal.h"
#include "txtdump.h"
#include "constr.h"
#include "xvgr.h"
#include "copyrite.h"
-#include "gromacs/random/random.h"
#include "domdec.h"
#include "genborn.h"
#include "nbnxn_atomdata.h"
double embU, sum_embU, *sum_UgembU, V, V_all, VembU_all;
t_trxstatus *status;
t_trxframe rerun_fr;
- gmx_bool bDispCorr, bCharge, bRFExcl, bNotLastFrame, bStateChanged, bNS, bOurStep;
+ gmx_bool bDispCorr, bCharge, bRFExcl, bNotLastFrame, bStateChanged, bNS;
tensor force_vir, shake_vir, vir, pres;
int cg_tp, a_tp0, a_tp1, ngid, gid_tp, nener, e;
rvec *x_mol;
rvec mu_tot, x_init, dx, x_tp;
- int nnodes, frame, nsteps, step;
+ int nnodes, frame;
+ gmx_int64_t frame_step_prev, frame_step;
+ gmx_int64_t nsteps, stepblocksize = 0, step;
+ gmx_int64_t rnd_count_stride, rnd_count;
+ gmx_int64_t seed;
+ double rnd[4];
int i, start, end;
- gmx_rng_t tpi_rand;
FILE *fp_tpi = NULL;
char *ptr, *dump_pdb, **leg, str[STRLEN], str2[STRLEN];
double dbl, dump_ener;
a_tp1-a_tp0, bCharge ? "with" : "without");
fprintf(fplog, "\nWill insert %d times in each frame of %s\n",
- nsteps, opt2fn("-rerun", nfile, fnm));
+ (int)nsteps, opt2fn("-rerun", nfile, fnm));
}
if (!bCavity)
}
snew(sum_UgembU, nener);
- /* Initialize random generator */
- tpi_rand = gmx_rng_init(inputrec->ld_seed);
+ /* Copy the random seed set by the user */
+ seed = inputrec->ld_seed;
+ /* We use the frame step number as one random counter.
+ * The second counter use the insertion (step) count. But we
+ * need multiple random numbers per insertion. This number is
+ * not fixed, since we generate random locations in a sphere
+ * by putting locations in a cube and some of these fail.
+ * A count of 20 is already extremely unlikely, so 10000 is
+ * a safe margin for random numbers per insertion.
+ */
+ rnd_count_stride = 10000;
if (MASTER(cr))
{
nbin = 10;
snew(bin, nbin);
+ /* Avoid frame step numbers <= -1 */
+ frame_step_prev = -1;
+
bNotLastFrame = read_first_frame(oenv, &status, opt2fn("-rerun", nfile, fnm),
&rerun_fr, TRX_NEED_X);
frame = 0;
refvolshift = log(det(rerun_fr.box));
+ switch (inputrec->eI)
+ {
+ case eiTPI:
+ stepblocksize = inputrec->nstlist;
+ break;
+ case eiTPIC:
+ stepblocksize = 1;
+ break;
+ default:
+ gmx_fatal(FARGS, "Unknown integrator %s", ei_names[inputrec->eI]);
+ }
+
#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
/* Make sure we don't detect SSE overflow generated before this point */
gmx_mm_check_and_reset_overflow();
while (bNotLastFrame)
{
+ frame_step = rerun_fr.step;
+ if (frame_step <= frame_step_prev)
+ {
+ /* We don't have step number in the trajectory file,
+ * or we have constant or decreasing step numbers.
+ * Ensure we have increasing step numbers, since we use
+ * the step numbers as a counter for random numbers.
+ */
+ frame_step = frame_step_prev + 1;
+ }
+ frame_step_prev = frame_step;
+
lambda = rerun_fr.lambda;
t = rerun_fr.time;
bStateChanged = TRUE;
bNS = TRUE;
- for (step = 0; step < nsteps; step++)
+
+ step = cr->nodeid*stepblocksize;
+ while (step < nsteps)
{
- /* In parallel all nodes generate all random configurations.
- * In that way the result is identical to a single cpu tpi run.
+ /* Initialize the second counter for random numbers using
+ * the insertion step index. This ensures that we get
+ * the same random numbers independently of how many
+ * MPI ranks we use. Also for the same seed, we get
+ * the same initial random sequence for different nsteps.
*/
+ rnd_count = step*rnd_count_stride;
+
if (!bCavity)
{
/* Random insertion in the whole volume */
if (bNS)
{
/* Generate a random position in the box */
- x_init[XX] = gmx_rng_uniform_real(tpi_rand)*state->box[XX][XX];
- x_init[YY] = gmx_rng_uniform_real(tpi_rand)*state->box[YY][YY];
- x_init[ZZ] = gmx_rng_uniform_real(tpi_rand)*state->box[ZZ][ZZ];
+ gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd);
+ gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2);
+ for (d = 0; d < DIM; d++)
+ {
+ x_init[d] = rnd[d]*state->box[d][d];
+ }
}
if (inputrec->nstlist == 1)
{
/* Generate coordinates within |dx|=drmax of x_init */
do
{
- dx[XX] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax;
- dx[YY] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax;
- dx[ZZ] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax;
+ gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd);
+ gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2);
+ for (d = 0; d < DIM; d++)
+ {
+ dx[d] = (2*rnd[d] - 1)*drmax;
+ }
}
while (norm2(dx) > drmax*drmax);
rvec_add(x_init, dx, x_tp);
/* Generate coordinates within |dx|=drmax of x_init */
do
{
- dx[XX] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax;
- dx[YY] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax;
- dx[ZZ] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax;
+ gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd);
+ gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2);
+ for (d = 0; d < DIM; d++)
+ {
+ dx[d] = (2*rnd[d] - 1)*drmax;
+ }
}
while (norm2(dx) > drmax*drmax);
rvec_add(x_init, dx, x_tp);
copy_rvec(x_mol[i-a_tp0], state->x[i]);
}
/* Rotate the molecule randomly */
+ gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd);
+ gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2);
rotate_conf(a_tp1-a_tp0, state->x+a_tp0, NULL,
- 2*M_PI*gmx_rng_uniform_real(tpi_rand),
- 2*M_PI*gmx_rng_uniform_real(tpi_rand),
- 2*M_PI*gmx_rng_uniform_real(tpi_rand));
+ 2*M_PI*rnd[0],
+ 2*M_PI*rnd[1],
+ 2*M_PI*rnd[2]);
/* Shift to the insertion location */
for (i = a_tp0; i < a_tp1; i++)
{
}
}
- /* Check if this insertion belongs to this node */
- bOurStep = TRUE;
- if (PAR(cr))
+ /* Clear some matrix variables */
+ clear_mat(force_vir);
+ clear_mat(shake_vir);
+ clear_mat(vir);
+ clear_mat(pres);
+
+ /* Set the charge group center of mass of the test particle */
+ copy_rvec(x_init, fr->cg_cm[top->cgs.nr-1]);
+
+ /* Calc energy (no forces) on new positions.
+ * Since we only need the intermolecular energy
+ * and the RF exclusion terms of the inserted molecule occur
+ * within a single charge group we can pass NULL for the graph.
+ * This also avoids shifts that would move charge groups
+ * out of the box.
+ *
+ * Some checks above ensure than we can not have
+ * twin-range interactions together with nstlist > 1,
+ * therefore we do not need to remember the LR energies.
+ */
+ /* Make do_force do a single node force calculation */
+ cr->nnodes = 1;
+ do_force(fplog, cr, inputrec,
+ step, nrnb, wcycle, top, &top_global->groups,
+ state->box, state->x, &state->hist,
+ f, force_vir, mdatoms, enerd, fcd,
+ state->lambda,
+ NULL, fr, NULL, mu_tot, t, NULL, NULL, FALSE,
+ GMX_FORCE_NONBONDED | GMX_FORCE_ENERGY |
+ (bNS ? GMX_FORCE_DYNAMICBOX | GMX_FORCE_NS | GMX_FORCE_DO_LR : 0) |
+ (bStateChanged ? GMX_FORCE_STATECHANGED : 0));
+ cr->nnodes = nnodes;
+ bStateChanged = FALSE;
+ bNS = FALSE;
+
+ /* Calculate long range corrections to pressure and energy */
+ calc_dispcorr(fplog, inputrec, fr, step, top_global->natoms, state->box,
+ lambda, pres, vir, &prescorr, &enercorr, &dvdlcorr);
+ /* figure out how to rearrange the next 4 lines MRS 8/4/2009 */
+ enerd->term[F_DISPCORR] = enercorr;
+ enerd->term[F_EPOT] += enercorr;
+ enerd->term[F_PRES] += prescorr;
+ enerd->term[F_DVDL_VDW] += dvdlcorr;
+
+ epot = enerd->term[F_EPOT];
+ bEnergyOutOfBounds = FALSE;
+#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
+ /* With SSE the energy can overflow, check for this */
+ if (gmx_mm_check_and_reset_overflow())
{
- switch (inputrec->eI)
+ if (debug)
{
- case eiTPI:
- bOurStep = ((step / inputrec->nstlist) % nnodes == cr->nodeid);
- break;
- case eiTPIC:
- bOurStep = (step % nnodes == cr->nodeid);
- break;
- default:
- gmx_fatal(FARGS, "Unknown integrator %s", ei_names[inputrec->eI]);
+ fprintf(debug, "Found an SSE overflow, assuming the energy is out of bounds\n");
}
+ bEnergyOutOfBounds = TRUE;
}
- if (bOurStep)
- {
- /* Clear some matrix variables */
- clear_mat(force_vir);
- clear_mat(shake_vir);
- clear_mat(vir);
- clear_mat(pres);
-
- /* Set the charge group center of mass of the test particle */
- copy_rvec(x_init, fr->cg_cm[top->cgs.nr-1]);
-
- /* Calc energy (no forces) on new positions.
- * Since we only need the intermolecular energy
- * and the RF exclusion terms of the inserted molecule occur
- * within a single charge group we can pass NULL for the graph.
- * This also avoids shifts that would move charge groups
- * out of the box.
- *
- * Some checks above ensure than we can not have
- * twin-range interactions together with nstlist > 1,
- * therefore we do not need to remember the LR energies.
- */
- /* Make do_force do a single node force calculation */
- cr->nnodes = 1;
- do_force(fplog, cr, inputrec,
- step, nrnb, wcycle, top, &top_global->groups,
- state->box, state->x, &state->hist,
- f, force_vir, mdatoms, enerd, fcd,
- state->lambda,
- NULL, fr, NULL, mu_tot, t, NULL, NULL, FALSE,
- GMX_FORCE_NONBONDED | GMX_FORCE_ENERGY |
- (bNS ? GMX_FORCE_DYNAMICBOX | GMX_FORCE_NS | GMX_FORCE_DO_LR : 0) |
- (bStateChanged ? GMX_FORCE_STATECHANGED : 0));
- cr->nnodes = nnodes;
- bStateChanged = FALSE;
- bNS = FALSE;
-
- /* Calculate long range corrections to pressure and energy */
- calc_dispcorr(fplog, inputrec, fr, step, top_global->natoms, state->box,
- lambda, pres, vir, &prescorr, &enercorr, &dvdlcorr);
- /* figure out how to rearrange the next 4 lines MRS 8/4/2009 */
- enerd->term[F_DISPCORR] = enercorr;
- enerd->term[F_EPOT] += enercorr;
- enerd->term[F_PRES] += prescorr;
- enerd->term[F_DVDL_VDW] += dvdlcorr;
-
- epot = enerd->term[F_EPOT];
- bEnergyOutOfBounds = FALSE;
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
- /* With SSE the energy can overflow, check for this */
- if (gmx_mm_check_and_reset_overflow())
- {
- if (debug)
- {
- fprintf(debug, "Found an SSE overflow, assuming the energy is out of bounds\n");
- }
- bEnergyOutOfBounds = TRUE;
- }
#endif
- /* If the compiler doesn't optimize this check away
- * we catch the NAN energies.
- * The epot>GMX_REAL_MAX check catches inf values,
- * which should nicely result in embU=0 through the exp below,
- * but it does not hurt to check anyhow.
- */
- /* Non-bonded Interaction usually diverge at r=0.
- * With tabulated interaction functions the first few entries
- * should be capped in a consistent fashion between
- * repulsion, dispersion and Coulomb to avoid accidental
- * negative values in the total energy.
- * The table generation code in tables.c does this.
- * With user tbales the user should take care of this.
- */
- if (epot != epot || epot > GMX_REAL_MAX)
+ /* If the compiler doesn't optimize this check away
+ * we catch the NAN energies.
+ * The epot>GMX_REAL_MAX check catches inf values,
+ * which should nicely result in embU=0 through the exp below,
+ * but it does not hurt to check anyhow.
+ */
+ /* Non-bonded Interaction usually diverge at r=0.
+ * With tabulated interaction functions the first few entries
+ * should be capped in a consistent fashion between
+ * repulsion, dispersion and Coulomb to avoid accidental
+ * negative values in the total energy.
+ * The table generation code in tables.c does this.
+ * With user tbales the user should take care of this.
+ */
+ if (epot != epot || epot > GMX_REAL_MAX)
+ {
+ bEnergyOutOfBounds = TRUE;
+ }
+ if (bEnergyOutOfBounds)
+ {
+ if (debug)
{
- bEnergyOutOfBounds = TRUE;
+ fprintf(debug, "\n time %.3f, step %d: non-finite energy %f, using exp(-bU)=0\n", t, (int)step, epot);
}
- if (bEnergyOutOfBounds)
+ embU = 0;
+ }
+ else
+ {
+ embU = exp(-beta*epot);
+ sum_embU += embU;
+ /* Determine the weighted energy contributions of each energy group */
+ e = 0;
+ sum_UgembU[e++] += epot*embU;
+ if (fr->bBHAM)
{
- if (debug)
+ for (i = 0; i < ngid; i++)
{
- fprintf(debug, "\n time %.3f, step %d: non-finite energy %f, using exp(-bU)=0\n", t, step, epot);
+ sum_UgembU[e++] +=
+ (enerd->grpp.ener[egBHAMSR][GID(i, gid_tp, ngid)] +
+ enerd->grpp.ener[egBHAMLR][GID(i, gid_tp, ngid)])*embU;
}
- embU = 0;
}
else
{
- embU = exp(-beta*epot);
- sum_embU += embU;
- /* Determine the weighted energy contributions of each energy group */
- e = 0;
- sum_UgembU[e++] += epot*embU;
- if (fr->bBHAM)
- {
- for (i = 0; i < ngid; i++)
- {
- sum_UgembU[e++] +=
- (enerd->grpp.ener[egBHAMSR][GID(i, gid_tp, ngid)] +
- enerd->grpp.ener[egBHAMLR][GID(i, gid_tp, ngid)])*embU;
- }
- }
- else
- {
- for (i = 0; i < ngid; i++)
- {
- sum_UgembU[e++] +=
- (enerd->grpp.ener[egLJSR][GID(i, gid_tp, ngid)] +
- enerd->grpp.ener[egLJLR][GID(i, gid_tp, ngid)])*embU;
- }
- }
- if (bDispCorr)
+ for (i = 0; i < ngid; i++)
{
- sum_UgembU[e++] += enerd->term[F_DISPCORR]*embU;
- }
- if (bCharge)
- {
- for (i = 0; i < ngid; i++)
- {
- sum_UgembU[e++] +=
- (enerd->grpp.ener[egCOULSR][GID(i, gid_tp, ngid)] +
- enerd->grpp.ener[egCOULLR][GID(i, gid_tp, ngid)])*embU;
- }
- if (bRFExcl)
- {
- sum_UgembU[e++] += enerd->term[F_RF_EXCL]*embU;
- }
- if (EEL_FULL(fr->eeltype))
- {
- sum_UgembU[e++] += enerd->term[F_COUL_RECIP]*embU;
- }
+ sum_UgembU[e++] +=
+ (enerd->grpp.ener[egLJSR][GID(i, gid_tp, ngid)] +
+ enerd->grpp.ener[egLJLR][GID(i, gid_tp, ngid)])*embU;
}
}
-
- if (embU == 0 || beta*epot > bU_bin_limit)
+ if (bDispCorr)
{
- bin[0]++;
+ sum_UgembU[e++] += enerd->term[F_DISPCORR]*embU;
}
- else
+ if (bCharge)
{
- i = (int)((bU_logV_bin_limit
- - (beta*epot - logV + refvolshift))*invbinw
- + 0.5);
- if (i < 0)
+ for (i = 0; i < ngid; i++)
+ {
+ sum_UgembU[e++] +=
+ (enerd->grpp.ener[egCOULSR][GID(i, gid_tp, ngid)] +
+ enerd->grpp.ener[egCOULLR][GID(i, gid_tp, ngid)])*embU;
+ }
+ if (bRFExcl)
{
- i = 0;
+ sum_UgembU[e++] += enerd->term[F_RF_EXCL]*embU;
}
- if (i >= nbin)
+ if (EEL_FULL(fr->eeltype))
{
- realloc_bins(&bin, &nbin, i+10);
+ sum_UgembU[e++] += enerd->term[F_COUL_RECIP]*embU;
}
- bin[i]++;
}
+ }
- if (debug)
+ if (embU == 0 || beta*epot > bU_bin_limit)
+ {
+ bin[0]++;
+ }
+ else
+ {
+ i = (int)((bU_logV_bin_limit
+ - (beta*epot - logV + refvolshift))*invbinw
+ + 0.5);
+ if (i < 0)
{
- fprintf(debug, "TPI %7d %12.5e %12.5f %12.5f %12.5f\n",
- step, epot, x_tp[XX], x_tp[YY], x_tp[ZZ]);
+ i = 0;
}
-
- if (dump_pdb && epot <= dump_ener)
+ if (i >= nbin)
{
- sprintf(str, "t%g_step%d.pdb", t, step);
- sprintf(str2, "t: %f step %d ener: %f", t, step, epot);
- write_sto_conf_mtop(str, str2, top_global, state->x, state->v,
- inputrec->ePBC, state->box);
+ realloc_bins(&bin, &nbin, i+10);
}
+ bin[i]++;
+ }
+
+ if (debug)
+ {
+ fprintf(debug, "TPI %7d %12.5e %12.5f %12.5f %12.5f\n",
+ (int)step, epot, x_tp[XX], x_tp[YY], x_tp[ZZ]);
+ }
+
+ if (dump_pdb && epot <= dump_ener)
+ {
+ sprintf(str, "t%g_step%d.pdb", t, (int)step);
+ sprintf(str2, "t: %f step %d ener: %f", t, (int)step, epot);
+ write_sto_conf_mtop(str, str2, top_global, state->x, state->v,
+ inputrec->ePBC, state->box);
+ }
+
+ step++;
+ if ((step/stepblocksize) % cr->nnodes != cr->nodeid)
+ {
+ /* Skip all steps assigned to the other MPI ranks */
+ step += (cr->nnodes - 1)*stepblocksize;
}
}
} gmx_sd_sigma_t;
typedef struct {
- /* The random state for ngaussrand threads.
- * Normal thermostats need just 1 random number generator,
- * but SD and BD with OpenMP parallelization need 1 for each thread.
- */
- int ngaussrand;
- gmx_rng_t *gaussrand;
/* BD stuff */
real *bd_rf;
/* SD stuff */
rvec *xp;
int xp_nalloc;
- /* variable size arrays for andersen */
- gmx_bool *randatom;
- int *randatom_list;
- gmx_bool randatom_list_init;
-
/* Variables for the deform algorithm */
gmx_int64_t deformref_step;
matrix deformref_box;
}
}
-/* Allocates and initializes sd->gaussrand[i] for i=1, i<sd->ngaussrand,
- * Using seeds generated from sd->gaussrand[0].
- */
-static void init_multiple_gaussrand(gmx_stochd_t *sd)
-{
- int ngr, i;
- unsigned int *seed;
-
- ngr = sd->ngaussrand;
- snew(seed, ngr);
-
- for (i = 1; i < ngr; i++)
- {
- seed[i] = gmx_rng_uniform_uint32(sd->gaussrand[0]);
- }
-
- if (ngr != gmx_omp_nthreads_get(emntUpdate))
- {
- gmx_incons("The number of Gaussian number generators should be equal to gmx_omp_nthreads_get(emntUpdate)");
- }
-
-#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate))
- {
- int th;
-
- th = gmx_omp_get_thread_num();
- if (th > 0)
- {
- /* Initialize on each thread to get memory allocated thread-local */
- sd->gaussrand[th] = gmx_rng_init(seed[th]);
- }
- }
-
- sfree(seed);
-}
-
-static gmx_stochd_t *init_stochd(t_inputrec *ir, int nthreads)
+static gmx_stochd_t *init_stochd(t_inputrec *ir)
{
gmx_stochd_t *sd;
gmx_sd_const_t *sdc;
snew(sd, 1);
- /* Initiate random number generator for langevin type dynamics,
- * for BD, SD or velocity rescaling temperature coupling.
- */
- if (ir->eI == eiBD || EI_SD(ir->eI))
- {
- sd->ngaussrand = nthreads;
- }
- else
- {
- sd->ngaussrand = 1;
- }
- snew(sd->gaussrand, sd->ngaussrand);
-
- /* Initialize the first random generator */
- sd->gaussrand[0] = gmx_rng_init(ir->ld_seed);
-
- if (sd->ngaussrand > 1)
- {
- /* Initialize the rest of the random number generators,
- * using the first one to generate seeds.
- */
- init_multiple_gaussrand(sd);
- }
-
ngtc = ir->opts.ngtc;
if (ir->eI == eiBD)
return sd;
}
-void get_stochd_state(gmx_update_t upd, t_state *state)
-{
- /* Note that we only get the state of the first random generator,
- * even if there are multiple. This avoids repetition.
- */
- gmx_rng_get_state(upd->sd->gaussrand[0], state->ld_rng, state->ld_rngi);
-}
-
-void set_stochd_state(gmx_update_t upd, t_state *state)
-{
- gmx_stochd_t *sd;
- int i;
-
- sd = upd->sd;
-
- gmx_rng_set_state(sd->gaussrand[0], state->ld_rng, state->ld_rngi[0]);
-
- if (sd->ngaussrand > 1)
- {
- /* We only end up here with SD or BD with OpenMP.
- * Destroy and reinitialize the rest of the random number generators,
- * using seeds generated from the first one.
- * Although this doesn't recover the previous state,
- * it at least avoids repetition, which is most important.
- * Exaclty restoring states with all MPI+OpenMP setups is difficult
- * and as the integrator is random to start with, doesn't gain us much.
- */
- for (i = 1; i < sd->ngaussrand; i++)
- {
- gmx_rng_destroy(sd->gaussrand[i]);
- }
-
- init_multiple_gaussrand(sd);
- }
-}
-
gmx_update_t init_update(t_inputrec *ir)
{
t_gmx_update *upd;
if (ir->eI == eiBD || EI_SD(ir->eI) || ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc))
{
- upd->sd = init_stochd(ir, gmx_omp_nthreads_get(emntUpdate));
+ upd->sd = init_stochd(ir);
}
- upd->xp = NULL;
- upd->xp_nalloc = 0;
- upd->randatom = NULL;
- upd->randatom_list = NULL;
- upd->randatom_list_init = FALSE; /* we have not yet cleared the data structure at this point */
+ upd->xp = NULL;
+ upd->xp_nalloc = 0;
return upd;
}
static void do_update_sd1(gmx_stochd_t *sd,
- gmx_rng_t gaussrand,
int start, int nrend, double dt,
rvec accel[], ivec nFreeze[],
real invmass[], unsigned short ptype[],
unsigned short cFREEZE[], unsigned short cACC[],
unsigned short cTC[],
rvec x[], rvec xprime[], rvec v[], rvec f[],
- int ngtc, real tau_t[], real ref_t[])
+ int ngtc, real tau_t[], real ref_t[],
+ gmx_int64_t step, int seed, int* gatindex)
{
gmx_sd_const_t *sdc;
gmx_sd_sigma_t *sig;
for (n = start; n < nrend; n++)
{
+ real rnd[3];
+ int ng = gatindex ? gatindex[n] : n;
ism = sqrt(invmass[n]);
if (cFREEZE)
{
gt = cTC[n];
}
+ gmx_rng_cycle_3gaussian_table(step, ng, seed, RND_SEED_UPDATE, rnd);
for (d = 0; d < DIM; d++)
{
if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
{
- sd_V = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand);
+ sd_V = ism*sig[gt].V*rnd[d];
v[n][d] = v[n][d]*sdc[gt].em
+ (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em)
}
static void do_update_sd2(gmx_stochd_t *sd,
- gmx_rng_t gaussrand,
gmx_bool bInitStep,
int start, int nrend,
rvec accel[], ivec nFreeze[],
rvec x[], rvec xprime[], rvec v[], rvec f[],
rvec sd_X[],
const real tau_t[],
- gmx_bool bFirstHalf)
+ gmx_bool bFirstHalf, gmx_int64_t step, int seed,
+ int* gatindex)
{
gmx_sd_const_t *sdc;
gmx_sd_sigma_t *sig;
int gf = 0, ga = 0, gt = 0;
real vn = 0, Vmh, Xmh;
real ism;
- int n, d;
+ int n, d, ng;
sdc = sd->sdc;
sig = sd->sdsig;
for (n = start; n < nrend; n++)
{
+ real rnd[6], rndi[3];
+ ng = gatindex ? gatindex[n] : n;
ism = sqrt(invmass[n]);
if (cFREEZE)
{
gt = cTC[n];
}
+ gmx_rng_cycle_6gaussian_table(step*2+(bFirstHalf ? 1 : 2), ng, seed, RND_SEED_UPDATE, rnd);
+ if (bInitStep)
+ {
+ gmx_rng_cycle_3gaussian_table(step*2, ng, seed, RND_SEED_UPDATE, rndi);
+ }
for (d = 0; d < DIM; d++)
{
if (bFirstHalf)
{
if (bInitStep)
{
- sd_X[n][d] = ism*sig[gt].X*gmx_rng_gaussian_table(gaussrand);
+ sd_X[n][d] = ism*sig[gt].X*rndi[d];
}
Vmh = sd_X[n][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c)
- + ism*sig[gt].Yv*gmx_rng_gaussian_table(gaussrand);
- sd_V[n][d] = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand);
+ + ism*sig[gt].Yv*rnd[d*2];
+ sd_V[n][d] = ism*sig[gt].V*rnd[d*2+1];
v[n][d] = vn*sdc[gt].em
+ (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em)
}
else
{
-
/* Correct the velocities for the constraints.
* This operation introduces some inaccuracy,
* since the velocity is determined from differences in coordinates.
(xprime[n][d] - x[n][d])/(tau_t[gt]*(sdc[gt].eph - sdc[gt].emh));
Xmh = sd_V[n][d]*tau_t[gt]*sdc[gt].d/(sdc[gt].em-1)
- + ism*sig[gt].Yx*gmx_rng_gaussian_table(gaussrand);
- sd_X[n][d] = ism*sig[gt].X*gmx_rng_gaussian_table(gaussrand);
+ + ism*sig[gt].Yx*rnd[d*2];
+ sd_X[n][d] = ism*sig[gt].X*rnd[d*2+1];
xprime[n][d] += sd_X[n][d] - Xmh;
unsigned short cFREEZE[], unsigned short cTC[],
rvec x[], rvec xprime[], rvec v[],
rvec f[], real friction_coefficient,
- real *rf, gmx_rng_t gaussrand)
+ real *rf, gmx_int64_t step, int seed,
+ int* gatindex)
{
/* note -- these appear to be full step velocities . . . */
int gf = 0, gt = 0;
for (n = start; (n < nrend); n++)
{
+ real rnd[3];
+ int ng = gatindex ? gatindex[n] : n;
+
if (cFREEZE)
{
gf = cFREEZE[n];
{
gt = cTC[n];
}
+ gmx_rng_cycle_3gaussian_table(step, ng, seed, RND_SEED_UPDATE, rnd);
for (d = 0; (d < DIM); d++)
{
if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
{
if (friction_coefficient != 0)
{
- vn = invfr*f[n][d] + rf[gt]*gmx_rng_gaussian_table(gaussrand);
+ vn = invfr*f[n][d] + rf[gt]*rnd[d];
}
else
{
/* NOTE: invmass = 2/(mass*friction_constant*dt) */
vn = 0.5*invmass[n]*f[n][d]*dt
- + sqrt(0.5*invmass[n])*rf[gt]*gmx_rng_gaussian_table(gaussrand);
+ + sqrt(0.5*invmass[n])*rf[gt]*rnd[d];
}
v[n][d] = vn;
t_inputrec *inputrec,
t_state *state,
gmx_ekindata_t *ekind,
- gmx_update_t upd,
t_extmass *MassQ,
t_mdatoms *md)
state->nosehoover_xi, state->nosehoover_vxi, MassQ);
break;
case etcVRESCALE:
- vrescale_tcoupl(inputrec, ekind, dttc,
- state->therm_integral, upd->sd->gaussrand[0]);
+ vrescale_tcoupl(inputrec, step, ekind, dttc,
+ state->therm_integral);
break;
}
/* rescale in place here */
end_th = start + ((nrend-start)*(th+1))/nth;
/* The second part of the SD integration */
- do_update_sd2(upd->sd, upd->sd->gaussrand[th],
+ do_update_sd2(upd->sd,
FALSE, start_th, end_th,
inputrec->opts.acc, inputrec->opts.nFreeze,
md->invmass, md->ptype,
md->cFREEZE, md->cACC, md->cTC,
state->x, xprime, state->v, force, state->sd_X,
inputrec->opts.tau_t,
- FALSE);
+ FALSE, step, inputrec->ld_seed,
+ DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
}
inc_nrnb(nrnb, eNR_UPDATE, homenr);
}
break;
case (eiSD1):
- do_update_sd1(upd->sd, upd->sd->gaussrand[th],
+ do_update_sd1(upd->sd,
start_th, end_th, dt,
inputrec->opts.acc, inputrec->opts.nFreeze,
md->invmass, md->ptype,
md->cFREEZE, md->cACC, md->cTC,
state->x, xprime, state->v, force,
- inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t);
+ inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t,
+ step, inputrec->ld_seed, DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
break;
case (eiSD2):
/* The SD update is done in 2 parts, because an extra constraint step
* is needed
*/
- do_update_sd2(upd->sd, upd->sd->gaussrand[th],
+ do_update_sd2(upd->sd,
bInitStep, start_th, end_th,
inputrec->opts.acc, inputrec->opts.nFreeze,
md->invmass, md->ptype,
md->cFREEZE, md->cACC, md->cTC,
state->x, xprime, state->v, force, state->sd_X,
inputrec->opts.tau_t,
- TRUE);
+ TRUE, step, inputrec->ld_seed,
+ DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
break;
case (eiBD):
do_update_bd(start_th, end_th, dt,
md->cFREEZE, md->cTC,
state->x, xprime, state->v, force,
inputrec->bd_fric,
- upd->sd->bd_rf, upd->sd->gaussrand[th]);
+ upd->sd->bd_rf,
+ step, inputrec->ld_seed, DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
break;
case (eiVV):
case (eiVVAK):
mv[XX], mv[YY], mv[ZZ]);
}
-extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_int64_t step, t_mdatoms *md, t_state *state, gmx_update_t upd, t_idef *idef, gmx_constr_t constr)
+extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_int64_t step, const t_commrec *cr,
+ t_mdatoms *md, t_state *state, gmx_update_t upd, gmx_constr_t constr)
{
int i;
real rate = (ir->delta_t)/ir->opts.tau_t[0];
+
+ if (ir->etc == etcANDERSEN && constr != NULL)
+ {
+ gmx_fatal(FARGS, "Normal Andersen is currently not supported with constraints, use massive Andersen instead");
+ }
+
/* proceed with andersen if 1) it's fixed probability per
particle andersen or 2) it's massive andersen and it's tau_t/dt */
if ((ir->etc == etcANDERSEN) || do_per_step(step, (int)(1.0/rate)))
{
- srenew(upd->randatom, state->nalloc);
- srenew(upd->randatom_list, state->nalloc);
- if (upd->randatom_list_init == FALSE)
- {
- for (i = 0; i < state->nalloc; i++)
- {
- upd->randatom[i] = FALSE;
- upd->randatom_list[i] = 0;
- }
- upd->randatom_list_init = TRUE;
- }
- andersen_tcoupl(ir, md, state, upd->sd->gaussrand[0], rate,
- (ir->etc == etcANDERSEN) ? idef : NULL,
- constr ? get_nblocks(constr) : 0,
- constr ? get_sblock(constr) : NULL,
- upd->randatom, upd->randatom_list,
+ andersen_tcoupl(ir, step, cr, md, state, rate,
upd->sd->randomize_group, upd->sd->boltzfac);
return TRUE;
}
#include <process.h>
#endif
+#include "external/Random123-1.08/include/Random123/threefry.h"
+
#include "gromacs/math/utilities.h"
#include "random_gausstable.h"
#define RNG_UPPER_MASK 0x80000000UL /* most significant w-r bits */
#define RNG_LOWER_MASK 0x7fffffffUL /* least significant r bits */
-/* Note that if you change the size of the Gaussian table you will also
- * have to generate new initialization data for the table in
+/* Note that if you change the size of the Gaussian table you will
+ * also have to generate new initialization data for the table in
* gmx_random_gausstable.h
*
* A routine print_gaussian_table() is in contrib/random.c
* for convenience - use it if you need a different size of the table.
*/
#define GAUSS_TABLE 14 /* the size of the gauss table is 2^GAUSS_TABLE */
-#define GAUSS_SHIFT (32 - GAUSS_TABLE)
+#define GAUSS_MASK ((1 << GAUSS_TABLE) - 1)
struct gmx_rng {
* we are limited to an accuracy of 1e-7.
*/
}
-
-
-
real
+
gmx_rng_gaussian_table(gmx_rng_t rng)
{
unsigned int i;
i = gmx_rng_uniform_uint32(rng);
/* The Gaussian table is a static constant in this file */
- return gaussian_table[i >> GAUSS_SHIFT];
+ return gaussian_table[i >> (32 - GAUSS_TABLE)];
+}
+
+void
+gmx_rng_cycle_2uniform(gmx_int64_t ctr1, gmx_int64_t ctr2,
+ gmx_int64_t key1, gmx_int64_t key2,
+ double* rnd)
+{
+ const gmx_int64_t mask_53bits = 0x1FFFFFFFFFFFFF;
+ const double two_power_min53 = 1.0/9007199254740992.0;
+
+ threefry2x64_ctr_t ctr = {{ctr1, ctr2}};
+ threefry2x64_key_t key = {{key1, key2}};
+ threefry2x64_ctr_t rand = threefry2x64(ctr, key);
+
+ rnd[0] = (rand.v[0] & mask_53bits)*two_power_min53;
+ rnd[1] = (rand.v[1] & mask_53bits)*two_power_min53;
+}
+
+void
+gmx_rng_cycle_3gaussian_table(gmx_int64_t ctr1, gmx_int64_t ctr2,
+ gmx_int64_t key1, gmx_int64_t key2,
+ real* rnd)
+{
+ threefry2x64_ctr_t ctr = {{ctr1, ctr2}};
+ threefry2x64_key_t key = {{key1, key2}};
+ threefry2x64_ctr_t rand = threefry2x64(ctr, key);
+
+ rnd[0] = gaussian_table[(rand.v[0] >> 48) & GAUSS_MASK];
+ rnd[1] = gaussian_table[(rand.v[0] >> 32) & GAUSS_MASK];
+ rnd[2] = gaussian_table[(rand.v[0] >> 16) & GAUSS_MASK];
+}
+
+void
+gmx_rng_cycle_6gaussian_table(gmx_int64_t ctr1, gmx_int64_t ctr2,
+ gmx_int64_t key1, gmx_int64_t key2,
+ real* rnd)
+{
+ threefry2x64_ctr_t ctr = {{ctr1, ctr2}};
+ threefry2x64_key_t key = {{key1, key2}};
+ threefry2x64_ctr_t rand = threefry2x64(ctr, key);
+
+ rnd[0] = gaussian_table[(rand.v[0] >> 48) & GAUSS_MASK];
+ rnd[1] = gaussian_table[(rand.v[0] >> 32) & GAUSS_MASK];
+ rnd[2] = gaussian_table[(rand.v[0] >> 16) & GAUSS_MASK];
+ rnd[3] = gaussian_table[(rand.v[1] >> 48) & GAUSS_MASK];
+ rnd[4] = gaussian_table[(rand.v[1] >> 32) & GAUSS_MASK];
+ rnd[5] = gaussian_table[(rand.v[1] >> 16) & GAUSS_MASK];
}
extern "C" {
#endif
+/*! Fixed random number seeds for different types of RNG */
+#define RND_SEED_UPDATE 1 /*!< For coordinate update (sd, bd, ..) */
+#define RND_SEED_REPLEX 2 /*!< For replica exchange */
+#define RND_SEED_VRESCALE 3 /*!< For V-rescale thermostat */
+#define RND_SEED_ANDERSEN 4 /*!< For Andersen thermostat */
+#define RND_SEED_TPI 5 /*!< For test particle insertion */
+#define RND_SEED_EXPANDED 6 /*!< For expanded emseble methods */
+
/*! \brief Abstract datatype for a random number generator
*
* This is a handle to the full state of a random number generator.
real
gmx_rng_gaussian_table(gmx_rng_t rng);
+
+/* The stateless cycle based random number generators below,
+ * which all use threefry2x64, take the following arguments:
+ *
+ * ctr1: In mdrun the step counter, in tools the frame(-step)
+ * counter, so we can ensure reproducible results, even
+ * we starting at different steps/frames. Might need to be
+ * multiplied by a constant if we need more random numbers.
+ * ctr2: A local counter, in mdrun often a global atom index.
+ * If any algorithm needs a variable number of random numbers,
+ * the second counter is usually a function of the local
+ * counter.
+ * key1: A user provided random seed.
+ * key2: A fixed seed which is particular for the algorithm,
+ * as defined at the top of this file, to ensure different
+ * random sequences when the same user seed is used for
+ * different algorithms.
+ */
+
+/* Return two uniform random numbers with 2^53 equally
+ * probable values between 0 and 1 - 2^-53.
+ * It uses a stateless counter based random number generator
+ * (threefry2x64).
+ */
+void
+gmx_rng_cycle_2uniform(gmx_int64_t ctr1, gmx_int64_t ctr2,
+ gmx_int64_t key1, gmx_int64_t key2,
+ double* rnd);
+
+/* Return three Gaussian random numbers with expectation value
+ * 0.0 and standard deviation 1.0. This routine uses a table
+ * lookup for maximum speed. It uses a stateless counter
+ * based random number generator (threefry2x64). See
+ * gmx_rng_gaussian_table for a warning about accuracy of the table.
+ *
+ * threadsafe: yes
+ */
+void
+gmx_rng_cycle_3gaussian_table(gmx_int64_t ctr1, gmx_int64_t ctr2,
+ gmx_int64_t key1, gmx_int64_t key2,
+ real* rnd);
+
+/* As gmx_rng_3gaussian_table, but returns 6 Gaussian numbers. */
+void
+gmx_rng_cycle_6gaussian_table(gmx_int64_t ctr1, gmx_int64_t ctr2,
+ gmx_int64_t key1, gmx_int64_t key2,
+ real* rnd);
+
#ifdef __cplusplus
}
#endif
#include "gromacs/commandline/pargs.h"
#include "vec.h"
#include "mtop_util.h"
-#include "gromacs/random/random.h"
#include "checkpoint.h"
#include "gromacs/fileio/tpxio.h"
#include "gromacs/fileio/trnio.h"
"If you want that, supply a checkpoint file to mdrun\n\n");
}
- if (EI_SD(ir->eI) || ir->eI == eiBD)
- {
- fprintf(stderr, "\nChanging ld-seed from %d ", ir->ld_seed);
- ir->ld_seed = (int)gmx_rng_make_seed();
- fprintf(stderr, "to %d\n\n", ir->ld_seed);
- }
-
frame_fn = ftp2fn(efTRN, NFILE, fnm);
if (fn2ftp(frame_fn) == efCPT)
gmx_update_t upd = NULL;
t_graph *graph = NULL;
globsig_t gs;
- gmx_rng_t mcrng = NULL;
gmx_groups_t *groups;
gmx_ekindata_t *ekind, *ekind_save;
gmx_shellfc_t shellfc;
if (ir->bExpanded)
{
- init_expanded_ensemble(bStateFromCP, ir, &mcrng, &state->dfhist);
+ init_expanded_ensemble(bStateFromCP, ir, &state->dfhist);
}
if (MASTER(cr))
update_energyhistory(&state_global->enerhist, mdebin);
}
- if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG))
- {
- /* Set the random state if we read a checkpoint file */
- set_stochd_state(upd, state);
- }
-
- if (state->flags & (1<<estMC_RNG))
- {
- set_mc_state(mcrng, state);
- }
-
/* Initialize constraints */
if (constr && !DOMAINDECOMP(cr))
{
statistics, but if performing simulated tempering, we
do update the velocities and the tau_t. */
- lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, &state->dfhist, step, mcrng, state->v, mdatoms);
+ lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, &state->dfhist, step, state->v, mdatoms);
/* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
copy_df_history(&state_global->dfhist, &state->dfhist);
}
* the update.
*/
do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
- ir, state, state_global, top_global, fr, upd,
+ ir, state, state_global, top_global, fr,
outf, mdebin, ekind, f, f_global,
- wcycle, mcrng, &nchkpt,
+ wcycle, &nchkpt,
bCPT, bRerunMD, bLastStep, (Flags & MD_CONFOUT),
bSumEkinhOld);
{
if (!bInitStep)
{
- update_tcouple(step, ir, state, ekind, upd, &MassQ, mdatoms);
+ update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
}
if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
{
gmx_bool bIfRandomize;
- bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr);
+ bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state, upd, constr);
/* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
if (constr && bIfRandomize)
{
}
else
{
- update_tcouple(step, ir, state, ekind, upd, &MassQ, mdatoms);
+ update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep);
}
#include "vec.h"
#include "names.h"
#include "domdec.h"
+#include "gromacs/random/random.h"
#define PROBABILITYCUTOFF 100
/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */
if (bMultiEx)
{
/* multiple random switch exchange */
- for (i = 0; i < re->nex; i++)
+ int nself = 0;
+ for (i = 0; i < re->nex + nself; i++)
{
+ double rnd[2];
+
+ gmx_rng_cycle_2uniform(step, i*2, re->seed, RND_SEED_REPLEX, rnd);
/* randomly select a pair */
/* in theory, could reduce this by identifying only which switches had a nonneglibible
probability of occurring (log p > -100) and only operate on those switches */
/* find out which state it is from, and what label that state currently has. Likely
more work that useful. */
- i0 = (int)(re->nrepl*gmx_rng_uniform_real(re->rng));
- i1 = (int)(re->nrepl*gmx_rng_uniform_real(re->rng));
+ i0 = (int)(re->nrepl*rnd[0]);
+ i1 = (int)(re->nrepl*rnd[1]);
if (i0 == i1)
{
- i--;
+ nself++;
continue; /* self-exchange, back up and do it again */
}
prob[0] = exp(-delta);
}
/* roll a number to determine if accepted */
- bEx[0] = (gmx_rng_uniform_real(re->rng) < prob[0]);
+ gmx_rng_cycle_2uniform(step, i*2+1, re->seed, RND_SEED_REPLEX, rnd);
+ bEx[0] = rnd[0] < prob[0];
}
re->prob_sum[0] += prob[0];
else
{
/* standard nearest neighbor replica exchange */
+
m = (step / re->nst) % 2;
for (i = 1; i < re->nrepl; i++)
{
}
else
{
+ double rnd[2];
+
if (delta > PROBABILITYCUTOFF)
{
prob[i] = 0;
prob[i] = exp(-delta);
}
/* roll a number to determine if accepted */
- bEx[i] = (gmx_rng_uniform_real(re->rng) < prob[i]);
+ gmx_rng_cycle_2uniform(step, i, re->seed, RND_SEED_REPLEX, rnd);
+ bEx[i] = rnd[0] < prob[i];
}
re->prob_sum[i] += prob[i];
int i, m, nChargePerturbed = -1, nTypePerturbed = 0, status, nalloc;
char *gro;
gmx_wallcycle_t wcycle;
- gmx_bool bReadRNG, bReadEkin;
+ gmx_bool bReadEkin;
int list;
gmx_walltime_accounting_t walltime_accounting = NULL;
int rc;
}
/* now make sure the state is initialized and propagated */
- set_state_entries(state, inputrec, cr->nnodes);
+ set_state_entries(state, inputrec);
/* A parallel command line option consistency check that we can
only do after any threads have started. */
{
load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
cr, ddxyz,
- inputrec, state, &bReadRNG, &bReadEkin,
+ inputrec, state, &bReadEkin,
(Flags & MD_APPENDFILES),
(Flags & MD_APPENDFILESSET));
- if (bReadRNG)
- {
- Flags |= MD_READ_RNG;
- }
if (bReadEkin)
{
Flags |= MD_READ_EKIN;
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2013, by the GROMACS development team, led by
+# Copyright (c) 2013,2014, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
--suppress=invalidscanf
--suppress=sizeofCalculation
--suppress=missingInclude:src/programs/mdrun/gmx_gpu_utils/gmx_gpu_utils.cu
+ --suppress=*:src/external/Random123-1.08/include/Random123/features/compilerfeatures.h
--inline-suppr)
set(_cxx_flags
-D__cplusplus