From ef2d19b37cb9978512f3ce8f0a6385cc44244d4d Mon Sep 17 00:00:00 2001
From: Roland Schulz <roland@utk.edu>
Date: Mon, 20 Jan 2014 16:33:02 -0500
Subject: [PATCH] Replace all mdrun rngs with cycle based rng

The stateful random rumber generator (rng) used previously doesn't
produce reproducible results in parallel for sd/bd and doesn't
produce reproducible results for continutation for replica exchange.
The rng state has been removed from the checkpoint file.

Fixes #995

Change-Id: Id2a5d064cf363c54db3c16a0675cfeba553feeaa
---
 src/external/Random123-1.08/LICENSE           |  31 +
 .../Random123-1.08/include/Random123/array.h  | 326 +++++++
 .../Random123/features/clangfeatures.h        |  76 ++
 .../Random123/features/compilerfeatures.h     | 320 +++++++
 .../include/Random123/features/gccfeatures.h  | 247 +++++
 .../include/Random123/features/iccfeatures.h  | 208 +++++
 .../include/Random123/features/msvcfeatures.h | 200 ++++
 .../include/Random123/features/nvccfeatures.h | 110 +++
 .../Random123/features/open64features.h       |  50 +
 .../Random123/features/openclfeatures.h       |  89 ++
 .../include/Random123/features/pgccfeatures.h | 194 ++++
 .../include/Random123/features/sse.h          | 280 ++++++
 .../Random123/features/sunprofeatures.h       | 172 ++++
 .../include/Random123/features/xlcfeatures.h  | 202 ++++
 .../include/Random123/threefry.h              | 864 ++++++++++++++++++
 src/gromacs/fileio/trajectory_writing.c       |  10 -
 src/gromacs/fileio/trajectory_writing.h       |   3 -
 src/gromacs/gmxlib/checkpoint.c               |  78 +-
 src/gromacs/gmxlib/mvdata.c                   |  12 -
 src/gromacs/gmxlib/typedefs.c                 |  14 -
 src/gromacs/legacyheaders/checkpoint.h        |   5 +-
 src/gromacs/legacyheaders/mdrun.h             |  14 +-
 src/gromacs/legacyheaders/types/state.h       |  11 +-
 src/gromacs/legacyheaders/update.h            |  13 +-
 src/gromacs/mdlib/coupling.c                  | 301 ++----
 src/gromacs/mdlib/domdec.c                    |  59 --
 src/gromacs/mdlib/domdec_top.c                |  24 -
 src/gromacs/mdlib/expanded.c                  |  34 +-
 src/gromacs/mdlib/init.c                      |  34 +-
 src/gromacs/mdlib/minimize.c                  |   1 -
 src/gromacs/mdlib/sim_util.c                  |   1 -
 src/gromacs/mdlib/tpi.c                       | 394 ++++----
 src/gromacs/mdlib/update.c                    | 211 ++---
 src/gromacs/random/random.c                   |  61 +-
 src/gromacs/random/random.h                   |  56 ++
 src/gromacs/tools/convert_tpr.c               |   8 -
 src/programs/mdrun/md.c                       |  26 +-
 src/programs/mdrun/repl_ex.c                  |  22 +-
 src/programs/mdrun/runner.c                   |  10 +-
 tests/CppCheck.cmake                          |   3 +-
 40 files changed, 3918 insertions(+), 856 deletions(-)
 create mode 100644 src/external/Random123-1.08/LICENSE
 create mode 100644 src/external/Random123-1.08/include/Random123/array.h
 create mode 100644 src/external/Random123-1.08/include/Random123/features/clangfeatures.h
 create mode 100644 src/external/Random123-1.08/include/Random123/features/compilerfeatures.h
 create mode 100644 src/external/Random123-1.08/include/Random123/features/gccfeatures.h
 create mode 100644 src/external/Random123-1.08/include/Random123/features/iccfeatures.h
 create mode 100644 src/external/Random123-1.08/include/Random123/features/msvcfeatures.h
 create mode 100644 src/external/Random123-1.08/include/Random123/features/nvccfeatures.h
 create mode 100644 src/external/Random123-1.08/include/Random123/features/open64features.h
 create mode 100644 src/external/Random123-1.08/include/Random123/features/openclfeatures.h
 create mode 100644 src/external/Random123-1.08/include/Random123/features/pgccfeatures.h
 create mode 100644 src/external/Random123-1.08/include/Random123/features/sse.h
 create mode 100644 src/external/Random123-1.08/include/Random123/features/sunprofeatures.h
 create mode 100644 src/external/Random123-1.08/include/Random123/features/xlcfeatures.h
 create mode 100644 src/external/Random123-1.08/include/Random123/threefry.h

diff --git a/src/external/Random123-1.08/LICENSE b/src/external/Random123-1.08/LICENSE
new file mode 100644
index 0000000000..c6094acaf2
--- /dev/null
+++ b/src/external/Random123-1.08/LICENSE
@@ -0,0 +1,31 @@
+/** @page LICENSE
+Copyright 2010-2012, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/src/external/Random123-1.08/include/Random123/array.h b/src/external/Random123-1.08/include/Random123/array.h
new file mode 100644
index 0000000000..ab85392d8d
--- /dev/null
+++ b/src/external/Random123-1.08/include/Random123/array.h
@@ -0,0 +1,326 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _r123array_dot_h__
+#define _r123array_dot_h__
+#include "features/compilerfeatures.h"
+#include "features/sse.h"
+
+#ifndef __cplusplus
+#define CXXMETHODS(_N, W, T)
+#define CXXOVERLOADS(_N, W, T)
+#else
+
+#include <stddef.h>
+#include <algorithm>
+#include <stdexcept>
+#include <iterator>
+#include <limits>
+#include <iostream>
+
+/** @defgroup arrayNxW The r123arrayNxW classes 
+
+    Each of the r123arrayNxW is a fixed size array of N W-bit unsigned integers.
+    It is functionally equivalent to the C++0x std::array<N, uintW_t>,
+    but does not require C++0x features or libraries.
+
+    In addition to meeting most of the requirements of a Container,
+    it also has a member function, incr(), which increments the zero-th
+    element and carrys overflows into higher indexed elements.  Thus,
+    by using incr(), sequences of up to 2^(N*W) distinct values
+    can be produced. 
+
+    If SSE is supported by the compiler, then the class
+    r123array1xm128i is also defined, in which the data member is an
+    array of one r123128i object.
+
+    @cond HIDDEN_FROM_DOXYGEN
+*/
+
+template <typename value_type>
+inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){
+    value_type v=0;
+    for(size_t i=0; i<(3+sizeof(value_type))/4; ++i)
+        v |= ((value_type)(*p32++)) << (32*i);
+    return v;
+}
+
+// Work-alike methods and typedefs modeled on std::array:
+#define CXXMETHODS(_N, W, T)                                            \
+    typedef T value_type;                                               \
+    typedef T* iterator;                                                \
+    typedef const T* const_iterator;                                    \
+    typedef value_type& reference;                                      \
+    typedef const value_type& const_reference;                          \
+    typedef size_t size_type;                                           \
+    typedef ptrdiff_t difference_type;                                  \
+    typedef T* pointer;                                                 \
+    typedef const T* const_pointer;                                     \
+    typedef std::reverse_iterator<iterator> reverse_iterator;           \
+    typedef std::reverse_iterator<const_iterator> const_reverse_iterator; \
+    /* Boost.array has static_size.  C++11 specializes tuple_size */    \
+    enum {static_size = _N};                                            \
+    R123_CUDA_DEVICE reference operator[](size_type i){return v[i];}                     \
+    R123_CUDA_DEVICE const_reference operator[](size_type i) const {return v[i];}        \
+    R123_CUDA_DEVICE reference at(size_type i){ if(i >=  _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
+    R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >=  _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \
+    R123_CUDA_DEVICE size_type size() const { return  _N; }                              \
+    R123_CUDA_DEVICE size_type max_size() const { return _N; }                           \
+    R123_CUDA_DEVICE bool empty() const { return _N==0; };                               \
+    R123_CUDA_DEVICE iterator begin() { return &v[0]; }                                  \
+    R123_CUDA_DEVICE iterator end() { return &v[_N]; }                                   \
+    R123_CUDA_DEVICE const_iterator begin() const { return &v[0]; }                      \
+    R123_CUDA_DEVICE const_iterator end() const { return &v[_N]; }                       \
+    R123_CUDA_DEVICE const_iterator cbegin() const { return &v[0]; }                     \
+    R123_CUDA_DEVICE const_iterator cend() const { return &v[_N]; }                      \
+    R123_CUDA_DEVICE reverse_iterator rbegin(){ return reverse_iterator(end()); }        \
+    R123_CUDA_DEVICE const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } \
+    R123_CUDA_DEVICE reverse_iterator rend(){ return reverse_iterator(begin()); }        \
+    R123_CUDA_DEVICE const_reverse_iterator rend() const{ return const_reverse_iterator(begin()); } \
+    R123_CUDA_DEVICE const_reverse_iterator crbegin() const{ return const_reverse_iterator(cend()); } \
+    R123_CUDA_DEVICE const_reverse_iterator crend() const{ return const_reverse_iterator(cbegin()); } \
+    R123_CUDA_DEVICE pointer data(){ return &v[0]; }                                     \
+    R123_CUDA_DEVICE const_pointer data() const{ return &v[0]; }                         \
+    R123_CUDA_DEVICE reference front(){ return v[0]; }                                   \
+    R123_CUDA_DEVICE const_reference front() const{ return v[0]; }                       \
+    R123_CUDA_DEVICE reference back(){ return v[_N-1]; }                                 \
+    R123_CUDA_DEVICE const_reference back() const{ return v[_N-1]; }                     \
+    R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const{ \
+	/* CUDA3 does not have std::equal */ \
+	for (size_t i = 0; i < _N; ++i) \
+	    if (v[i] != rhs.v[i]) return false; \
+	return true; \
+    } \
+    R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const{ return !(*this == rhs); } \
+    /* CUDA3 does not have std::fill_n */ \
+    R123_CUDA_DEVICE void fill(const value_type& val){ for (size_t i = 0; i < _N; ++i) v[i] = val; } \
+    R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs){ \
+	/* CUDA3 does not have std::swap_ranges */ \
+	for (size_t i = 0; i < _N; ++i) { \
+	    T tmp = v[i]; \
+	    v[i] = rhs.v[i]; \
+	    rhs.v[i] = tmp; \
+	} \
+    } \
+    R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n=1){                         \
+        /* This test is tricky because we're trying to avoid spurious   \
+           complaints about illegal shifts, yet still be compile-time   \
+           evaulated. */                                                \
+        if(sizeof(T)<sizeof(n) && n>>((sizeof(T)<sizeof(n))?8*sizeof(T):0) ) \
+            return incr_carefully(n);                                   \
+        if(n==1){                                                       \
+            ++v[0];                                                     \
+            if(_N==1 || R123_BUILTIN_EXPECT(!!v[0], 1)) return *this;   \
+        }else{                                                          \
+            v[0] += n;                                                  \
+            if(_N==1 || R123_BUILTIN_EXPECT(n<=v[0], 1)) return *this;  \
+        }                                                               \
+        /* We expect that the N==?? tests will be                       \
+           constant-folded/optimized away by the compiler, so only the  \
+           overflow tests (!!v[i]) remain to be done at runtime.  For  \
+           small values of N, it would be better to do this as an       \
+           uncondtional sequence of adc.  An experiment/optimization    \
+           for another day...                                           \
+           N.B.  The weird subscripting: v[_N>3?3:0] is to silence      \
+           a spurious error from icpc                                   \
+           */                                                           \
+        ++v[_N>1?1:0];                                                  \
+        if(_N==2 || R123_BUILTIN_EXPECT(!!v[_N>1?1:0], 1)) return *this; \
+        ++v[_N>2?2:0];                                                  \
+        if(_N==3 || R123_BUILTIN_EXPECT(!!v[_N>2?2:0], 1)) return *this;  \
+        ++v[_N>3?3:0];                                                  \
+        for(size_t i=4; i<_N; ++i){                                     \
+            if( R123_BUILTIN_EXPECT(!!v[i-1], 1) ) return *this;        \
+            ++v[i];                                                     \
+        }                                                               \
+        return *this;                                                   \
+    }                                                                   \
+    /* seed(SeedSeq) would be a constructor if having a constructor */  \
+    /* didn't cause headaches with defaults */                          \
+    template <typename SeedSeq>                                         \
+    R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq &ss){      \
+        r123array##_N##x##W ret;                                        \
+        const size_t Ngen = _N*((3+sizeof(value_type))/4);              \
+        uint32_t u32[Ngen];                                             \
+        uint32_t *p32 = &u32[0];                                        \
+        ss.generate(&u32[0], &u32[Ngen]);                               \
+        for(size_t i=0; i<_N; ++i){                                     \
+            ret.v[i] = assemble_from_u32<value_type>(p32);              \
+            p32 += (3+sizeof(value_type))/4;                            \
+        }                                                               \
+        return ret;                                                     \
+    }                                                                   \
+protected:                                                              \
+    R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n){ \
+        /* n may be greater than the maximum value of a single value_type */ \
+        value_type vtn;                                                 \
+        vtn = n;                                                        \
+        v[0] += n;                                                      \
+        const unsigned rshift = 8* ((sizeof(n)>sizeof(value_type))? sizeof(value_type) : 0); \
+        for(size_t i=1; i<_N; ++i){                                     \
+            if(rshift){                                                 \
+                n >>= rshift;                                           \
+            }else{                                                      \
+                n=0;                                                    \
+            }                                                           \
+            if( v[i-1] < vtn )                                          \
+                ++n;                                                    \
+            if( n==0 ) break;                                           \
+            vtn = n;                                                    \
+            v[i] += n;                                                  \
+        }                                                               \
+        return *this;                                                   \
+    }                                                                   \
+    
+                                                                        
+// There are several tricky considerations for the insertion and extraction
+// operators:
+// - we would like to be able to print r123array16x8 as a sequence of 16 integers,
+//   not as 16 bytes.
+// - we would like to be able to print r123array1xm128i.
+// - we do not want an int conversion operator in r123m128i because it causes
+//   lots of ambiguity problems with automatic promotions.
+// Solution: r123arrayinsertable and r123arrayextractable
+
+template<typename T>
+struct r123arrayinsertable{
+    const T& v;
+    r123arrayinsertable(const T& t_) : v(t_) {} 
+    friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable<T>& t){
+        return os << t.v;
+    }
+};
+
+template<>
+struct r123arrayinsertable<uint8_t>{
+    const uint8_t& v;
+    r123arrayinsertable(const uint8_t& t_) : v(t_) {} 
+    friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable<uint8_t>& t){
+        return os << (int)t.v;
+    }
+};
+
+template<typename T>
+struct r123arrayextractable{
+    T& v;
+    r123arrayextractable(T& t_) : v(t_) {}
+    friend std::istream& operator>>(std::istream& is, r123arrayextractable<T>& t){
+        return is >> t.v;
+    }
+};
+
+template<>
+struct r123arrayextractable<uint8_t>{
+    uint8_t& v;
+    r123arrayextractable(uint8_t& t_) : v(t_) {} 
+    friend std::istream& operator>>(std::istream& is, r123arrayextractable<uint8_t>& t){
+        int i;
+        is >>  i;
+        t.v = i;
+        return is;
+    }
+};
+
+#define CXXOVERLOADS(_N, W, T)                                          \
+                                                                        \
+inline std::ostream& operator<<(std::ostream& os, const r123array##_N##x##W& a){   \
+    os << r123arrayinsertable<T>(a.v[0]);                                  \
+    for(size_t i=1; i<_N; ++i)                                          \
+        os << " " << r123arrayinsertable<T>(a.v[i]);                       \
+    return os;                                                          \
+}                                                                       \
+                                                                        \
+inline std::istream& operator>>(std::istream& is, r123array##_N##x##W& a){         \
+    for(size_t i=0; i<_N; ++i){                                         \
+        r123arrayextractable<T> x(a.v[i]);                                 \
+        is >> x;                                                        \
+    }                                                                   \
+    return is;                                                          \
+}                                                                       \
+                                                                        \
+namespace r123{                                                        \
+ typedef r123array##_N##x##W Array##_N##x##W;                          \
+}
+                                                                        
+#endif /* __cplusplus */
+
+/* _r123array_tpl expands to a declaration of struct r123arrayNxW.  
+
+   In C, it's nothing more than a struct containing an array of N
+   objects of type T.
+
+   In C++ it's the same, but endowed with an assortment of member
+   functions, typedefs and friends.  In C++, r123arrayNxW looks a lot
+   like std::array<T,N>, has most of the capabilities of a container,
+   and satisfies the requirements outlined in compat/Engine.hpp for
+   counter and key types.  ArrayNxW, in the r123 namespace is
+   a typedef equivalent to r123arrayNxW.
+*/
+
+#define _r123array_tpl(_N, W, T)                   \
+    /** @ingroup arrayNxW */                        \
+    /** @see arrayNxW */                            \
+struct r123array##_N##x##W{                         \
+ T v[_N];                                       \
+ CXXMETHODS(_N, W, T)                           \
+};                                              \
+                                                \
+CXXOVERLOADS(_N, W, T)
+
+/** @endcond */
+
+_r123array_tpl(1, 32, uint32_t)  /* r123array1x32 */
+_r123array_tpl(2, 32, uint32_t)  /* r123array2x32 */
+_r123array_tpl(4, 32, uint32_t)  /* r123array4x32 */
+_r123array_tpl(8, 32, uint32_t)  /* r123array8x32 */
+
+_r123array_tpl(1, 64, uint64_t)  /* r123array1x64 */
+_r123array_tpl(2, 64, uint64_t)  /* r123array2x64 */
+_r123array_tpl(4, 64, uint64_t)  /* r123array4x64 */
+
+_r123array_tpl(16, 8, uint8_t)  /* r123array16x8 for ARSsw, AESsw */
+
+#if R123_USE_SSE
+_r123array_tpl(1, m128i, r123m128i) /* r123array1x128i for ARSni, AESni */
+#endif
+
+/* In C++, it's natural to use sizeof(a::value_type), but in C it's
+   pretty convoluted to figure out the width of the value_type of an
+   r123arrayNxW:
+*/
+#define R123_W(a)   (8*sizeof(((a *)0)->v[0]))
+
+/** @namespace r123
+  Most of the Random123 C++ API is contained in the r123 namespace. 
+*/
+
+#endif
+
diff --git a/src/external/Random123-1.08/include/Random123/features/clangfeatures.h b/src/external/Random123-1.08/include/Random123/features/clangfeatures.h
new file mode 100644
index 0000000000..908aee8b0b
--- /dev/null
+++ b/src/external/Random123-1.08/include/Random123/features/clangfeatures.h
@@ -0,0 +1,76 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __clangfeatures_dot_hpp
+#define __clangfeatures_dot_hpp
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H ((defined(__x86_64__)||defined(__i386__)))
+#endif
+
+#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
+#define R123_USE_CXX11_UNRESTRICTED_UNIONS __has_feature(cxx_unrestricted_unions)
+#endif
+
+#ifndef R123_USE_CXX11_STATIC_ASSERT
+#define R123_USE_CXX11_STATIC_ASSERT __has_feature(cxx_static_assert)
+#endif
+
+#ifndef R123_USE_CXX11_CONSTEXPR
+#define R123_USE_CXX11_CONSTEXPR __has_feature(cxx_constexpr)
+#endif
+
+#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
+#define R123_USE_CXX11_EXPLICIT_CONVERSIONS __has_feature(cxx_explicit_conversions)
+#endif
+
+// With clang-3.0, the apparently simpler:
+//  #define R123_USE_CXX11_RANDOM __has_include(<random>)
+// dumps core.
+#ifndef R123_USE_CXX11_RANDOM
+#if __cplusplus>=201103L && __has_include(<random>)
+#define R123_USE_CXX11_RANDOM 1
+#else
+#define R123_USE_CXX11_RANDOM 0
+#endif
+#endif
+
+#ifndef R123_USE_CXX11_TYPE_TRAITS
+#if __cplusplus>=201103L && __has_include(<type_traits>)
+#define R123_USE_CXX11_TYPE_TRAITS 1
+#else
+#define R123_USE_CXX11_TYPE_TRAITS 0
+#endif
+#endif
+
+#include "gccfeatures.h"
+
+#endif
diff --git a/src/external/Random123-1.08/include/Random123/features/compilerfeatures.h b/src/external/Random123-1.08/include/Random123/features/compilerfeatures.h
new file mode 100644
index 0000000000..4039790823
--- /dev/null
+++ b/src/external/Random123-1.08/include/Random123/features/compilerfeatures.h
@@ -0,0 +1,320 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/**
+
+@page porting Preprocessor symbols for porting Random123 to different platforms.
+
+The Random123 library is portable across C, C++, CUDA, OpenCL environments,
+and multiple operating systems (Linux, Windows 7, Mac OS X, FreeBSD, Solaris).
+This level of portability requires the abstraction of some features
+and idioms that are either not standardized (e.g., asm statments), or for which 
+different vendors have their own standards (e.g., SSE intrinsics) or for
+which vendors simply refuse to conform to well-established standards (e.g., <inttypes.h>).
+
+Random123/features/compilerfeatures.h
+conditionally includes a compiler-or-OS-specific Random123/featires/XXXfeatures.h file which
+defines appropriate values for the preprocessor symbols which can be used with
+a specific compiler or OS.  Those symbols will then
+be used by other header files and source files in the Random123
+library (and may be used by applications) to control what actually
+gets presented to the compiler.
+
+Most of the symbols are boolean valued.  In general, they will
+\b always be defined with value either 1 or 0, so do
+\b NOT use \#ifdef.  Use \#if R123_USE_SOMETHING instead.
+
+Library users can override any value by defining the pp-symbol with a compiler option,
+e.g.,
+
+    cc -DR123_USE_MULHILO64_C99 
+
+will use a strictly c99 version of the full-width 64x64->128-bit multiplication
+function, even if it would be disabled by default.
+
+All boolean-valued pre-processor symbols in Random123/features/compilerfeatures.h start with the prefix R123_USE_
+@verbatim
+         AES_NI
+         AES_OPENSSL
+         SSE4_2
+         SSE4_1
+         SSE
+
+         STD_RANDOM
+
+         GNU_UINT128
+         ASM_GNU
+         ASM_MSASM
+
+         CPUID_MSVC
+
+         CXX11_RANDOM
+         CXX11_TYPE_TRAITS
+         CXX11_STATIC_ASSERT
+         CXX11_CONSTEXPR
+         CXX11_UNRESTRICTED_UNIONS
+         CXX11_EXPLICIT_CONVERSIONS
+         CXX11_LONG_LONG
+         CXX11 
+   
+         X86INTRIN_H
+         IA32INTRIN_H
+         XMMINTRIN_H
+         EMMINTRIN_H
+         SMMINTRIN_H
+         WMMINTRIN_H
+         INTRIN_H
+
+         MULHILO32_ASM
+         MULHILO64_ASM
+         MULHILO64_MSVC_INTRIN
+         MULHILO64_CUDA_INTRIN
+         MULHILO64_OPENCL_INTRIN
+         MULHILO64_C99
+
+         U01_DOUBLE
+	 
+@endverbatim
+Most have obvious meanings.  Some non-obvious ones:
+
+AES_NI and AES_OPENSSL are not mutually exclusive.  You can have one,
+both or neither.
+
+GNU_UINT128 says that it's safe to use __uint128_t, but it
+does not require its use.  In particular, it should be
+used in mulhilo<uint64_t> only if MULHILO64_ASM is unset.
+
+If the XXXINTRIN_H macros are true, then one should
+@code
+#include <xxxintrin.h>
+@endcode
+to gain accesss to compiler intrinsics.
+
+The CXX11_SOME_FEATURE macros allow the code to use specific
+features of the C++11 language and library.  The catchall
+In the absence of a specific CXX11_SOME_FEATURE, the feature
+is controlled by the catch-all R123_USE_CXX11 macro.
+
+U01_DOUBLE defaults on, and can be turned off (set to 0)
+if one does not want the utility functions that convert to double
+(i.e. u01_*_53()), e.g. on OpenCL without the cl_khr_fp64 extension.
+
+There are a number of invariants that are always true.  Application code may
+choose to rely on these:
+
+<ul>
+<li>ASM_GNU and ASM_MASM are mutually exclusive
+<li>The "higher" SSE values imply the lower ones.
+</ul>
+
+There are also non-boolean valued symbols:
+
+<ul>
+<li>R123_STATIC_INLINE -
+  According to both C99 and GNU99, the 'static inline' declaration allows
+  the compiler to not emit code if the function is not used.  
+  Note that the semantics of 'inline', 'static' and 'extern' in
+  gcc have changed over time and are subject to modification by
+  command line options, e.g., -std=gnu89, -fgnu-inline.
+  Nevertheless, it appears that the meaning of 'static inline' 
+  has not changed over time and (with a little luck) the use of 'static inline'
+  here will be portable between versions of gcc and to other C99
+  compilers.
+  See: http://gcc.gnu.org/onlinedocs/gcc/Inline.html
+       http://www.greenend.org.uk/rjk/2003/03/inline.html
+
+<li>R123_FORCE_INLINE(decl) -
+  which expands to 'decl', adorned with the compiler-specific
+  embellishments to strongly encourage that the declared function be
+  inlined.  If there is no such compiler-specific magic, it should
+  expand to decl, unadorned.
+   
+<li>R123_CUDA_DEVICE - which expands to __device__ (or something else with
+  sufficiently similar semantics) when CUDA is in use, and expands
+  to nothing in other cases.
+
+<li>R123_ASSERT(x) - which expands to assert(x), or maybe to nothing at
+  all if we're in an environment so feature-poor that you can't even
+  call assert (I'm looking at you, CUDA and OpenCL), or even include
+  assert.h safely (OpenCL).
+
+<li>R123_STATIC_ASSERT(expr,msg) - which expands to
+  static_assert(expr,msg), or to an expression that
+  will raise a compile-time exception if expr is not true.
+
+<li>R123_ULONG_LONG - which expands to a declaration of the longest available
+  unsigned integer.
+
+<li>R123_64BIT(x) - expands to something equivalent to
+  UINT64_C(x) from <stdint.h>, even in environments where <stdint.h>
+  is not available, e.g., MSVC and OpenCL.
+
+<li>R123_BUILTIN_EXPECT(expr,likely_value) - expands to something with
+  the semantics of gcc's __builtin_expect(expr,likely_value).  If
+  the environment has nothing like __builtin_expect, it should expand
+  to just expr.
+</ul>
+
+
+\cond HIDDEN_FROM_DOXYGEN
+*/
+
+/* 
+N.B.  When something is added to the list of features, it should be
+added to each of the *features.h files, AND to examples/ut_features.cpp.
+*/
+
+/* N.B.  most other compilers (icc, nvcc, open64, llvm) will also define __GNUC__, so order matters. */
+#if defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ > 0
+#include "openclfeatures.h"
+#elif defined(__CUDACC__)
+#include "nvccfeatures.h"
+#elif defined(__ICC)
+#include "iccfeatures.h"
+#elif defined(__xlC__)
+#include "xlcfeatures.h"
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+#include "sunprofeatures.h"
+#elif defined(__OPEN64__)
+#include "open64features.h"
+#elif defined(__clang__)
+#include "clangfeatures.h"
+#elif defined(__GNUC__)
+#include "gccfeatures.h"
+#elif defined(__PGI)
+#include "pgccfeatures.h"
+#elif defined(_MSC_FULL_VER)
+#include "msvcfeatures.h"
+#else
+#error "Can't identify compiler.  You'll need to add a new xxfeatures.hpp"
+{ /* maybe an unbalanced brace will terminate the compilation */
+#endif
+
+#ifndef R123_USE_CXX11
+#define R123_USE_CXX11 (__cplusplus >= 201103L)
+#endif
+
+#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
+#define R123_USE_CXX11_UNRESTRICTED_UNIONS R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_STATIC_ASSERT
+#define R123_USE_CXX11_STATIC_ASSERT R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_CONSTEXPR
+#define R123_USE_CXX11_CONSTEXPR R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
+#define R123_USE_CXX11_EXPLICIT_CONVERSIONS R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_RANDOM
+#define R123_USE_CXX11_RANDOM R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_TYPE_TRAITS
+#define R123_USE_CXX11_TYPE_TRAITS R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_CXX11_LONG_LONG
+#define R123_USE_CXX11_LONG_LONG R123_USE_CXX11
+#endif
+
+#ifndef R123_USE_MULHILO64_C99
+#define R123_USE_MULHILO64_C99 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MULHI_INTRIN
+#define R123_USE_MULHILO64_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO32_MULHI_INTRIN
+#define R123_USE_MULHILO32_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_STATIC_ASSERT
+#if R123_USE_CXX11_STATIC_ASSERT
+#define R123_STATIC_ASSERT(expr, msg) static_assert(expr, msg)
+#else
+    /* if msg always_looked_like_this, we could paste it into the name.  Worth it? */
+#define R123_STATIC_ASSERT(expr, msg) typedef char static_assertion[(!!(expr))*2-1]
+#endif
+#endif
+
+#ifndef R123_CONSTEXPR
+#if R123_USE_CXX11_CONSTEXPR
+#define R123_CONSTEXPR constexpr
+#else
+#define R123_CONSTEXPR
+#endif
+#endif
+
+#ifndef R123_USE_PHILOX_64BIT
+#define R123_USE_PHILOX_64BIT (R123_USE_MULHILO64_ASM || R123_USE_MULHILO64_MSVC_INTRIN || R123_USE_MULHILO64_CUDA_INTRIN || R123_USE_GNU_UINT128 || R123_USE_MULHILO64_C99 || R123_USE_MULHILO64_OPENCL_INTRIN || R123_USE_MULHILO64_MULHI_INTRIN)
+#endif
+
+#ifndef R123_ULONG_LONG
+#if defined(__cplusplus) && !R123_USE_CXX11_LONG_LONG
+/* C++98 doesn't have long long.  It doesn't have uint64_t either, but
+   we will have typedef'ed uint64_t to something in the xxxfeatures.h.
+   With luck, it won't elicit complaints from -pedantic.  Cross your
+   fingers... */
+#define R123_ULONG_LONG uint64_t
+#else
+#define R123_ULONG_LONG unsigned long long
+#endif
+#endif
+
+/* UINT64_C should have been #defined by XXXfeatures.h, either by
+   #include <stdint.h> or through compiler-dependent hacks */
+#ifndef R123_64BIT
+#define R123_64BIT(x) UINT64_C(x)
+#endif
+
+#ifndef R123_THROW
+#define R123_THROW(x)    throw (x)
+#endif
+
+/*
+ * Windows.h (and perhaps other "well-meaning" code define min and
+ * max, so there's a high chance that our definition of min, max
+ * methods or use of std::numeric_limits min and max will cause
+ * complaints in any program that happened to include Windows.h or
+ * suchlike first.  We use the null macro below in our own header
+ * files definition or use of min, max to defensively preclude
+ * this problem.  It may not be enough; one might need to #define
+ * NOMINMAX before including Windows.h or compile with -DNOMINMAX.
+ */
+#define R123_NO_MACRO_SUBST
+
+/** \endcond */
diff --git a/src/external/Random123-1.08/include/Random123/features/gccfeatures.h b/src/external/Random123-1.08/include/Random123/features/gccfeatures.h
new file mode 100644
index 0000000000..d6bb06088d
--- /dev/null
+++ b/src/external/Random123-1.08/include/Random123/features/gccfeatures.h
@@ -0,0 +1,247 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __gccfeatures_dot_hpp
+#define __gccfeatures_dot_hpp
+
+#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+
+#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__)
+#  error "This code has only been tested on x86 and powerpc platforms."
+#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
+{ /* maybe an unbalanced brace will terminate the compilation */
+ /* Feel free to try the Random123 library on other architectures by changing
+ the conditions that reach this error, but you should consider it a
+ porting exercise and expect to encounter bugs and deficiencies.
+ Please let the authors know of any successes (or failures). */
+#endif
+
+#ifdef __powerpc__
+#include <ppu_intrinsics.h>
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static __inline__
+#endif
+
+#ifndef R123_FORCE_INLINE
+#if R123_GNUC_VERSION >= 40000
+#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
+#else
+#define R123_FORCE_INLINE(decl) decl
+#endif
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
+#endif
+
+/* According to the C++0x standard, we should be able to test the numeric
+   value of __cplusplus == 199701L for C++98, __cplusplus == 201103L for C++0x
+   But gcc has had an open bug  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=1773
+   since early 2001, which was finally fixed in 4.7 (early 2012).  For
+   earlier versions, the only way  to detect whether --std=c++0x was requested
+   on the command line is to look at the __GCC_EXPERIMENTAL_CXX0X__ pp-symbol.
+*/
+#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && defined(__GCC_EXPERIMENTAL_CXX0X__) ))
+
+#ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS
+#define R123_USE_CXX11_UNRESTRICTED_UNIONS ((R123_GNUC_VERSION >= 40600) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_STATIC_ASSERT
+#define R123_USE_CXX11_STATIC_ASSERT ((R123_GNUC_VERSION >= 40300) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_CONSTEXPR
+#define R123_USE_CXX11_CONSTEXPR ((R123_GNUC_VERSION >= 40600) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_EXPLICIT_CONVERSIONS
+#define R123_USE_CXX11_EXPLICIT_CONVERSIONS ((R123_GNUC_VERSION >= 40500) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_RANDOM
+#define R123_USE_CXX11_RANDOM ((R123_GNUC_VERSION>=40500) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_CXX11_TYPE_TRAITS
+#define R123_USE_CXX11_TYPE_TRAITS ((R123_GNUC_VERSION>=40400) && GNU_CXX11)
+#endif
+
+#ifndef R123_USE_AES_NI
+#ifdef __AES__
+#define R123_USE_AES_NI 1
+#else
+#define R123_USE_AES_NI 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_2
+#ifdef __SSE4_2__
+#define R123_USE_SSE4_2 1
+#else
+#define R123_USE_SSE4_2 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_1
+#ifdef __SSE4_1__
+#define R123_USE_SSE4_1 1
+#else
+#define R123_USE_SSE4_1 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE
+/* There's no point in trying to compile SSE code in Random123
+   unless SSE2 is available. */
+#ifdef __SSE2__
+#define R123_USE_SSE 1
+#else
+#define R123_USE_SSE 0
+#endif
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+/* There isn't really a good way to tell at compile time whether
+   openssl is available.  Without a pre-compilation configure-like
+   tool, it's less error-prone to guess that it isn't available.  Add
+   -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
+   play with openssl */
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#ifdef __x86_64__
+#define R123_USE_GNU_UINT128 1
+#else
+#define R123_USE_GNU_UINT128 0
+#endif
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU (defined(__x86_64__)||defined(__i386__))
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H ((defined(__x86_64__)||defined(__i386__)) && R123_GNUC_VERSION >= 40402)
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+/* gcc -m64 on Solaris 10 defines __SSE2__ but doesn't have 
+   emmintrin.h in the include search path.  This is
+   so broken that I refuse to try to work around it.  If this
+   affects you, figure out where your emmintrin.h lives and
+   add an appropriate -I to your CPPFLAGS.  Or add -DR123_USE_SSE=0. */
+#define R123_USE_EMMINTRIN_H (R123_USE_SSE && (R123_GNUC_VERSION < 40402))
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H ((R123_USE_SSE4_1 || R123_USE_SSE4_2) && (R123_GNUC_VERSION < 40402))
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#define R123_USE_INTRIN_H 0
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MULHI_INTRIN
+#define R123_USE_MULHILO64_MULHI_INTRIN (defined(__powerpc64__))
+#endif
+
+#ifndef R123_MULHILO64_MULHI_INTRIN
+#define R123_MULHILO64_MULHI_INTRIN __mulhdu
+#endif
+
+#ifndef R123_USE_MULHILO32_MULHI_INTRIN
+#define R123_USE_MULHILO32_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_MULHILO32_MULHI_INTRIN
+#define R123_MULHILO32_MULHI_INTRIN __mulhwu
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined.  You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+/* If you add something, it must go in all the other XXfeatures.hpp
+   and in ../ut_features.cpp */
+#endif
diff --git a/src/external/Random123-1.08/include/Random123/features/iccfeatures.h b/src/external/Random123-1.08/include/Random123/features/iccfeatures.h
new file mode 100644
index 0000000000..b64e5c2299
--- /dev/null
+++ b/src/external/Random123-1.08/include/Random123/features/iccfeatures.h
@@ -0,0 +1,208 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __icpcfeatures_dot_hpp
+#define __icpcfeatures_dot_hpp
+
+// icc relies on gcc libraries and other toolchain components.
+#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+
+#if !defined(__x86_64__) && !defined(__i386__)
+#  error "This code has only been tested on x86 platforms."
+{ // maybe an unbalanced brace will terminate the compilation
+// You are invited to try Easy123 on other architectures, by changing
+// the conditions that reach this error, but you should consider it a
+// porting exercise and expect to encounter bugs and deficiencies.
+// Please let the authors know of any successes (or failures).
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
+#endif
+
+// The basic idiom is:
+// #ifndef R123_SOMETHING
+// #if some condition
+// #define R123_SOMETHING 1
+// #else
+// #define R123_SOMETHING 0
+// #endif
+// #endif
+// This idiom allows an external user to override any decision
+// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
+
+// An alternative idiom is:
+// #ifndef R123_SOMETHING
+// #define R123_SOMETHING (some boolean expression)
+// #endif
+// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
+// pp-symbols.
+
+#ifndef R123_USE_SSE4_2
+#ifdef __SSE4_2__
+#define R123_USE_SSE4_2 1
+#else
+#define R123_USE_SSE4_2 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_1
+#ifdef __SSE4_1__
+#define R123_USE_SSE4_1 1
+#else
+#define R123_USE_SSE4_1 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE
+#ifdef __SSE2__
+#define R123_USE_SSE 1
+#else
+#define R123_USE_SSE 0
+#endif
+#endif
+
+#ifndef R123_USE_AES_NI
+// Unlike gcc, icc (version 12) does not pre-define an __AES__
+// pp-symbol when -maes or -xHost is on the command line.  This feels
+// like a defect in icc (it defines __SSE4_2__ in analogous
+// circumstances), but until Intel fixes it, we're better off erring
+// on the side of caution and not generating instructions that are
+// going to raise SIGILL when executed.  To get the AES-NI
+// instructions with icc, the caller must puts something like
+// -DR123_USE_AES_NI=1 or -D__AES__ on the command line.  FWIW, the
+// AES-NI Whitepaper by Gueron says that icc has supported AES-NI from
+// 11.1 onwards.
+//
+#define R123_USE_AES_NI ((__ICC>=1101) && defined(__AES__))
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+/* There isn't really a good way to tell at compile time whether
+   openssl is available.  Without a pre-compilation configure-like
+   tool, it's less error-prone to guess that it isn't available.  Add
+   -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
+   play with openssl */
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 1
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 1
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#define R123_USE_INTRIN_H 0
+#endif
+
+#ifndef R123_USE_MULHILO16_ASM
+#define R123_USE_MULHILO16_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 1
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined.  You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+// If you add something, it must go in all the other XXfeatures.hpp
+// and in ../ut_features.cpp
+#endif
diff --git a/src/external/Random123-1.08/include/Random123/features/msvcfeatures.h b/src/external/Random123-1.08/include/Random123/features/msvcfeatures.h
new file mode 100644
index 0000000000..9eb9520912
--- /dev/null
+++ b/src/external/Random123-1.08/include/Random123/features/msvcfeatures.h
@@ -0,0 +1,200 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __msvcfeatures_dot_hpp
+#define __msvcfeatures_dot_hpp
+
+//#if _MSVC_FULL_VER <= 15
+//#error "We've only tested MSVC_FULL_VER==15."
+//#endif
+
+#if !defined(_M_IX86) && !defined(_M_X64)
+#  error "This code has only been tested on x86 platforms."
+{ // maybe an unbalanced brace will terminate the compilation
+// You are invited to try Random123 on other architectures, by changing
+// the conditions that reach this error, but you should consider it a
+// porting exercise and expect to encounter bugs and deficiencies.
+// Please let the authors know of any successes (or failures).
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static __inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) _forceinline decl
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) expr
+#endif
+
+// The basic idiom is:
+// #ifndef R123_SOMETHING
+// #if some condition
+// #define R123_SOMETHING 1
+// #else
+// #define R123_SOMETHING 0
+// #endif
+// #endif
+// This idiom allows an external user to override any decision
+// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
+
+// An alternative idiom is:
+// #ifndef R123_SOMETHING
+// #define R123_SOMETHING (some boolean expression)
+// #endif
+// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
+// pp-symbols.
+
+#ifndef R123_USE_AES_NI
+#if defined(_M_X64)
+#define R123_USE_AES_NI 1
+#else
+#define R123_USE_AES_NI 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_2
+#if defined(_M_X64)
+#define R123_USE_SSE4_2 1
+#else
+#define R123_USE_SSE4_2 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE4_1
+#if defined(_M_X64)
+#define R123_USE_SSE4_1 1
+#else
+#define R123_USE_SSE4_1 0
+#endif
+#endif
+
+#ifndef R123_USE_SSE
+#define R123_USE_SSE 1
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 0
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 1
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#define R123_USE_INTRIN_H 1
+#endif
+
+#ifndef R123_USE_MULHILO16_ASM
+#define R123_USE_MULHILO16_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#if defined(_M_X64)
+#define R123_USE_MULHILO64_MSVC_INTRIN 1
+#else
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined.  You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+#pragma warning(disable:4244)
+#pragma warning(disable:4996)
+
+// If you add something, it must go in all the other XXfeatures.hpp
+// and in ../ut_features.cpp
+#endif
diff --git a/src/external/Random123-1.08/include/Random123/features/nvccfeatures.h b/src/external/Random123-1.08/include/Random123/features/nvccfeatures.h
new file mode 100644
index 0000000000..711babf88e
--- /dev/null
+++ b/src/external/Random123-1.08/include/Random123/features/nvccfeatures.h
@@ -0,0 +1,110 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __r123_nvcc_features_dot_h__
+#define __r123_nvcc_features_dot_h__
+
+#if !defined(CUDART_VERSION)
+#error "why are we in nvccfeatures.h if CUDART_VERSION is not defined"
+#endif
+
+#if CUDART_VERSION < 4010
+#error "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces.  Random123 isunsupported.  See comments in nvccfeatures.h"
+// This test was added in Random123-1.08 (August, 2013) because we
+// discovered that Ftype(maxTvalue<T>()) with Ftype=double and
+// T=uint64_t in examples/uniform.hpp produces -1 for CUDA4.0 and
+// earlier.  We can't be sure this bug doesn't also affect invocations
+// of other templated functions, e.g., essentially all of Random123.
+// Thus, we no longer trust CUDA versions earlier than 4.1 even though
+// we had previously tested and timed Random123 with CUDA 3.x and 4.0.
+// If you feel lucky or desperate, you can change #error to #warning, but
+// please take extra care to be sure that you are getting correct
+// results.
+#endif
+
+// nvcc falls through to gcc or msvc.  So first define
+// a couple of things and then include either gccfeatures.h
+// or msvcfeatures.h
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE __device__
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 1
+#endif
+
+#ifndef R123_ASSERT
+#define R123_ASSERT(x) if((x)) ; else asm("trap;")
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) expr
+#endif
+
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+#ifndef R123_USE_SSE4_2
+#define R123_USE_SSE4_2 0
+#endif
+
+#ifndef R123_USE_SSE4_1
+#define R123_USE_SSE4_1 0
+#endif
+
+#ifndef R123_USE_SSE
+#define R123_USE_SSE 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_ULONG_LONG
+// uint64_t, which is what we'd get without this, is
+// not the same as unsigned long long
+#define R123_ULONG_LONG unsigned long long
+#endif
+
+#ifndef R123_THROW
+// No exceptions in CUDA, at least upto 4.0
+#define R123_THROW(x)    R123_ASSERT(0)
+#endif
+
+#if defined(__GNUC__)
+#include "gccfeatures.h"
+#elif defined(_MSC_FULL_VER)
+#include "msvcfeatures.h"
+#endif
+
+#endif
diff --git a/src/external/Random123-1.08/include/Random123/features/open64features.h b/src/external/Random123-1.08/include/Random123/features/open64features.h
new file mode 100644
index 0000000000..8da9f5f51e
--- /dev/null
+++ b/src/external/Random123-1.08/include/Random123/features/open64features.h
@@ -0,0 +1,50 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __open64features_dot_hpp
+#define __open64features_dot_hpp
+
+/* The gcc features are mostly right.  We just override a few and then include gccfeatures.h */
+
+/* Open64 4.2.3 and 4.2.4 accept the __uint128_t code without complaint
+   but produce incorrect code for 64-bit philox.  The MULHILO64_ASM
+   seems to work fine */
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 1
+#endif
+
+#include "gccfeatures.h"
+
+#endif
diff --git a/src/external/Random123-1.08/include/Random123/features/openclfeatures.h b/src/external/Random123-1.08/include/Random123/features/openclfeatures.h
new file mode 100644
index 0000000000..af03d30923
--- /dev/null
+++ b/src/external/Random123-1.08/include/Random123/features/openclfeatures.h
@@ -0,0 +1,89 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __openclfeatures_dot_hpp
+#define __openclfeatures_dot_hpp
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#define R123_ASSERT(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) expr
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 1
+#endif
+
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+// XXX ATI APP SDK 2.4 clBuildProgram SEGVs if one uses uint64_t instead of
+// ulong to mul_hi.  And gets lots of complaints from stdint.h
+// on some machines.
+// But these typedefs mean we cannot include stdint.h with
+// these headers?  Do we need R123_64T, R123_32T, R123_8T?
+typedef ulong uint64_t;
+typedef uint  uint32_t;
+typedef uchar uint8_t;
+#define UINT64_C(x) ((ulong)(x##UL))
+
+#endif
diff --git a/src/external/Random123-1.08/include/Random123/features/pgccfeatures.h b/src/external/Random123-1.08/include/Random123/features/pgccfeatures.h
new file mode 100644
index 0000000000..18ace1353b
--- /dev/null
+++ b/src/external/Random123-1.08/include/Random123/features/pgccfeatures.h
@@ -0,0 +1,194 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Copyright (c) 2013, Los Alamos National Security, LLC
+All rights reserved.
+
+Copyright 2013. Los Alamos National Security, LLC. This software was produced
+under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
+the U.S. Department of Energy. The U.S. Government has rights to use,
+reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS
+ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+to produce derivative works, such modified software should be clearly marked,
+so as not to confuse it with the version available from LANL.
+*/
+#ifndef __pgccfeatures_dot_hpp
+#define __pgccfeatures_dot_hpp
+
+#if !defined(__x86_64__) && !defined(__i386__)
+#  error "This code has only been tested on x86 platforms."
+#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
+{ /* maybe an unbalanced brace will terminate the compilation */
+ /* Feel free to try the Random123 library on other architectures by changing
+ the conditions that reach this error, but you should consider it a
+ porting exercise and expect to encounter bugs and deficiencies.
+ Please let the authors know of any successes (or failures). */
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static inline
+#endif
+
+/* Found this example in PGI's emmintrin.h. */
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) (expr)
+#endif
+
+/* PGI through 13.2 doesn't appear to support AES-NI. */
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+/* PGI through 13.2 appears to support MMX, SSE, SSE3, SSE3, SSSE3, SSE4a, and
+   ABM, but not SSE4.1 or SSE4.2. */
+#ifndef R123_USE_SSE4_2
+#define R123_USE_SSE4_2 0
+#endif
+
+#ifndef R123_USE_SSE4_1
+#define R123_USE_SSE4_1 0
+#endif
+
+#ifndef R123_USE_SSE
+/* There's no point in trying to compile SSE code in Random123
+   unless SSE2 is available. */
+#ifdef __SSE2__
+#define R123_USE_SSE 1
+#else
+#define R123_USE_SSE 0
+#endif
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+/* There isn't really a good way to tell at compile time whether
+   openssl is available.  Without a pre-compilation configure-like
+   tool, it's less error-prone to guess that it isn't available.  Add
+   -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
+   play with openssl */
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 1
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+/* emmintrin.h from PGI #includes xmmintrin.h but then complains at link time
+   about undefined references to _mm_castsi128_ps(__m128i).  Why? */
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 1
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#ifdef __ABM__
+#define R123_USE_INTRIN_H 1
+#else
+#define R123_USE_INTRIN_H 0
+#endif
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MULHI_INTRIN
+#define R123_USE_MULHILO64_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 1
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined.  You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+/* If you add something, it must go in all the other XXfeatures.hpp
+   and in ../ut_features.cpp */
+#endif
diff --git a/src/external/Random123-1.08/include/Random123/features/sse.h b/src/external/Random123-1.08/include/Random123/features/sse.h
new file mode 100644
index 0000000000..88efd65f10
--- /dev/null
+++ b/src/external/Random123-1.08/include/Random123/features/sse.h
@@ -0,0 +1,280 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _Random123_sse_dot_h__
+#define _Random123_sse_dot_h__
+
+#if R123_USE_SSE
+
+#if R123_USE_X86INTRIN_H
+#include <x86intrin.h>
+#endif
+#if R123_USE_IA32INTRIN_H
+#include <ia32intrin.h>
+#endif
+#if R123_USE_XMMINTRIN_H
+#include <xmmintrin.h>
+#endif
+#if R123_USE_EMMINTRIN_H
+#include <emmintrin.h>
+#endif
+#if R123_USE_SMMINTRIN_H
+#include <smmintrin.h>
+#endif
+#if R123_USE_WMMINTRIN_H
+#include <wmmintrin.h>
+#endif
+#if R123_USE_INTRIN_H
+#include <intrin.h>
+#endif
+#ifdef __cplusplus
+#include <iostream>
+#include <limits>
+#include <stdexcept>
+#endif
+
+#if R123_USE_ASM_GNU
+
+/* bit25 of CX tells us whether AES is enabled. */
+R123_STATIC_INLINE int haveAESNI(){
+    unsigned int eax, ebx, ecx, edx;
+    __asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) :
+                      "a" (1));
+    return (ecx>>25) & 1;
+}
+#elif R123_USE_CPUID_MSVC
+R123_STATIC_INLINE int haveAESNI(){
+    int CPUInfo[4];
+    __cpuid(CPUInfo, 1);
+    return (CPUInfo[2]>>25)&1;
+}
+#else /* R123_USE_CPUID_??? */
+#warning "No R123_USE_CPUID_XXX method chosen.  haveAESNI will always return false"
+R123_STATIC_INLINE int haveAESNI(){
+    return 0;
+}
+#endif /* R123_USE_ASM_GNU || R123_USE_CPUID_MSVC */
+
+// There is a lot of annoying and inexplicable variation in the
+// SSE intrinsics available in different compilation environments.
+// The details seem to depend on the compiler, the version and
+// the target architecture.  Rather than insisting on
+// R123_USE_feature tests for each of these in each of the
+// compilerfeatures.h files we just keep the complexity localized
+// to here...
+#if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64))
+/* Is there an intrinsic to assemble an __m128i from two 64-bit words? 
+   If not, use the 4x32-bit intrisic instead.  N.B.  It looks like Intel
+   added _mm_set_epi64x to icc version 12.1 in Jan 2012.
+*/
+R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){
+    union{
+        uint64_t u64;
+        uint32_t u32[2];
+    } u1, u0;
+    u1.u64 = v1;
+    u0.u64 = v0;
+    return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]);
+}
+#endif
+/* _mm_extract_lo64 abstracts the task of extracting the low 64-bit
+   word from an __m128i.  The _mm_cvtsi128_si64 intrinsic does the job
+   on 64-bit platforms.  Unfortunately, both MSVC and Open64 fail
+   assertions in ut_M128.cpp and ut_carray.cpp when we use the
+   _mm_cvtsi128_si64 intrinsic.  (See
+   https://bugs.open64.net/show_bug.cgi?id=873 for the Open64 bug).
+   On 32-bit platforms, there's no MOVQ, so there's no intrinsic.
+   Finally, even if the intrinsic exists, it may be spelled with or
+   without the 'x'.
+*/
+#if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
+R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
+    union{
+        uint64_t u64[2];
+        __m128i m;
+    }u;
+    _mm_store_si128(&u.m, si);
+    return u.u64[0];
+}
+#elif defined(__llvm__) || defined(__ICC)
+R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
+    return (uint64_t)_mm_cvtsi128_si64(si);
+}
+#else /* GNUC, others */
+/* FWIW, gcc's emmintrin.h has had the 'x' spelling
+   since at least gcc-3.4.4.  The no-'x' spelling showed up
+   around 4.2. */
+R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
+    return (uint64_t)_mm_cvtsi128_si64x(si);
+}
+#endif
+#if defined(__GNUC__) && __GNUC__ < 4
+/* the cast builtins showed up in gcc4. */
+R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){
+    return (__m128)si;
+}
+#endif
+
+#ifdef __cplusplus
+
+struct r123m128i{
+    __m128i m;
+#if R123_USE_CXX11_UNRESTRICTED_UNIONS
+    // C++98 forbids a union member from having *any* constructors.
+    // C++11 relaxes this, and allows union members to have constructors
+    // as long as there is a "trivial" default construtor.  So in C++11
+    // we can provide a r123m128i constructor with an __m128i argument, and still
+    // have the default (and hence trivial) default constructor.
+    r123m128i() = default;
+    r123m128i(__m128i _m): m(_m){}
+#endif
+    r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;}
+    r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;}
+#if R123_USE_CXX11_EXPLICIT_CONVERSIONS
+    // With C++0x we can attach explicit to the bool conversion operator
+    // to disambiguate undesired promotions.  For g++, this works
+    // only in 4.5 and above.
+    explicit operator bool() const {return _bool();}
+#else
+    // Pre-C++0x, we have to do something else.  Google for the "safe bool"
+    // idiom for other ideas...
+    operator const void*() const{return _bool()?this:0;}
+#endif
+    operator __m128i() const {return m;}
+
+private:
+#if R123_USE_SSE4_1
+    bool _bool() const{ return !_mm_testz_si128(m,m); }
+#else
+    bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
+#endif
+};
+
+R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){
+    __m128i& c = v.m;
+    __m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1));
+    c = _mm_add_epi64(c, zeroone);
+    //return c;
+#if R123_USE_SSE4_1
+    __m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0)));
+    if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){
+        __m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0));
+        c = _mm_add_epi64(c, onezero);
+    }
+#else
+    unsigned mask  = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128())));
+    // The low two bits of mask are 11 iff the low 64 bits of
+    // c are zero.
+    if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){
+        __m128i onezero = _mm_set_epi64x(1,0);
+        c = _mm_add_epi64(c, onezero);
+    }
+#endif
+    return v;
+}
+
+R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){ 
+    __m128i c = lhs.m;
+    __m128i incr128 = _mm_set_epi64x(0, n);
+    c = _mm_add_epi64(c, incr128);
+    // return c;     // NO CARRY!  
+
+    int64_t lo64 = _mm_extract_lo64(c);
+    if((uint64_t)lo64 < n)
+        c = _mm_add_epi64(c, _mm_set_epi64x(1,0));
+    lhs.m = c;
+    return lhs; 
+}
+
+// We need this one because it's present, but never used in r123array1xm128i::incr
+R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i &){
+    throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");}
+
+// The comparisons aren't implemented, but if we leave them out, and 
+// somebody writes, e.g., M1 < M2, the compiler will do an implicit
+// conversion through void*.  Sigh...
+R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&){
+    throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");}
+R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&){
+    throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");}
+R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&){
+    throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");}
+R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&){
+    throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");}
+
+R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){ 
+    return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); }
+R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){ 
+    return !(lhs==rhs);}
+R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){
+    r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; }
+R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){
+    return !(lhs==rhs);}
+R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){
+    union{
+        uint64_t u64[2];
+        __m128i m;
+    }u;
+    _mm_storeu_si128(&u.m, m.m);
+    return os << u.u64[0] << " " << u.u64[1];
+}
+
+R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){
+    uint64_t u64[2];
+    is >> u64[0] >> u64[1];
+    m.m = _mm_set_epi64x(u64[1], u64[0]);
+    return is;
+}
+
+template<typename T> inline T assemble_from_u32(uint32_t *p32); // forward declaration
+
+template <>
+inline r123m128i assemble_from_u32<r123m128i>(uint32_t *p32){
+    r123m128i ret;
+    ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]);
+    return ret;
+}
+
+#else
+
+typedef struct {
+    __m128i m;
+} r123m128i;
+
+#endif /* __cplusplus */
+
+#else /* !R123_USE_SSE */
+R123_STATIC_INLINE int haveAESNI(){
+    return 0;
+}
+#endif /* R123_USE_SSE */
+
+#endif /* _Random123_sse_dot_h__ */
diff --git a/src/external/Random123-1.08/include/Random123/features/sunprofeatures.h b/src/external/Random123-1.08/include/Random123/features/sunprofeatures.h
new file mode 100644
index 0000000000..c9cdc00f5e
--- /dev/null
+++ b/src/external/Random123-1.08/include/Random123/features/sunprofeatures.h
@@ -0,0 +1,172 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __sunprofeatures_dot_hpp
+#define __sunprofeatures_dot_hpp
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) expr
+#endif
+
+// The basic idiom is:
+// #ifndef R123_SOMETHING
+// #if some condition
+// #define R123_SOMETHING 1
+// #else
+// #define R123_SOMETHING 0
+// #endif
+// #endif
+// This idiom allows an external user to override any decision
+// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHINE=0
+
+// An alternative idiom is:
+// #ifndef R123_SOMETHING
+// #define R123_SOMETHING (some boolean expression)
+// #endif
+// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE
+// pp-symbols.
+
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+#ifndef R123_USE_SSE4_2
+#define R123_USE_SSE4_2 0
+#endif
+
+#ifndef R123_USE_SSE4_1
+#define R123_USE_SSE4_1 0
+#endif
+
+#ifndef R123_USE_SSE
+#define R123_USE_SSE 0
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 0
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#define R123_USE_INTRIN_H 0
+#endif
+
+#ifndef R123_USE_MULHILO16_ASM
+#define R123_USE_MULHILO16_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef R123_USE_PHILOX_64BIT
+#define R123_USE_PHILOX_64BIT 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined.  You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+// If you add something, it must go in all the other XXfeatures.hpp
+// and in ../ut_features.cpp
+#endif
diff --git a/src/external/Random123-1.08/include/Random123/features/xlcfeatures.h b/src/external/Random123-1.08/include/Random123/features/xlcfeatures.h
new file mode 100644
index 0000000000..a5c8412a44
--- /dev/null
+++ b/src/external/Random123-1.08/include/Random123/features/xlcfeatures.h
@@ -0,0 +1,202 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Copyright (c) 2013, Los Alamos National Security, LLC
+All rights reserved.
+
+Copyright 2013. Los Alamos National Security, LLC. This software was produced
+under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
+the U.S. Department of Energy. The U.S. Government has rights to use,
+reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS
+ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+to produce derivative works, such modified software should be clearly marked,
+so as not to confuse it with the version available from LANL.
+*/
+#ifndef __xlcfeatures_dot_hpp
+#define __xlcfeatures_dot_hpp
+
+#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__)
+#  error "This code has only been tested on x86 and PowerPC platforms."
+#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
+{ /* maybe an unbalanced brace will terminate the compilation */
+ /* Feel free to try the Random123 library on other architectures by changing
+ the conditions that reach this error, but you should consider it a
+ porting exercise and expect to encounter bugs and deficiencies.
+ Please let the authors know of any successes (or failures). */
+#endif
+
+#ifdef __cplusplus
+/* builtins are automatically available to xlc.  To use them with xlc++,
+   one must include builtins.h.   c.f
+   http://publib.boulder.ibm.com/infocenter/cellcomp/v101v121/index.jsp?topic=/com.ibm.xlcpp101.cell.doc/compiler_ref/compiler_builtins.html
+*/
+#include <builtins.h>
+#endif
+
+#ifndef R123_STATIC_INLINE
+#define R123_STATIC_INLINE static inline
+#endif
+
+#ifndef R123_FORCE_INLINE
+#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
+#endif
+
+#ifndef R123_CUDA_DEVICE
+#define R123_CUDA_DEVICE
+#endif
+
+#ifndef R123_ASSERT
+#include <assert.h>
+#define R123_ASSERT(x) assert(x)
+#endif
+
+#ifndef R123_BUILTIN_EXPECT
+#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
+#endif
+
+#ifndef R123_USE_AES_NI
+#define R123_USE_AES_NI 0
+#endif
+
+#ifndef R123_USE_SSE4_2
+#define R123_USE_SSE4_2 0
+#endif
+
+#ifndef R123_USE_SSE4_1
+#define R123_USE_SSE4_1 0
+#endif
+
+#ifndef R123_USE_SSE
+#define R123_USE_SSE 0
+#endif
+
+#ifndef R123_USE_AES_OPENSSL
+/* There isn't really a good way to tell at compile time whether
+   openssl is available.  Without a pre-compilation configure-like
+   tool, it's less error-prone to guess that it isn't available.  Add
+   -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
+   play with openssl */
+#define R123_USE_AES_OPENSSL 0
+#endif
+
+#ifndef R123_USE_GNU_UINT128
+#define R123_USE_GNU_UINT128 0
+#endif
+
+#ifndef R123_USE_ASM_GNU
+#define R123_USE_ASM_GNU 1
+#endif
+
+#ifndef R123_USE_CPUID_MSVC
+#define R123_USE_CPUID_MSVC 0
+#endif
+
+#ifndef R123_USE_X86INTRIN_H
+#define R123_USE_X86INTRIN_H 0
+#endif
+
+#ifndef R123_USE_IA32INTRIN_H
+#define R123_USE_IA32INTRIN_H 0
+#endif
+
+#ifndef R123_USE_XMMINTRIN_H
+#define R123_USE_XMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_EMMINTRIN_H
+#define R123_USE_EMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_SMMINTRIN_H
+#define R123_USE_SMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_WMMINTRIN_H
+#define R123_USE_WMMINTRIN_H 0
+#endif
+
+#ifndef R123_USE_INTRIN_H
+#ifdef __ABM__
+#define R123_USE_INTRIN_H 1
+#else
+#define R123_USE_INTRIN_H 0
+#endif
+#endif
+
+#ifndef R123_USE_MULHILO32_ASM
+#define R123_USE_MULHILO32_ASM 0
+#endif
+
+#ifndef R123_USE_MULHILO64_MULHI_INTRIN
+#define R123_USE_MULHILO64_MULHI_INTRIN (defined(__powerpc64__))
+#endif
+
+#ifndef R123_MULHILO64_MULHI_INTRIN
+#define R123_MULHILO64_MULHI_INTRIN __mulhdu
+#endif
+
+#ifndef R123_USE_MULHILO32_MULHI_INTRIN
+#define R123_USE_MULHILO32_MULHI_INTRIN 0
+#endif
+
+#ifndef R123_MULHILO32_MULHI_INTRIN
+#define R123_MULHILO32_MULHI_INTRIN __mulhwu
+#endif
+
+#ifndef R123_USE_MULHILO64_ASM
+#define R123_USE_MULHILO64_ASM (defined(__powerpc64__) && !(R123_USE_MULHILO64_MULHI_INTRIN))
+#endif
+
+#ifndef R123_USE_MULHILO64_MSVC_INTRIN
+#define R123_USE_MULHILO64_MSVC_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_CUDA_INTRIN
+#define R123_USE_MULHILO64_CUDA_INTRIN 0
+#endif
+
+#ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#include <stdint.h>
+#ifndef UINT64_C
+#error UINT64_C not defined.  You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
+#endif
+
+/* If you add something, it must go in all the other XXfeatures.hpp
+   and in ../ut_features.cpp */
+#endif
diff --git a/src/external/Random123-1.08/include/Random123/threefry.h b/src/external/Random123-1.08/include/Random123/threefry.h
new file mode 100644
index 0000000000..da2de979c6
--- /dev/null
+++ b/src/external/Random123-1.08/include/Random123/threefry.h
@@ -0,0 +1,864 @@
+/*
+Copyright 2010-2011, D. E. Shaw Research.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of D. E. Shaw Research nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _threefry_dot_h_
+#define _threefry_dot_h_
+#include "features/compilerfeatures.h"
+#include "array.h"
+
+/** \cond HIDDEN_FROM_DOXYGEN */
+/* Significant parts of this file were copied from
+   from:
+      Skein_FinalRnd/ReferenceImplementation/skein.h
+      Skein_FinalRnd/ReferenceImplementation/skein_block.c
+
+   in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip
+
+   This file has been modified so that it may no longer perform its originally
+   intended function.  If you're looking for a Skein or Threefish source code,
+   please consult the original file.
+
+   The original file had the following header:
+**************************************************************************
+**
+** Interface declarations and internal definitions for Skein hashing.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+***************************************************************************
+
+*/
+
+/* See comment at the top of philox.h for the macro pre-process
+   strategy. */
+
+/* Rotation constants: */
+enum r123_enum_threefry64x4 {
+    /* These are the R_256 constants from the Threefish reference sources
+       with names changed to R_64x4... */
+    R_64x4_0_0=14, R_64x4_0_1=16,
+    R_64x4_1_0=52, R_64x4_1_1=57,
+    R_64x4_2_0=23, R_64x4_2_1=40,
+    R_64x4_3_0= 5, R_64x4_3_1=37,
+    R_64x4_4_0=25, R_64x4_4_1=33,
+    R_64x4_5_0=46, R_64x4_5_1=12,
+    R_64x4_6_0=58, R_64x4_6_1=22,
+    R_64x4_7_0=32, R_64x4_7_1=32
+};
+
+enum r123_enum_threefry64x2 {
+    /*
+    // Output from skein_rot_search: (srs64_B64-X1000)
+    // Random seed = 1. BlockSize = 128 bits. sampleCnt =  1024. rounds =  8, minHW_or=57
+    // Start: Tue Mar  1 10:07:48 2011
+    // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format   
+    */
+    R_64x2_0_0=16,
+    R_64x2_1_0=42,
+    R_64x2_2_0=12,
+    R_64x2_3_0=31,
+    R_64x2_4_0=16,
+    R_64x2_5_0=32,
+    R_64x2_6_0=24,
+    R_64x2_7_0=21
+    /* 4 rounds: minHW =  4  [  4  4  4  4 ]
+    // 5 rounds: minHW =  8  [  8  8  8  8 ]
+    // 6 rounds: minHW = 16  [ 16 16 16 16 ]
+    // 7 rounds: minHW = 32  [ 32 32 32 32 ]
+    // 8 rounds: minHW = 64  [ 64 64 64 64 ]
+    // 9 rounds: minHW = 64  [ 64 64 64 64 ]
+    //10 rounds: minHW = 64  [ 64 64 64 64 ]
+    //11 rounds: minHW = 64  [ 64 64 64 64 ] */
+};
+
+enum r123_enum_threefry32x4 {
+    /* Output from skein_rot_search: (srs-B128-X5000.out)
+    // Random seed = 1. BlockSize = 64 bits. sampleCnt =  1024. rounds =  8, minHW_or=28
+    // Start: Mon Aug 24 22:41:36 2009
+    // ...
+    // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format    */
+    R_32x4_0_0=10, R_32x4_0_1=26,
+    R_32x4_1_0=11, R_32x4_1_1=21,
+    R_32x4_2_0=13, R_32x4_2_1=27,
+    R_32x4_3_0=23, R_32x4_3_1= 5,
+    R_32x4_4_0= 6, R_32x4_4_1=20,
+    R_32x4_5_0=17, R_32x4_5_1=11,
+    R_32x4_6_0=25, R_32x4_6_1=10,
+    R_32x4_7_0=18, R_32x4_7_1=20
+
+    /* 4 rounds: minHW =  3  [  3  3  3  3 ]
+    // 5 rounds: minHW =  7  [  7  7  7  7 ]
+    // 6 rounds: minHW = 12  [ 13 12 13 12 ]
+    // 7 rounds: minHW = 22  [ 22 23 22 23 ]
+    // 8 rounds: minHW = 31  [ 31 31 31 31 ]
+    // 9 rounds: minHW = 32  [ 32 32 32 32 ]
+    //10 rounds: minHW = 32  [ 32 32 32 32 ]
+    //11 rounds: minHW = 32  [ 32 32 32 32 ] */
+
+};
+
+enum r123_enum_threefry32x2 {
+    /* Output from skein_rot_search (srs32x2-X5000.out)
+    // Random seed = 1. BlockSize = 64 bits. sampleCnt =  1024. rounds =  8, minHW_or=28
+    // Start: Tue Jul 12 11:11:33 2011
+    // rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize=  64].format   */
+    R_32x2_0_0=13,
+    R_32x2_1_0=15,
+    R_32x2_2_0=26,
+    R_32x2_3_0= 6,
+    R_32x2_4_0=17,
+    R_32x2_5_0=29,
+    R_32x2_6_0=16,
+    R_32x2_7_0=24
+
+    /* 4 rounds: minHW =  4  [  4  4  4  4 ]
+    // 5 rounds: minHW =  6  [  6  8  6  8 ]
+    // 6 rounds: minHW =  9  [  9 12  9 12 ]
+    // 7 rounds: minHW = 16  [ 16 24 16 24 ]
+    // 8 rounds: minHW = 32  [ 32 32 32 32 ]
+    // 9 rounds: minHW = 32  [ 32 32 32 32 ]
+    //10 rounds: minHW = 32  [ 32 32 32 32 ]
+    //11 rounds: minHW = 32  [ 32 32 32 32 ] */
+    };
+
+enum r123_enum_threefry_wcnt {
+    WCNT2=2,
+    WCNT4=4
+};
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N));
+R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N)
+{
+    return (x << (N & 63)) | (x >> ((64-N) & 63));
+}
+    
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N));
+R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N)
+{
+    return (x << (N & 31)) | (x >> ((32-N) & 31));
+}
+
+#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((uint64_t) (hi32)) << 32))
+#define SKEIN_KS_PARITY64         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
+#define SKEIN_KS_PARITY32         0x1BD11BDA
+
+#ifndef THREEFRY2x32_DEFAULT_ROUNDS
+#define THREEFRY2x32_DEFAULT_ROUNDS 20
+#endif
+
+#ifndef THREEFRY2x64_DEFAULT_ROUNDS
+#define THREEFRY2x64_DEFAULT_ROUNDS 20
+#endif
+
+#ifndef THREEFRY4x32_DEFAULT_ROUNDS
+#define THREEFRY4x32_DEFAULT_ROUNDS 20
+#endif
+
+#ifndef THREEFRY4x64_DEFAULT_ROUNDS
+#define THREEFRY4x64_DEFAULT_ROUNDS 20
+#endif
+
+#define _threefry2x_tpl(W)                                              \
+typedef struct r123array2x##W threefry2x##W##_ctr_t;                          \
+typedef struct r123array2x##W threefry2x##W##_key_t;                          \
+typedef struct r123array2x##W threefry2x##W##_ukey_t;                          \
+R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
+R123_CUDA_DEVICE R123_STATIC_INLINE                                          \
+threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
+    threefry2x##W##_ctr_t X;                                              \
+    uint##W##_t ks[2+1];                                          \
+    int  i; /* avoid size_t to avoid need for stddef.h */                   \
+    R123_ASSERT(Nrounds<=32);                                           \
+    ks[2] =  SKEIN_KS_PARITY##W;                                   \
+    for (i=0;i < 2; i++)                                        \
+        {                                                               \
+            ks[i] = k.v[i];                                             \
+            X.v[i]  = in.v[i];                                          \
+            ks[2] ^= k.v[i];                                    \
+        }                                                               \
+                                                                        \
+    /* Insert initial key before round 0 */                             \
+    X.v[0] += ks[0]; X.v[1] += ks[1];                                   \
+                                                                        \
+    if(Nrounds>0){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>1){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>2){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>3){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>3){                                                      \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[1]; X.v[1] += ks[2];                               \
+        X.v[1] += 1;     /* X.v[2-1] += r  */                   \
+    }                                                                   \
+    if(Nrounds>4){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>5){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>6){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>7){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>7){                                                      \
+        /* InjectKey(r=2) */                                            \
+        X.v[0] += ks[2]; X.v[1] += ks[0];                               \
+        X.v[1] += 2;                                                    \
+    }                                                                   \
+    if(Nrounds>8){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>9){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>10){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>11){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>11){                                                     \
+        /* InjectKey(r=3) */                                            \
+        X.v[0] += ks[0]; X.v[1] += ks[1];                               \
+        X.v[1] += 3;                                                    \
+    }                                                                   \
+    if(Nrounds>12){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>13){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>14){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>15){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>15){                                                     \
+        /* InjectKey(r=4) */                                            \
+        X.v[0] += ks[1]; X.v[1] += ks[2];                               \
+        X.v[1] += 4;                                                    \
+    }                                                                   \
+    if(Nrounds>16){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>17){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>18){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>19){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>19){                                                     \
+        /* InjectKey(r=5) */                                            \
+        X.v[0] += ks[2]; X.v[1] += ks[0];                               \
+        X.v[1] += 5;                                                    \
+    }                                                                   \
+    if(Nrounds>20){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>21){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>22){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>23){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>23){                                                     \
+        /* InjectKey(r=6) */                                            \
+        X.v[0] += ks[0]; X.v[1] += ks[1];                               \
+        X.v[1] += 6;                                                    \
+    }                                                                   \
+    if(Nrounds>24){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>25){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>26){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>27){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>27){                                                     \
+        /* InjectKey(r=7) */                                            \
+        X.v[0] += ks[1]; X.v[1] += ks[2];                               \
+        X.v[1] += 7;                                                    \
+    }                                                                   \
+    if(Nrounds>28){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>29){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>30){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>31){  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
+    if(Nrounds>31){                                                     \
+        /* InjectKey(r=8) */                                            \
+        X.v[0] += ks[2]; X.v[1] += ks[0];                               \
+        X.v[1] += 8;                                                    \
+    }                                                                   \
+    return X;                                                           \
+}                                                                       \
+ /** @ingroup ThreefryNxW */                                            \
+enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS };       \
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
+R123_CUDA_DEVICE R123_STATIC_INLINE                                     \
+threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
+    return threefry2x##W##_R(threefry2x##W##_rounds, in, k);            \
+}
+
+
+#define _threefry4x_tpl(W)                                              \
+typedef struct r123array4x##W threefry4x##W##_ctr_t;                        \
+typedef struct r123array4x##W threefry4x##W##_key_t;                        \
+typedef struct r123array4x##W threefry4x##W##_ukey_t;                        \
+R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
+R123_CUDA_DEVICE R123_STATIC_INLINE                                          \
+threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
+    threefry4x##W##_ctr_t X;                                            \
+    uint##W##_t ks[4+1];                                            \
+    int  i; /* avoid size_t to avoid need for stddef.h */                   \
+    R123_ASSERT(Nrounds<=72);                                           \
+    ks[4] =  SKEIN_KS_PARITY##W;                                    \
+    for (i=0;i < 4; i++)                                            \
+        {                                                               \
+            ks[i] = k.v[i];                                             \
+            X.v[i]  = in.v[i];                                          \
+            ks[4] ^= k.v[i];                                        \
+        }                                                               \
+                                                                        \
+    /* Insert initial key before round 0 */                             \
+    X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
+                                                                        \
+    if(Nrounds>0){                                                      \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>1){                                                      \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>2){                                                      \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>3){                                                      \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>3){                                                      \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
+        X.v[4-1] += 1;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>4){                                                      \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>5){                                                      \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>6){                                                      \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>7){                                                      \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>7){                                                      \
+        /* InjectKey(r=2) */                                            \
+        X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
+        X.v[4-1] += 2;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>8){                                                      \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>9){                                                      \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>10){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>11){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>11){                                                     \
+        /* InjectKey(r=3) */                                            \
+        X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
+        X.v[4-1] += 3;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>12){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>13){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>14){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>15){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>15){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
+        X.v[4-1] += 4;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>16){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>17){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>18){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>19){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>19){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
+        X.v[4-1] += 5;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>20){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>21){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>22){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>23){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>23){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
+        X.v[4-1] += 6;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>24){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>25){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>26){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>27){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>27){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
+        X.v[4-1] += 7;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>28){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>29){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>30){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>31){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>31){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
+        X.v[4-1] += 8;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>32){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>33){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>34){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>35){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>35){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
+        X.v[4-1] += 9;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>36){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>37){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>38){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>39){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>39){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
+        X.v[4-1] += 10;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>40){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>41){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>42){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>43){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>43){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
+        X.v[4-1] += 11;     /* X.v[WCNT4-1] += r  */                \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>44){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>45){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>46){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>47){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>47){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
+        X.v[4-1] += 12;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>48){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>49){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>50){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>51){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>51){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
+        X.v[4-1] += 13;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>52){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>53){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>54){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>55){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>55){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
+        X.v[4-1] += 14;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>56){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>57){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>58){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>59){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>59){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
+        X.v[4-1] += 15;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>60){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>61){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>62){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>63){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>63){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
+        X.v[4-1] += 16;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>64){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>65){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>66){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>67){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>67){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
+        X.v[4-1] += 17;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    if(Nrounds>68){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>69){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>70){                                                     \
+        X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
+        X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>71){                                                     \
+        X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
+        X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
+    }                                                                   \
+    if(Nrounds>71){                                                     \
+        /* InjectKey(r=1) */                                            \
+        X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
+        X.v[4-1] += 18;     /* X.v[WCNT4-1] += r  */                 \
+    }                                                                   \
+                                                                        \
+    return X;                                                           \
+}                                                                       \
+ /** @ingroup ThreefryNxW */                                            \
+enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS };       \
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
+R123_CUDA_DEVICE R123_STATIC_INLINE                                     \
+threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
+    return threefry4x##W##_R(threefry4x##W##_rounds, in, k);            \
+}
+/** \endcond */
+
+_threefry2x_tpl(64)
+_threefry2x_tpl(32)
+_threefry4x_tpl(64)
+_threefry4x_tpl(32)
+
+/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better
+   than a static inline function.  Why?  */
+#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k)
+#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k)
+#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k)
+#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k)
+
+#ifdef __cplusplus
+/** \cond HIDDEN_FROM_DOXYGEN */
+#define _threefryNxWclass_tpl(NxW)                                      \
+namespace r123{                                                     \
+template<unsigned int R>                                                  \
+ struct Threefry##NxW##_R{                                              \
+    typedef threefry##NxW##_ctr_t ctr_type;                             \
+    typedef threefry##NxW##_key_t key_type;                             \
+    typedef threefry##NxW##_key_t ukey_type;                            \
+    static const unsigned int rounds=R;                                 \
+   inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \
+        R123_STATIC_ASSERT(R<=72, "threefry is only unrolled up to 72 rounds\n"); \
+        return threefry##NxW##_R(R, ctr, key);                              \
+    }                                                                   \
+};                                                                      \
+ typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW;       \
+} // namespace r123
+
+/** \endcond */
+
+_threefryNxWclass_tpl(2x32)
+_threefryNxWclass_tpl(4x32)
+_threefryNxWclass_tpl(2x64)
+_threefryNxWclass_tpl(4x64)
+
+/* The _tpl macros don't quite work to do string-pasting inside comments.
+   so we just write out the boilerplate documentation four times... */
+
+/** 
+@defgroup ThreefryNxW Threefry Classes and Typedefs
+
+The ThreefryNxW classes export the member functions, typedefs and
+operator overloads required by a @ref CBRNG "CBRNG" class.
+
+As described in  
+<a href="http://dl.acm.org/citation.cfm?doid=2063405"><i>Parallel Random Numbers:  As Easy as 1, 2, 3</i> </a>, 
+the Threefry family is closely related to the Threefish block cipher from
+<a href="http://www.skein-hash.info/"> Skein Hash Function</a>.  
+Threefry is \b not suitable for cryptographic use.
+
+Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output.
+
+@class r123::Threefry2x32_R 
+@ingroup ThreefryNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Threefry round
+function will be applied.
+
+As of September 2011, the authors know of no statistical flaws with
+ROUNDS=13 or more for Threefry2x32.
+
+@typedef r123::Threefry2x32
+@ingroup ThreefryNxW
+  Threefry2x32 is equivalent to Threefry2x32_R<20>.    With 20 rounds,
+  Threefry2x32 has a considerable safety margin over the minimum number
+  of rounds with no known statistical flaws, but still has excellent
+   performance. 
+
+@class r123::Threefry2x64_R 
+@ingroup ThreefryNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Threefry round
+function will be applied.
+
+In November 2011, the authors discovered that 13 rounds of
+Threefry2x64 sequenced by strided, interleaved key and counter
+increments failed a very long (longer than the default BigCrush
+length) WeightDistrub test.  At the same time, it was confirmed that
+14 rounds passes much longer tests (up to 5x10^12 samples) of a
+similar nature.  The authors know of no statistical flaws with
+ROUNDS=14 or more for Threefry2x64.
+
+@typedef r123::Threefry2x64
+@ingroup ThreefryNxW
+  Threefry2x64 is equivalent to Threefry2x64_R<20>.    With 20 rounds,
+  Threefry2x64 has a considerable safety margin over the minimum number
+  of rounds with no known statistical flaws, but still has excellent
+   performance. 
+
+
+
+@class r123::Threefry4x32_R 
+@ingroup ThreefryNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Threefry round
+function will be applied.
+
+As of September 2011, the authors know of no statistical flaws with
+ROUNDS=12 or more for Threefry4x32.
+
+@typedef r123::Threefry4x32
+@ingroup ThreefryNxW
+  Threefry4x32 is equivalent to Threefry4x32_R<20>.    With 20 rounds,
+  Threefry4x32 has a considerable safety margin over the minimum number
+  of rounds with no known statistical flaws, but still has excellent
+   performance. 
+
+
+
+@class r123::Threefry4x64_R 
+@ingroup ThreefryNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Threefry round
+function will be applied.
+
+As of September 2011, the authors know of no statistical flaws with
+ROUNDS=12 or more for Threefry4x64.
+
+@typedef r123::Threefry4x64
+@ingroup ThreefryNxW
+  Threefry4x64 is equivalent to Threefry4x64_R<20>.    With 20 rounds,
+  Threefry4x64 has a considerable safety margin over the minimum number
+  of rounds with no known statistical flaws, but still has excellent
+   performance. 
+*/
+
+#endif
+
+#endif
diff --git a/src/gromacs/fileio/trajectory_writing.c b/src/gromacs/fileio/trajectory_writing.c
index 813c5b8036..3cea7d40f9 100644
--- a/src/gromacs/fileio/trajectory_writing.c
+++ b/src/gromacs/fileio/trajectory_writing.c
@@ -61,14 +61,12 @@ do_md_trajectory_writing(FILE           *fplog,
                          t_state        *state_global,
                          gmx_mtop_t     *top_global,
                          t_forcerec     *fr,
-                         gmx_update_t    upd,
                          gmx_mdoutf_t    outf,
                          t_mdebin       *mdebin,
                          gmx_ekindata_t *ekind,
                          rvec           *f,
                          rvec           *f_global,
                          gmx_wallcycle_t wcycle,
-                         gmx_rng_t       mcrng,
                          int            *nchkpt,
                          gmx_bool        bCPT,
                          gmx_bool        bRerunMD,
@@ -131,14 +129,6 @@ do_md_trajectory_writing(FILE           *fplog,
         wallcycle_start(wcycle, ewcTRAJ);
         if (bCPT)
         {
-            if (state->flags & (1<<estLD_RNG))
-            {
-                get_stochd_state(upd, state);
-            }
-            if (state->flags  & (1<<estMC_RNG))
-            {
-                get_mc_state(mcrng, state);
-            }
             if (MASTER(cr))
             {
                 if (bSumEkinhOld)
diff --git a/src/gromacs/fileio/trajectory_writing.h b/src/gromacs/fileio/trajectory_writing.h
index 1bfc38d9ba..b4355bad13 100644
--- a/src/gromacs/fileio/trajectory_writing.h
+++ b/src/gromacs/fileio/trajectory_writing.h
@@ -43,7 +43,6 @@
 #include "mdoutf.h"
 #include "../legacyheaders/types/simple.h"
 #include "../legacyheaders/types/commrec.h"
-#include "../legacyheaders/update.h"
 #include "../legacyheaders/mdebin.h"
 
 /*! \brief Wrapper routine for writing trajectories during mdrun
@@ -63,14 +62,12 @@ do_md_trajectory_writing(FILE           *fplog,
                          t_state        *state_global,
                          gmx_mtop_t     *top_global,
                          t_forcerec     *fr,
-                         gmx_update_t    upd,
                          gmx_mdoutf_t    outf,
                          t_mdebin       *mdebin,
                          gmx_ekindata_t *ekind,
                          rvec           *f,
                          rvec           *f_global,
                          gmx_wallcycle_t wcycle,
-                         gmx_rng_t       mcrng,
                          int            *nchkpt,
                          gmx_bool        bCPT,
                          gmx_bool        bRerunMD,
diff --git a/src/gromacs/gmxlib/checkpoint.c b/src/gromacs/gmxlib/checkpoint.c
index 731f60614d..438b8c30af 100644
--- a/src/gromacs/gmxlib/checkpoint.c
+++ b/src/gromacs/gmxlib/checkpoint.c
@@ -65,7 +65,6 @@
 #include "txtdump.h"
 #include "vec.h"
 #include "network.h"
-#include "gromacs/random/random.h"
 #include "checkpoint.h"
 #include "main.h"
 #include "string2.h"
@@ -983,10 +982,9 @@ static int do_cpt_footer(XDR *xd, int file_version)
 
 static int do_cpt_state(XDR *xd, gmx_bool bRead,
                         int fflags, t_state *state,
-                        gmx_bool bReadRNG, FILE *list)
+                        FILE *list)
 {
     int    sflags;
-    int  **rng_p, **rngi_p;
     int    i;
     int    ret;
     int    nnht, nnhtp;
@@ -996,25 +994,11 @@ static int do_cpt_state(XDR *xd, gmx_bool bRead,
     nnht  = state->nhchainlength*state->ngtc;
     nnhtp = state->nhchainlength*state->nnhpres;
 
-    if (bReadRNG)
-    {
-        rng_p  = (int **)&state->ld_rng;
-        rngi_p = &state->ld_rngi;
-    }
-    else
-    {
-        /* Do not read the RNG data */
-        rng_p  = NULL;
-        rngi_p = NULL;
-    }
-
     if (bRead) /* we need to allocate space for dfhist if we are reading */
     {
         init_df_history(&state->dfhist, state->dfhist.nlambda);
     }
 
-    /* We want the MC_RNG the same across all the notes for now -- lambda MC is global */
-
     sflags = state->flags;
     for (i = 0; (i < estNR && ret == 0); i++)
     {
@@ -1040,10 +1024,13 @@ static int do_cpt_state(XDR *xd, gmx_bool bRead,
                 case estX:       ret      = do_cpte_rvecs(xd, cptpEST, i, sflags, state->natoms, &state->x, list); break;
                 case estV:       ret      = do_cpte_rvecs(xd, cptpEST, i, sflags, state->natoms, &state->v, list); break;
                 case estSDX:     ret      = do_cpte_rvecs(xd, cptpEST, i, sflags, state->natoms, &state->sd_X, list); break;
-                case estLD_RNG:  ret      = do_cpte_ints(xd, cptpEST, i, sflags, state->nrng, rng_p, list); break;
-                case estLD_RNGI: ret      = do_cpte_ints(xd, cptpEST, i, sflags, state->nrngi, rngi_p, list); break;
-                case estMC_RNG:  ret      = do_cpte_ints(xd, cptpEST, i, sflags, state->nmcrng, (int **)&state->mc_rng, list); break;
-                case estMC_RNGI: ret      = do_cpte_ints(xd, cptpEST, i, sflags, 1, &state->mc_rngi, list); break;
+                /* The RNG entries are no longer written,
+                 * the next 4 lines are only for reading old files.
+                 */
+                case estLD_RNG:  ret      = do_cpte_ints(xd, cptpEST, i, sflags, 0, NULL, list); break;
+                case estLD_RNGI: ret      = do_cpte_ints(xd, cptpEST, i, sflags, 0, NULL, list); break;
+                case estMC_RNG:  ret      = do_cpte_ints(xd, cptpEST, i, sflags, 0, NULL, list); break;
+                case estMC_RNGI: ret      = do_cpte_ints(xd, cptpEST, i, sflags, 0, NULL, list); break;
                 case estDISRE_INITF:  ret = do_cpte_real (xd, cptpEST, i, sflags, &state->hist.disre_initf, list); break;
                 case estDISRE_RM3TAV: ret = do_cpte_n_reals(xd, cptpEST, i, sflags, &state->hist.ndisrepairs, &state->hist.disre_rm3tav, list); break;
                 case estORIRE_INITF:  ret = do_cpte_real (xd, cptpEST, i, sflags, &state->hist.orire_initf, list); break;
@@ -1654,7 +1641,7 @@ void write_checkpoint(const char *fn, gmx_bool bNumberAndKeep,
     sfree(bhost);
     sfree(fprog);
 
-    if ((do_cpt_state(gmx_fio_getxdr(fp), FALSE, state->flags, state, TRUE, NULL) < 0)        ||
+    if ((do_cpt_state(gmx_fio_getxdr(fp), FALSE, state->flags, state, NULL) < 0)        ||
         (do_cpt_ekinstate(gmx_fio_getxdr(fp), flags_eks, &state->ekinstate, NULL) < 0) ||
         (do_cpt_enerhist(gmx_fio_getxdr(fp), FALSE, flags_enh, &state->enerhist, NULL) < 0)  ||
         (do_cpt_df_hist(gmx_fio_getxdr(fp), flags_dfh, &state->dfhist, NULL) < 0)  ||
@@ -1847,7 +1834,7 @@ static void check_match(FILE *fplog,
 static void read_checkpoint(const char *fn, FILE **pfplog,
                             t_commrec *cr, ivec dd_nc,
                             int eIntegrator, int *init_fep_state, gmx_int64_t *step, double *t,
-                            t_state *state, gmx_bool *bReadRNG, gmx_bool *bReadEkin,
+                            t_state *state, gmx_bool *bReadEkin,
                             int *simulation_part,
                             gmx_bool bAppendOutputFiles, gmx_bool bForceAppend)
 {
@@ -1875,10 +1862,6 @@ static void read_checkpoint(const char *fn, FILE **pfplog,
     const char *int_warn =
         "WARNING: The checkpoint file was generated with integrator %s,\n"
         "         while the simulation uses integrator %s\n\n";
-    const char *sd_note =
-        "NOTE: The checkpoint file was for %d nodes doing SD or BD,\n"
-        "      while the simulation uses %d SD or BD nodes,\n"
-        "      continuation will be exact, except for the random state\n\n";
 
 #if !defined __native_client__ && !defined GMX_NATIVE_WINDOWS
     fl.l_type   = F_WRLCK;
@@ -1995,16 +1978,6 @@ static void read_checkpoint(const char *fn, FILE **pfplog,
         nppnodes = -1;
     }
 
-    if ((EI_SD(eIntegrator) || eIntegrator == eiBD) && nppnodes > 0)
-    {
-        /* Correct the RNG state size for the number of PP nodes.
-         * Such assignments should all be moved to one central function.
-         */
-        state->nrng  = nppnodes*gmx_rng_n();
-        state->nrngi = nppnodes;
-    }
-
-    *bReadRNG = TRUE;
     if (fflags != state->flags)
     {
 
@@ -2037,26 +2010,13 @@ static void read_checkpoint(const char *fn, FILE **pfplog,
     }
     else
     {
-        if ((EI_SD(eIntegrator) || eIntegrator == eiBD) &&
-            nppnodes != nppnodes_f)
-        {
-            *bReadRNG = FALSE;
-            if (MASTER(cr))
-            {
-                fprintf(stderr, sd_note, nppnodes_f, nppnodes);
-            }
-            if (fplog)
-            {
-                fprintf(fplog, sd_note, nppnodes_f, nppnodes);
-            }
-        }
         if (MASTER(cr))
         {
             check_match(fplog, version, btime, buser, bhost, double_prec, fprog,
                         cr, nppnodes_f, npmenodes_f, dd_nc, dd_nc_f);
         }
     }
-    ret             = do_cpt_state(gmx_fio_getxdr(fp), TRUE, fflags, state, *bReadRNG, NULL);
+    ret             = do_cpt_state(gmx_fio_getxdr(fp), TRUE, fflags, state, NULL);
     *init_fep_state = state->fep_state;  /* there should be a better way to do this than setting it here.
                                             Investigate for 5.0. */
     if (ret)
@@ -2279,7 +2239,7 @@ static void read_checkpoint(const char *fn, FILE **pfplog,
 void load_checkpoint(const char *fn, FILE **fplog,
                      t_commrec *cr, ivec dd_nc,
                      t_inputrec *ir, t_state *state,
-                     gmx_bool *bReadRNG, gmx_bool *bReadEkin,
+                     gmx_bool *bReadEkin,
                      gmx_bool bAppend, gmx_bool bForceAppend)
 {
     gmx_int64_t     step;
@@ -2290,7 +2250,7 @@ void load_checkpoint(const char *fn, FILE **fplog,
         /* Read the state from the checkpoint file */
         read_checkpoint(fn, fplog,
                         cr, dd_nc,
-                        ir->eI, &(ir->fepvals->init_fep_state), &step, &t, state, bReadRNG, bReadEkin,
+                        ir->eI, &(ir->fepvals->init_fep_state), &step, &t, state, bReadEkin,
                         &ir->simulation_part, bAppend, bForceAppend);
     }
     if (PAR(cr))
@@ -2298,7 +2258,6 @@ void load_checkpoint(const char *fn, FILE **fplog,
         gmx_bcast(sizeof(cr->npmenodes), &cr->npmenodes, cr);
         gmx_bcast(DIM*sizeof(dd_nc[0]), dd_nc, cr);
         gmx_bcast(sizeof(step), &step, cr);
-        gmx_bcast(sizeof(*bReadRNG), bReadRNG, cr);
         gmx_bcast(sizeof(*bReadEkin), bReadEkin, cr);
     }
     ir->bContinuation    = TRUE;
@@ -2312,7 +2271,6 @@ void load_checkpoint(const char *fn, FILE **fplog,
 
 static void read_checkpoint_data(t_fileio *fp, int *simulation_part,
                                  gmx_int64_t *step, double *t, t_state *state,
-                                 gmx_bool bReadRNG,
                                  int *nfiles, gmx_file_position_t **outputfiles)
 {
     int                  file_version;
@@ -2333,7 +2291,7 @@ static void read_checkpoint_data(t_fileio *fp, int *simulation_part,
                   &(state->dfhist.nlambda), &state->flags, &flags_eks, &flags_enh, &flags_dfh,
                   &state->edsamstate.nED, &state->swapstate.eSwapCoords, NULL);
     ret =
-        do_cpt_state(gmx_fio_getxdr(fp), TRUE, state->flags, state, bReadRNG, NULL);
+        do_cpt_state(gmx_fio_getxdr(fp), TRUE, state->flags, state, NULL);
     if (ret)
     {
         cp_error();
@@ -2401,7 +2359,7 @@ read_checkpoint_state(const char *fn, int *simulation_part,
     t_fileio *fp;
 
     fp = gmx_fio_open(fn, "r");
-    read_checkpoint_data(fp, simulation_part, step, t, state, FALSE, NULL, NULL);
+    read_checkpoint_data(fp, simulation_part, step, t, state, NULL, NULL);
     if (gmx_fio_close(fp) != 0)
     {
         gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
@@ -2421,7 +2379,7 @@ void read_checkpoint_trxframe(t_fileio *fp, t_trxframe *fr)
 
     init_state(&state, 0, 0, 0, 0, 0);
 
-    read_checkpoint_data(fp, &simulation_part, &step, &t, &state, FALSE, NULL, NULL);
+    read_checkpoint_data(fp, &simulation_part, &step, &t, &state, NULL, NULL);
 
     fr->natoms  = state.natoms;
     fr->bTitle  = FALSE;
@@ -2483,7 +2441,7 @@ void list_checkpoint(const char *fn, FILE *out)
                   &(state.dfhist.nlambda), &state.flags,
                   &flags_eks, &flags_enh, &flags_dfh, &state.edsamstate.nED,
                   &state.swapstate.eSwapCoords, out);
-    ret = do_cpt_state(gmx_fio_getxdr(fp), TRUE, state.flags, &state, TRUE, out);
+    ret = do_cpt_state(gmx_fio_getxdr(fp), TRUE, state.flags, &state, out);
     if (ret)
     {
         cp_error();
@@ -2585,7 +2543,7 @@ gmx_bool read_checkpoint_simulation_part(const char *filename, int *simulation_p
         {
             init_state(&state, 0, 0, 0, 0, 0);
 
-            read_checkpoint_data(fp, simulation_part, &step, &t, &state, FALSE,
+            read_checkpoint_data(fp, simulation_part, &step, &t, &state,
                                  &nfiles, &outputfiles);
             if (gmx_fio_close(fp) != 0)
             {
diff --git a/src/gromacs/gmxlib/mvdata.c b/src/gromacs/gmxlib/mvdata.c
index b7e6ebfe14..1e491b25aa 100644
--- a/src/gromacs/gmxlib/mvdata.c
+++ b/src/gromacs/gmxlib/mvdata.c
@@ -257,8 +257,6 @@ void bcast_state(const t_commrec *cr, t_state *state)
     block_bc(cr, state->ngtc);
     block_bc(cr, state->nnhpres);
     block_bc(cr, state->nhchainlength);
-    block_bc(cr, state->nrng);
-    block_bc(cr, state->nrngi);
     block_bc(cr, state->flags);
     if (state->lambda == NULL)
     {
@@ -309,16 +307,6 @@ void bcast_state(const t_commrec *cr, t_state *state)
                 case estV:       nblock_abc(cr, state->natoms, state->v); break;
                 case estSDX:     nblock_abc(cr, state->natoms, state->sd_X); break;
                 case estCGP:     nblock_abc(cr, state->natoms, state->cg_p); break;
-                case estLD_RNG:  if (state->nrngi == 1)
-                    {
-                        nblock_abc(cr, state->nrng, state->ld_rng);
-                    }
-                    break;
-                case estLD_RNGI: if (state->nrngi == 1)
-                    {
-                        nblock_abc(cr, state->nrngi, state->ld_rngi);
-                    }
-                    break;
                 case estDISRE_INITF: block_bc(cr, state->hist.disre_initf); break;
                 case estDISRE_RM3TAV:
                     block_bc(cr, state->hist.ndisrepairs);
diff --git a/src/gromacs/gmxlib/typedefs.c b/src/gromacs/gmxlib/typedefs.c
index a230767f6e..460b5a3b4e 100644
--- a/src/gromacs/gmxlib/typedefs.c
+++ b/src/gromacs/gmxlib/typedefs.c
@@ -622,7 +622,6 @@ void init_state(t_state *state, int natoms, int ngtc, int nnhpres, int nhchainle
     int i;
 
     state->natoms = natoms;
-    state->nrng   = 0;
     state->flags  = 0;
     state->lambda = 0;
     snew(state->lambda, efptNR);
@@ -713,19 +712,6 @@ t_state *serial_init_local_state(t_state *state_global)
     {
         state_local->lambda[i] = state_global->lambda[i];
     }
-    if (state_global->nrngi > 1)
-    {
-        /* With stochastic dynamics we need local storage for the random state */
-        if (state_local->flags & (1<<estLD_RNG))
-        {
-            state_local->nrng = gmx_rng_n();
-            snew(state_local->ld_rng, state_local->nrng);
-        }
-        if (state_local->flags & (1<<estLD_RNGI))
-        {
-            snew(state_local->ld_rngi, 1);
-        }
-    }
 
     return state_local;
 }
diff --git a/src/gromacs/legacyheaders/checkpoint.h b/src/gromacs/legacyheaders/checkpoint.h
index ae9e3a3370..0db48d6527 100644
--- a/src/gromacs/legacyheaders/checkpoint.h
+++ b/src/gromacs/legacyheaders/checkpoint.h
@@ -73,8 +73,9 @@ void write_checkpoint(const char *fn, gmx_bool bNumberAndKeep,
  */
 void load_checkpoint(const char *fn, FILE **fplog,
                      t_commrec *cr, ivec dd_nc,
-                     t_inputrec *ir, t_state *state, gmx_bool *bReadRNG,
-                     gmx_bool *bReadEkin, gmx_bool bAppend, gmx_bool bForceAppend);
+                     t_inputrec *ir, t_state *state,
+                     gmx_bool *bReadEkin,
+                     gmx_bool bAppend, gmx_bool bForceAppend);
 
 /* Read the state from checkpoint file.
  * Arrays in state that are NULL are allocated.
diff --git a/src/gromacs/legacyheaders/mdrun.h b/src/gromacs/legacyheaders/mdrun.h
index 09a54b152f..5ac463ef8a 100644
--- a/src/gromacs/legacyheaders/mdrun.h
+++ b/src/gromacs/legacyheaders/mdrun.h
@@ -67,7 +67,6 @@ extern "C" {
 #define MD_DDBONDCOMM     (1<<11)
 #define MD_CONFOUT        (1<<12)
 #define MD_REPRODUCIBLE   (1<<13)
-#define MD_READ_RNG       (1<<14)
 #define MD_APPENDFILES    (1<<15)
 #define MD_APPENDFILESSET (1<<21)
 #define MD_KEEPANDNUMCPT  (1<<16)
@@ -145,29 +144,24 @@ gmx_integrator_t do_tpi;
 
 void init_npt_masses(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bInit);
 
-void init_expanded_ensemble(gmx_bool bStateFromCP, t_inputrec *ir, gmx_rng_t *mcrng, df_history_t *dfhist);
+void init_expanded_ensemble(gmx_bool bStateFromCP, t_inputrec *ir, df_history_t *dfhist);
 
 int ExpandedEnsembleDynamics(FILE *log, t_inputrec *ir, gmx_enerdata_t *enerd,
                              t_state *state, t_extmass *MassQ, int fep_state, df_history_t *dfhist,
-                             gmx_int64_t step, gmx_rng_t mcrng,
+                             gmx_int64_t step,
                              rvec *v, t_mdatoms *mdatoms);
 
 void PrintFreeEnergyInfoToFile(FILE *outfile, t_lambda *fep, t_expanded *expand, t_simtemp *simtemp, df_history_t *dfhist,
                                int fep_state, int frequency, gmx_int64_t step);
 
-void get_mc_state(gmx_rng_t rng, t_state *state);
-
-void set_mc_state(gmx_rng_t rng, t_state *state);
-
 /* check the version */
 void check_ir_old_tpx_versions(t_commrec *cr, FILE *fplog,
                                t_inputrec *ir, gmx_mtop_t *mtop);
 
 /* Allocate and initialize node-local state entries. */
-void set_state_entries(t_state *state, const t_inputrec *ir, int nnodes);
+void set_state_entries(t_state *state, const t_inputrec *ir);
 
-/* Broadcast the data for a simulation, and allocate node-specific settings
-   such as rng generators. */
+/* Broadcast the data for a simulation, and allocate node-specific settings */
 void init_parallel(t_commrec *cr, t_inputrec *inputrec,
                    gmx_mtop_t *mtop);
 
diff --git a/src/gromacs/legacyheaders/types/state.h b/src/gromacs/legacyheaders/types/state.h
index 4acee9803b..100dce0736 100644
--- a/src/gromacs/legacyheaders/types/state.h
+++ b/src/gromacs/legacyheaders/types/state.h
@@ -217,9 +217,7 @@ typedef struct
     int              natoms;
     int              ngtc;
     int              nnhpres;
-    int              nhchainlength; /* number of nose-hoover chains               */
-    int              nrng;
-    int              nrngi;
+    int              nhchainlength;   /* number of nose-hoover chains               */
     int              flags;           /* Flags telling which entries are present      */
     int              fep_state;       /* indicates which of the alchemical states we are in                 */
     real            *lambda;          /* lambda vector                               */
@@ -242,13 +240,6 @@ typedef struct
     rvec            *sd_X;            /* random part of the x update for stoch. dyn.  */
     rvec            *cg_p;            /* p vector for conjugate gradient minimization */
 
-    unsigned int    *ld_rng;          /* RNG random state                           */
-    int             *ld_rngi;         /* RNG index                                  */
-
-    int              nmcrng;          /* number of RNG states                       */
-    unsigned int    *mc_rng;          /* lambda MC RNG random state                 */
-    int             *mc_rngi;         /* lambda MC RNG index                        */
-
     history_t        hist;            /* Time history for restraints                  */
 
     ekinstate_t      ekinstate;       /* The state of the kinetic energy data      */
diff --git a/src/gromacs/legacyheaders/update.h b/src/gromacs/legacyheaders/update.h
index 5ccdd64eba..398e505af5 100644
--- a/src/gromacs/legacyheaders/update.h
+++ b/src/gromacs/legacyheaders/update.h
@@ -42,7 +42,6 @@
 #include "mshift.h"
 #include "tgroup.h"
 #include "network.h"
-#include "gromacs/random/random.h"
 #include "vec.h"
 
 
@@ -72,7 +71,6 @@ void update_tcouple(gmx_int64_t       step,
                     t_inputrec       *inputrec,
                     t_state          *state,
                     gmx_ekindata_t   *ekind,
-                    gmx_update_t      upd,
                     t_extmass        *MassQ,
                     t_mdatoms        *md
                     );
@@ -107,7 +105,7 @@ void update_coords(FILE             *fplog,
 
 /* Return TRUE if OK, FALSE in case of Shake Error */
 
-extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_int64_t step, t_mdatoms *md, t_state *state, gmx_update_t upd, t_idef *idef, gmx_constr_t constr);
+extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_int64_t step, const t_commrec *cr, t_mdatoms *md, t_state *state, gmx_update_t upd, gmx_constr_t constr);
 
 void update_constraints(FILE             *fplog,
                         gmx_int64_t       step,
@@ -175,7 +173,8 @@ restore_ekinstate_from_state(t_commrec *cr,
 
 void berendsen_tcoupl(t_inputrec *ir, gmx_ekindata_t *ekind, real dt);
 
-void andersen_tcoupl(t_inputrec *ir, t_mdatoms *md, t_state *state, gmx_rng_t rng, real rate, t_idef *idef, int nblocks, int *sblock, gmx_bool *randatom, int *randatom_list, gmx_bool *randomize, real *boltzfac);
+void andersen_tcoupl(t_inputrec *ir, gmx_int64_t step,
+                     const t_commrec *cr, const t_mdatoms *md, t_state *state, real rate, const gmx_bool *randomize, const real *boltzfac);
 
 void nosehoover_tcoupl(t_grpopts *opts, gmx_ekindata_t *ekind, real dt,
                        double xi[], double vxi[], t_extmass *MassQ);
@@ -196,9 +195,9 @@ real NPT_energy(t_inputrec *ir, t_state *state, t_extmass *MassQ);
 void NBaroT_trotter(t_grpopts *opts, real dt,
                     double xi[], double vxi[], real *veta, t_extmass *MassQ);
 
-void vrescale_tcoupl(t_inputrec *ir, gmx_ekindata_t *ekind, real dt,
-                     double therm_integral[],
-                     gmx_rng_t rng);
+void vrescale_tcoupl(t_inputrec *ir, gmx_int64_t step,
+                     gmx_ekindata_t *ekind, real dt,
+                     double therm_integral[]);
 /* Compute temperature scaling. For V-rescale it is done in update. */
 
 real vrescale_energy(t_grpopts *opts, double therm_integral[]);
diff --git a/src/gromacs/mdlib/coupling.c b/src/gromacs/mdlib/coupling.c
index 19002e5ea1..7d207b9d50 100644
--- a/src/gromacs/mdlib/coupling.c
+++ b/src/gromacs/mdlib/coupling.c
@@ -722,203 +722,50 @@ void berendsen_tcoupl(t_inputrec *ir, gmx_ekindata_t *ekind, real dt)
     }
 }
 
-static int poisson_variate(real lambda, gmx_rng_t rng)
+void andersen_tcoupl(t_inputrec *ir, gmx_int64_t step,
+                     const t_commrec *cr, const t_mdatoms *md, t_state *state, real rate, const gmx_bool *randomize, const real *boltzfac)
 {
+    const int *gatindex = (DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
+    int        i;
+    int        gc = 0;
 
-    real L;
-    int  k = 0;
-    real p = 1.0;
-
-    L = exp(-lambda);
-
-    do
-    {
-        k  = k+1;
-        p *= gmx_rng_uniform_real(rng);
-    }
-    while (p > L);
-
-    return k-1;
-}
-
-void andersen_tcoupl(t_inputrec *ir, t_mdatoms *md, t_state *state, gmx_rng_t rng, real rate, t_idef *idef, int nblocks, int *sblock, gmx_bool *randatom, int *randatom_list, gmx_bool *randomize, real *boltzfac)
-{
-    t_grpopts *opts;
-    int        i, j, k, d, len, n, ngtc, gc = 0;
-    int        nshake, nsettle, nrandom, nrand_group;
-    real       boltz, scal, reft, prand;
-    t_iatom   *iatoms;
-
-    /* convenience variables */
-    opts = &ir->opts;
-    ngtc = opts->ngtc;
+    /* randomize the velocities of the selected particles */
 
-    /* idef is only passed in if it's chance-per-particle andersen, so
-       it essentially serves as a boolean to determine which type of
-       andersen is being used */
-    if (idef)
+    for (i = 0; i < md->homenr; i++)  /* now loop over the list of atoms */
     {
+        int      ng = gatindex ? gatindex[i] : i;
+        gmx_bool bRandomize;
 
-        /* randomly atoms to randomize.  However, all constraint
-           groups have to have either all of the atoms or none of the
-           atoms randomize.
-
-           Algorithm:
-           1. Select whether or not to randomize each atom to get the correct probability.
-           2. Cycle through the constraint groups.
-              2a. for each constraint group, determine the fraction f of that constraint group that are
-                  chosen to be randomized.
-              2b. all atoms in the constraint group are randomized with probability f.
-         */
-
-        nrandom = 0;
-        if ((rate < 0.05) && (md->homenr > 50))
-        {
-            /* if the rate is relatively high, use a standard method, if low rate,
-             * use poisson */
-            /* poisson distributions approxmation, more efficient for
-             * low rates, fewer random numbers required */
-            nrandom = poisson_variate(md->homenr*rate, rng);  /* how many do we randomize? Use Poisson. */
-            /* now we know how many, choose them randomly. No worries about repeats, at this rate, it's negligible.
-               worst thing that happens, it lowers the true rate an negligible amount */
-            for (i = 0; i < nrandom; i++)
-            {
-                randatom[(int)(gmx_rng_uniform_real(rng)*md->homenr)] = TRUE;
-            }
-        }
-        else
+        if (md->cTC)
         {
-            for (i = 0; i < md->homenr; i++)
-            {
-                if (gmx_rng_uniform_real(rng) < rate)
-                {
-                    randatom[i] = TRUE;
-                    nrandom++;
-                }
-            }
+            gc = md->cTC[i];  /* assign the atom to a temperature group if there are more than one */
         }
-
-        /* instead of looping over the constraint groups, if we had a
-           list of which atoms were in which constraint groups, we
-           could then loop over only the groups that are randomized
-           now.  But that is not available now.  Create later after
-           determining whether there actually is any slowing. */
-
-        /* first, loop through the settles to make sure all groups either entirely randomized, or not randomized. */
-
-        nsettle  = idef->il[F_SETTLE].nr/2;
-        for (i = 0; i < nsettle; i++)
+        if (randomize[gc])
         {
-            iatoms      = idef->il[F_SETTLE].iatoms;
-            nrand_group = 0;
-            for (k = 0; k < 3; k++)  /* settles are always 3 atoms, hardcoded */
+            if (ir->etc == etcANDERSEN)
             {
-                if (randatom[iatoms[2*i+1]+k])
-                {
-                    nrand_group++;     /* count the number of atoms to be shaken in the settles group */
-                    randatom[iatoms[2*i+1]+k] = FALSE;
-                    nrandom--;
-                }
+                bRandomize = TRUE;
             }
-            if (nrand_group > 0)
+            else
             {
-                prand = (nrand_group)/3.0;  /* use this fraction to compute the probability the
-                                               whole group is randomized */
-                if (gmx_rng_uniform_real(rng) < prand)
-                {
-                    for (k = 0; k < 3; k++)
-                    {
-                        randatom[iatoms[2*i+1]+k] = TRUE;   /* mark them all to be randomized */
-                    }
-                    nrandom += 3;
-                }
-            }
-        }
+                double uniform[2];
 
-        /* now loop through the shake groups */
-        nshake = nblocks;
-        for (i = 0; i < nshake; i++)
-        {
-            iatoms      = &(idef->il[F_CONSTR].iatoms[sblock[i]]);
-            len         = sblock[i+1]-sblock[i];
-            nrand_group = 0;
-            for (k = 0; k < len; k++)
-            {
-                if (k%3 != 0)
-                {   /* only 2/3 of the sblock items are atoms, the others are labels */
-                    if (randatom[iatoms[k]])
-                    {
-                        nrand_group++;
-                        randatom[iatoms[k]] = FALSE;  /* need to mark it false here in case the atom is in more than
-                                                         one group in the shake block */
-                        nrandom--;
-                    }
-                }
-            }
-            if (nrand_group > 0)
-            {
-                prand = (nrand_group)/(1.0*(2*len/3));
-                if (gmx_rng_uniform_real(rng) < prand)
-                {
-                    for (k = 0; k < len; k++)
-                    {
-                        if (k%3 != 0)
-                        {                               /* only 2/3 of the sblock items are atoms, the others are labels */
-                            randatom[iatoms[k]] = TRUE; /* randomize all of them */
-                            nrandom++;
-                        }
-                    }
-                }
+                gmx_rng_cycle_2uniform(step*2, ng, ir->andersen_seed, RND_SEED_ANDERSEN, uniform);
+                bRandomize = (uniform[0] < rate);
             }
-        }
-        if (nrandom > 0)
-        {
-            n = 0;
-            for (i = 0; i < md->homenr; i++)  /* now loop over the list of atoms */
+            if (bRandomize)
             {
-                if (randatom[i])
+                real scal, gauss[3];
+                int  d;
+
+                scal = sqrt(boltzfac[gc]*md->invmass[i]);
+                gmx_rng_cycle_3gaussian_table(step*2+1, ng, ir->andersen_seed, RND_SEED_ANDERSEN, gauss);
+                for (d = 0; d < DIM; d++)
                 {
-                    randatom_list[n] = i;
-                    n++;
+                    state->v[i][d] = scal*gauss[d];
                 }
             }
-            nrandom = n;  /* there are some values of nrandom for which
-                             this algorithm won't work; for example all
-                             water molecules and nrandom =/= 3.  Better to
-                             recount and use this number (which we
-                             calculate anyway: it will not affect
-                             the average number of atoms accepted.
-                           */
-        }
-    }
-    else
-    {
-        /* if it's andersen-massive, then randomize all the atoms */
-        nrandom = md->homenr;
-        for (i = 0; i < nrandom; i++)
-        {
-            randatom_list[i] = i;
-        }
-    }
-
-    /* randomize the velocities of the selected particles */
-
-    for (i = 0; i < nrandom; i++)  /* now loop over the list of atoms */
-    {
-        n = randatom_list[i];
-        if (md->cTC)
-        {
-            gc   = md->cTC[n];  /* assign the atom to a temperature group if there are more than one */
-        }
-        if (randomize[gc])
-        {
-            scal = sqrt(boltzfac[gc]*md->invmass[n]);
-            for (d = 0; d < DIM; d++)
-            {
-                state->v[n][d] = scal*gmx_rng_gaussian_table(rng);
-            }
         }
-        randatom[n] = FALSE; /* unmark this atom for randomization */
     }
 }
 
@@ -1507,11 +1354,14 @@ real NPT_energy(t_inputrec *ir, t_state *state, t_extmass *MassQ)
     return ener_npt;
 }
 
-static real vrescale_gamdev(int ia, gmx_rng_t rng)
+static real vrescale_gamdev(int ia,
+                            gmx_int64_t step, gmx_int64_t *count,
+                            gmx_int64_t seed1, gmx_int64_t seed2)
 /* Gamma distribution, adapted from numerical recipes */
 {
-    int  j;
-    real am, e, s, v1, v2, x, y;
+    int    j;
+    real   am, e, s, v1, v2, x, y;
+    double rnd[2];
 
     if (ia < 6)
     {
@@ -1520,7 +1370,8 @@ static real vrescale_gamdev(int ia, gmx_rng_t rng)
             x = 1.0;
             for (j = 1; j <= ia; j++)
             {
-                x *= gmx_rng_uniform_real(rng);
+                gmx_rng_cycle_2uniform(step, (*count)++, seed1, seed2, rnd);
+                x *= rnd[0];
             }
         }
         while (x == 0);
@@ -1534,8 +1385,9 @@ static real vrescale_gamdev(int ia, gmx_rng_t rng)
             {
                 do
                 {
-                    v1 = gmx_rng_uniform_real(rng);
-                    v2 = 2.0*gmx_rng_uniform_real(rng)-1.0;
+                    gmx_rng_cycle_2uniform(step, (*count)++, seed1, seed2, rnd);
+                    v1 = rnd[0];
+                    v2 = 2.0*rnd[1] - 1.0;
                 }
                 while (v1*v1 + v2*v2 > 1.0 ||
                        v1*v1*GMX_REAL_MAX < 3.0*ia);
@@ -1549,44 +1401,59 @@ static real vrescale_gamdev(int ia, gmx_rng_t rng)
             }
             while (x <= 0.0);
             e = (1.0 + y*y)*exp(am*log(x/am) - s*y);
+
+            gmx_rng_cycle_2uniform(step, (*count)++, seed1, seed2, rnd);
         }
-        while (gmx_rng_uniform_real(rng) > e);
+        while (rnd[0] > e);
     }
 
     return x;
 }
 
-static real vrescale_sumnoises(int nn, gmx_rng_t rng)
+static real gaussian_count(gmx_int64_t step, gmx_int64_t *count,
+                           gmx_int64_t seed1, gmx_int64_t seed2)
+{
+    double rnd[2], x, y, r;
+
+    do
+    {
+        gmx_rng_cycle_2uniform(step, (*count)++, seed1, seed2, rnd);
+        x = 2.0*rnd[0] - 1.0;
+        y = 2.0*rnd[1] - 1.0;
+        r = x*x + y*y;
+    }
+    while (r > 1.0 || r == 0.0);
+
+    r = sqrt(-2.0*log(r)/r);
+
+    return x*r;
+}
+
+static real vrescale_sumnoises(int nn,
+                               gmx_int64_t step, gmx_int64_t *count,
+                               gmx_int64_t seed1, gmx_int64_t seed2)
 {
 /*
  * Returns the sum of n independent gaussian noises squared
  * (i.e. equivalent to summing the square of the return values
  * of nn calls to gmx_rng_gaussian_real).xs
  */
-    real rr;
+    real r, gauss;
 
-    if (nn == 0)
-    {
-        return 0.0;
-    }
-    else if (nn == 1)
-    {
-        rr = gmx_rng_gaussian_real(rng);
-        return rr*rr;
-    }
-    else if (nn % 2 == 0)
-    {
-        return 2.0*vrescale_gamdev(nn/2, rng);
-    }
-    else
+    r = 2.0*vrescale_gamdev(nn/2, step, count, seed1, seed2);
+
+    if (nn % 2 == 1)
     {
-        rr = gmx_rng_gaussian_real(rng);
-        return 2.0*vrescale_gamdev((nn-1)/2, rng) + rr*rr;
+        gauss = gaussian_count(step, count, seed1, seed2);
+
+        r += gauss*gauss;
     }
+
+    return r;
 }
 
 static real vrescale_resamplekin(real kk, real sigma, int ndeg, real taut,
-                                 gmx_rng_t rng)
+                                 gmx_int64_t step, gmx_int64_t seed)
 {
 /*
  * Generates a new value for the kinetic energy,
@@ -1596,7 +1463,11 @@ static real vrescale_resamplekin(real kk, real sigma, int ndeg, real taut,
  * ndeg:  number of degrees of freedom of the atoms to be thermalized
  * taut:  relaxation time of the thermostat, in units of 'how often this routine is called'
  */
-    real factor, rr;
+    /* rnd_count tracks the step-local state for the cycle random
+     * number generator.
+     */
+    gmx_int64_t rnd_count = 0;
+    real        factor, rr, ekin_new;
 
     if (taut > 0.1)
     {
@@ -1606,15 +1477,20 @@ static real vrescale_resamplekin(real kk, real sigma, int ndeg, real taut,
     {
         factor = 0.0;
     }
-    rr = gmx_rng_gaussian_real(rng);
-    return
+
+    rr = gaussian_count(step, &rnd_count, seed, RND_SEED_VRESCALE);
+
+    ekin_new =
         kk +
-        (1.0 - factor)*(sigma*(vrescale_sumnoises(ndeg-1, rng) + rr*rr)/ndeg - kk) +
+        (1.0 - factor)*(sigma*(vrescale_sumnoises(ndeg-1, step, &rnd_count, seed, RND_SEED_VRESCALE) + rr*rr)/ndeg - kk) +
         2.0*rr*sqrt(kk*sigma/ndeg*(1.0 - factor)*factor);
+
+    return ekin_new;
 }
 
-void vrescale_tcoupl(t_inputrec *ir, gmx_ekindata_t *ekind, real dt,
-                     double therm_integral[], gmx_rng_t rng)
+void vrescale_tcoupl(t_inputrec *ir, gmx_int64_t step,
+                     gmx_ekindata_t *ekind, real dt,
+                     double therm_integral[])
 {
     t_grpopts *opts;
     int        i;
@@ -1639,7 +1515,8 @@ void vrescale_tcoupl(t_inputrec *ir, gmx_ekindata_t *ekind, real dt,
             Ek_ref  = Ek_ref1*opts->nrdf[i];
 
             Ek_new  = vrescale_resamplekin(Ek, Ek_ref, opts->nrdf[i],
-                                           opts->tau_t[i]/dt, rng);
+                                           opts->tau_t[i]/dt,
+                                           ir->ld_seed, step);
 
             /* Analytically Ek_new>=0, but we check for rounding errors */
             if (Ek_new <= 0)
diff --git a/src/gromacs/mdlib/domdec.c b/src/gromacs/mdlib/domdec.c
index 864c054358..a344ba5bd7 100644
--- a/src/gromacs/mdlib/domdec.c
+++ b/src/gromacs/mdlib/domdec.c
@@ -1586,37 +1586,6 @@ void dd_collect_state(gmx_domdec_t *dd,
                 case estCGP:
                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
                     break;
-                case estLD_RNG:
-                    if (state->nrngi == 1)
-                    {
-                        if (DDMASTER(dd))
-                        {
-                            for (i = 0; i < state_local->nrng; i++)
-                            {
-                                state->ld_rng[i] = state_local->ld_rng[i];
-                            }
-                        }
-                    }
-                    else
-                    {
-                        dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]),
-                                  state_local->ld_rng, state->ld_rng);
-                    }
-                    break;
-                case estLD_RNGI:
-                    if (state->nrngi == 1)
-                    {
-                        if (DDMASTER(dd))
-                        {
-                            state->ld_rngi[0] = state_local->ld_rngi[0];
-                        }
-                    }
-                    else
-                    {
-                        dd_gather(dd, sizeof(state->ld_rngi[0]),
-                                  state_local->ld_rngi, state->ld_rngi);
-                    }
-                    break;
                 case estDISRE_INITF:
                 case estDISRE_RM3TAV:
                 case estORIRE_INITF:
@@ -1658,8 +1627,6 @@ static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
                 case estCGP:
                     srenew(state->cg_p, state->nalloc);
                     break;
-                case estLD_RNG:
-                case estLD_RNGI:
                 case estDISRE_INITF:
                 case estDISRE_RM3TAV:
                 case estORIRE_INITF:
@@ -1917,32 +1884,6 @@ static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
                 case estCGP:
                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
                     break;
-                case estLD_RNG:
-                    if (state->nrngi == 1)
-                    {
-                        dd_bcastc(dd,
-                                  state_local->nrng*sizeof(state_local->ld_rng[0]),
-                                  state->ld_rng, state_local->ld_rng);
-                    }
-                    else
-                    {
-                        dd_scatter(dd,
-                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
-                                   state->ld_rng, state_local->ld_rng);
-                    }
-                    break;
-                case estLD_RNGI:
-                    if (state->nrngi == 1)
-                    {
-                        dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
-                                  state->ld_rngi, state_local->ld_rngi);
-                    }
-                    else
-                    {
-                        dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
-                                   state->ld_rngi, state_local->ld_rngi);
-                    }
-                    break;
                 case estDISRE_INITF:
                 case estDISRE_RM3TAV:
                 case estORIRE_INITF:
diff --git a/src/gromacs/mdlib/domdec_top.c b/src/gromacs/mdlib/domdec_top.c
index 06ac1876e8..059528533b 100644
--- a/src/gromacs/mdlib/domdec_top.c
+++ b/src/gromacs/mdlib/domdec_top.c
@@ -47,7 +47,6 @@
 #include "vec.h"
 #include "pbc.h"
 #include "chargegroup.h"
-#include "gromacs/random/random.h"
 #include "gromacs/gmxlib/topsort.h"
 #include "mtop_util.h"
 #include "mshift.h"
@@ -1959,29 +1958,6 @@ void dd_init_local_state(gmx_domdec_t *dd,
 
     init_state(state_local, 0, buf[1], buf[2], buf[3], buf[4]);
     state_local->flags = buf[0];
-
-    /* With Langevin Dynamics we need to make proper storage space
-     * in the global and local state for the random numbers.
-     */
-    if (state_local->flags & (1<<estLD_RNG))
-    {
-        if (DDMASTER(dd) && state_global->nrngi > 1)
-        {
-            state_global->nrng = dd->nnodes*gmx_rng_n();
-            srenew(state_global->ld_rng, state_global->nrng);
-        }
-        state_local->nrng = gmx_rng_n();
-        snew(state_local->ld_rng, state_local->nrng);
-    }
-    if (state_local->flags & (1<<estLD_RNGI))
-    {
-        if (DDMASTER(dd) && state_global->nrngi > 1)
-        {
-            state_global->nrngi = dd->nnodes;
-            srenew(state_global->ld_rngi, state_global->nrngi);
-        }
-        snew(state_local->ld_rngi, 1);
-    }
 }
 
 static void check_link(t_blocka *link, int cg_gl, int cg_gl_j)
diff --git a/src/gromacs/mdlib/expanded.c b/src/gromacs/mdlib/expanded.c
index ce2aea3003..283dbbbf0c 100644
--- a/src/gromacs/mdlib/expanded.c
+++ b/src/gromacs/mdlib/expanded.c
@@ -87,9 +87,8 @@ static void init_df_history_weights(df_history_t *dfhist, t_expanded *expand, in
 
 /* Eventually should contain all the functions needed to initialize expanded ensemble
    before the md loop starts */
-extern void init_expanded_ensemble(gmx_bool bStateFromCP, t_inputrec *ir, gmx_rng_t *mcrng, df_history_t *dfhist)
+extern void init_expanded_ensemble(gmx_bool bStateFromCP, t_inputrec *ir, df_history_t *dfhist)
 {
-    *mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
     if (!bStateFromCP)
     {
         init_df_history_weights(dfhist, ir->expandedvals, ir->fepvals->n_lambda);
@@ -736,7 +735,8 @@ static gmx_bool UpdateWeights(int nlim, t_expanded *expand, df_history_t *dfhist
     return FALSE;
 }
 
-static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, int fep_state, real *weighted_lamee, double *p_k, gmx_rng_t rng)
+static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, int fep_state, real *weighted_lamee, double *p_k,
+                           gmx_int64_t seed, gmx_int64_t step)
 {
     /* Choose new lambda value, and update transition matrix */
 
@@ -782,6 +782,9 @@ static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, i
 
     for (i = 0; i < expand->lmc_repeats; i++)
     {
+        double rnd[2];
+
+        gmx_rng_cycle_2uniform(step, i, seed, RND_SEED_EXPANDED, rnd);
 
         for (ifep = 0; ifep < nlim; ifep++)
         {
@@ -823,7 +826,7 @@ static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, i
                     accept[ifep]  = 1.0;
                 }
                 /* Gibbs sampling */
-                r1 = gmx_rng_uniform_real(rng);
+                r1 = rnd[0];
                 for (lamnew = minfep; lamnew <= maxfep; lamnew++)
                 {
                     if (r1 <= p_k[lamnew])
@@ -864,7 +867,7 @@ static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, i
                         }
                     }
 
-                    r1 = gmx_rng_uniform_real(rng);
+                    r1 = rnd[0];
                     for (lamtrial = minfep; lamtrial <= maxfep; lamtrial++)
                     {
                         pnorm = p_k[lamtrial]/remainder[fep_state];
@@ -886,7 +889,7 @@ static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, i
                     {
                         tprob = trialprob;
                     }
-                    r2 = gmx_rng_uniform_real(rng);
+                    r2 = rnd[1];
                     if (r2 < tprob)
                     {
                         lamnew = lamtrial;
@@ -947,7 +950,7 @@ static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, i
         else if ((expand->elmcmove == elmcmoveMETROPOLIS) || (expand->elmcmove == elmcmoveBARKER))
         {
             /* use the metropolis sampler with trial +/- 1 */
-            r1 = gmx_rng_uniform_real(rng);
+            r1 = rnd[0];
             if (r1 < 0.5)
             {
                 if (fep_state == 0)
@@ -996,7 +999,7 @@ static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, i
                 accept[lamtrial]   = 1.0;
             }
 
-            r2 = gmx_rng_uniform_real(rng);
+            r2 = rnd[1];
             if (r2 < tprob)
             {
                 lamnew = lamtrial;
@@ -1196,19 +1199,9 @@ extern void PrintFreeEnergyInfoToFile(FILE *outfile, t_lambda *fep, t_expanded *
     }
 }
 
-extern void get_mc_state(gmx_rng_t rng, t_state *state)
-{
-    gmx_rng_get_state(rng, state->mc_rng, state->mc_rngi);
-}
-
-extern void set_mc_state(gmx_rng_t rng, t_state *state)
-{
-    gmx_rng_set_state(rng, state->mc_rng, state->mc_rngi[0]);
-}
-
 extern int ExpandedEnsembleDynamics(FILE *log, t_inputrec *ir, gmx_enerdata_t *enerd,
                                     t_state *state, t_extmass *MassQ, int fep_state, df_history_t *dfhist,
-                                    gmx_int64_t step, gmx_rng_t mcrng,
+                                    gmx_int64_t step,
                                     rvec *v, t_mdatoms *mdatoms)
 /* Note that the state variable is only needed for simulated tempering, not
    Hamiltonian expanded ensemble.  May be able to remove it after integrator refactoring. */
@@ -1321,7 +1314,8 @@ extern int ExpandedEnsembleDynamics(FILE *log, t_inputrec *ir, gmx_enerdata_t *e
         }
     }
 
-    lamnew = ChooseNewLambda(nlim, expand, dfhist, fep_state, weighted_lamee, p_k, mcrng);
+    lamnew = ChooseNewLambda(nlim, expand, dfhist, fep_state, weighted_lamee, p_k,
+                             ir->expandedvals->lmc_seed, step);
     /* if using simulated tempering, we need to adjust the temperatures */
     if (ir->bSimTemp && (lamnew != fep_state)) /* only need to change the temperatures if we change the state */
     {
diff --git a/src/gromacs/mdlib/init.c b/src/gromacs/mdlib/init.c
index 99105bd6b8..38bf4401f0 100644
--- a/src/gromacs/mdlib/init.c
+++ b/src/gromacs/mdlib/init.c
@@ -52,7 +52,6 @@
 #include "mdrun.h"
 #include "names.h"
 #include "calcgrid.h"
-#include "gromacs/random/random.h"
 #include "update.h"
 #include "mdebin.h"
 
@@ -61,7 +60,7 @@
 #define NOT_FINISHED(l1, l2) \
     printf("not finished yet: lines %d .. %d in %s\n", l1, l2, __FILE__)
 
-void set_state_entries(t_state *state, const t_inputrec *ir, int nnodes)
+void set_state_entries(t_state *state, const t_inputrec *ir)
 {
     int nnhpres;
 
@@ -109,31 +108,6 @@ void set_state_entries(t_state *state, const t_inputrec *ir, int nnodes)
             snew(state->cg_p, state->nalloc);
         }
     }
-    if (EI_SD(ir->eI) || ir->eI == eiBD || ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc))
-    {
-        state->nrng  = gmx_rng_n();
-        state->nrngi = 1;
-        if (EI_SD(ir->eI) || ir->eI == eiBD || ETC_ANDERSEN(ir->etc))
-        {
-            /* This will be correct later with DD */
-            state->nrng  *= nnodes;
-            state->nrngi *= nnodes;
-        }
-        state->flags |= ((1<<estLD_RNG) | (1<<estLD_RNGI));
-        snew(state->ld_rng, state->nrng);
-        snew(state->ld_rngi, state->nrngi);
-    }
-    else
-    {
-        state->nrng = 0;
-    }
-
-    if (ir->bExpanded)
-    {
-        state->nmcrng  = gmx_rng_n();
-        snew(state->mc_rng, state->nmcrng);
-        snew(state->mc_rngi, 1);
-    }
 
     state->nnhpres = 0;
     if (ir->ePBC != epbcNONE)
@@ -190,10 +164,4 @@ void init_parallel(t_commrec *cr, t_inputrec *inputrec,
                    gmx_mtop_t *mtop)
 {
     bcast_ir_mtop(cr, inputrec, mtop);
-
-    if (inputrec->eI == eiBD || EI_SD(inputrec->eI) || ETC_ANDERSEN(inputrec->etc))
-    {
-        /* Make sure the random seeds are different on each node */
-        inputrec->ld_seed += cr->nodeid;
-    }
 }
diff --git a/src/gromacs/mdlib/minimize.c b/src/gromacs/mdlib/minimize.c
index 56aead1d94..26122e26d4 100644
--- a/src/gromacs/mdlib/minimize.c
+++ b/src/gromacs/mdlib/minimize.c
@@ -49,7 +49,6 @@
 #include "main.h"
 #include "force.h"
 #include "macros.h"
-#include "gromacs/random/random.h"
 #include "names.h"
 #include "gmx_fatal.h"
 #include "txtdump.h"
diff --git a/src/gromacs/mdlib/sim_util.c b/src/gromacs/mdlib/sim_util.c
index df0e1922c8..e154f2eba2 100644
--- a/src/gromacs/mdlib/sim_util.c
+++ b/src/gromacs/mdlib/sim_util.c
@@ -69,7 +69,6 @@
 #include "constr.h"
 #include "xvgr.h"
 #include "copyrite.h"
-#include "gromacs/random/random.h"
 #include "domdec.h"
 #include "genborn.h"
 #include "nbnxn_atomdata.h"
diff --git a/src/gromacs/mdlib/tpi.c b/src/gromacs/mdlib/tpi.c
index 3f944c9e4c..3d6d5a0894 100644
--- a/src/gromacs/mdlib/tpi.c
+++ b/src/gromacs/mdlib/tpi.c
@@ -142,14 +142,18 @@ double do_tpi(FILE *fplog, t_commrec *cr,
     double          embU, sum_embU, *sum_UgembU, V, V_all, VembU_all;
     t_trxstatus    *status;
     t_trxframe      rerun_fr;
-    gmx_bool        bDispCorr, bCharge, bRFExcl, bNotLastFrame, bStateChanged, bNS, bOurStep;
+    gmx_bool        bDispCorr, bCharge, bRFExcl, bNotLastFrame, bStateChanged, bNS;
     tensor          force_vir, shake_vir, vir, pres;
     int             cg_tp, a_tp0, a_tp1, ngid, gid_tp, nener, e;
     rvec           *x_mol;
     rvec            mu_tot, x_init, dx, x_tp;
-    int             nnodes, frame, nsteps, step;
+    int             nnodes, frame;
+    gmx_int64_t     frame_step_prev, frame_step;
+    gmx_int64_t     nsteps, stepblocksize = 0, step;
+    gmx_int64_t     rnd_count_stride, rnd_count;
+    gmx_int64_t     seed;
+    double          rnd[4];
     int             i, start, end;
-    gmx_rng_t       tpi_rand;
     FILE           *fp_tpi = NULL;
     char           *ptr, *dump_pdb, **leg, str[STRLEN], str2[STRLEN];
     double          dbl, dump_ener;
@@ -310,7 +314,7 @@ double do_tpi(FILE *fplog, t_commrec *cr,
                 a_tp1-a_tp0, bCharge ? "with" : "without");
 
         fprintf(fplog, "\nWill insert %d times in each frame of %s\n",
-                nsteps, opt2fn("-rerun", nfile, fnm));
+                (int)nsteps, opt2fn("-rerun", nfile, fnm));
     }
 
     if (!bCavity)
@@ -356,8 +360,17 @@ double do_tpi(FILE *fplog, t_commrec *cr,
     }
     snew(sum_UgembU, nener);
 
-    /* Initialize random generator */
-    tpi_rand = gmx_rng_init(inputrec->ld_seed);
+    /* Copy the random seed set by the user */
+    seed = inputrec->ld_seed;
+    /* We use the frame step number as one random counter.
+     * The second counter use the insertion (step) count. But we
+     * need multiple random numbers per insertion. This number is
+     * not fixed, since we generate random locations in a sphere
+     * by putting locations in a cube and some of these fail.
+     * A count of 20 is already extremely unlikely, so 10000 is
+     * a safe margin for random numbers per insertion.
+     */
+    rnd_count_stride = 10000;
 
     if (MASTER(cr))
     {
@@ -422,6 +435,9 @@ double do_tpi(FILE *fplog, t_commrec *cr,
     nbin    = 10;
     snew(bin, nbin);
 
+    /* Avoid frame step numbers <= -1 */
+    frame_step_prev = -1;
+
     bNotLastFrame = read_first_frame(oenv, &status, opt2fn("-rerun", nfile, fnm),
                                      &rerun_fr, TRX_NEED_X);
     frame = 0;
@@ -438,6 +454,18 @@ double do_tpi(FILE *fplog, t_commrec *cr,
 
     refvolshift = log(det(rerun_fr.box));
 
+    switch (inputrec->eI)
+    {
+        case eiTPI:
+            stepblocksize = inputrec->nstlist;
+            break;
+        case eiTPIC:
+            stepblocksize = 1;
+            break;
+        default:
+            gmx_fatal(FARGS, "Unknown integrator %s", ei_names[inputrec->eI]);
+    }
+
 #ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
     /* Make sure we don't detect SSE overflow generated before this point */
     gmx_mm_check_and_reset_overflow();
@@ -445,6 +473,18 @@ double do_tpi(FILE *fplog, t_commrec *cr,
 
     while (bNotLastFrame)
     {
+        frame_step      = rerun_fr.step;
+        if (frame_step <= frame_step_prev)
+        {
+            /* We don't have step number in the trajectory file,
+             * or we have constant or decreasing step numbers.
+             * Ensure we have increasing step numbers, since we use
+             * the step numbers as a counter for random numbers.
+             */
+            frame_step  = frame_step_prev + 1;
+        }
+        frame_step_prev = frame_step;
+
         lambda = rerun_fr.lambda;
         t      = rerun_fr.time;
 
@@ -466,11 +506,18 @@ double do_tpi(FILE *fplog, t_commrec *cr,
 
         bStateChanged = TRUE;
         bNS           = TRUE;
-        for (step = 0; step < nsteps; step++)
+
+        step = cr->nodeid*stepblocksize;
+        while (step < nsteps)
         {
-            /* In parallel all nodes generate all random configurations.
-             * In that way the result is identical to a single cpu tpi run.
+            /* Initialize the second counter for random numbers using
+             * the insertion step index. This ensures that we get
+             * the same random numbers independently of how many
+             * MPI ranks we use. Also for the same seed, we get
+             * the same initial random sequence for different nsteps.
              */
+            rnd_count = step*rnd_count_stride;
+
             if (!bCavity)
             {
                 /* Random insertion in the whole volume */
@@ -478,9 +525,12 @@ double do_tpi(FILE *fplog, t_commrec *cr,
                 if (bNS)
                 {
                     /* Generate a random position in the box */
-                    x_init[XX] = gmx_rng_uniform_real(tpi_rand)*state->box[XX][XX];
-                    x_init[YY] = gmx_rng_uniform_real(tpi_rand)*state->box[YY][YY];
-                    x_init[ZZ] = gmx_rng_uniform_real(tpi_rand)*state->box[ZZ][ZZ];
+                    gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd);
+                    gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2);
+                    for (d = 0; d < DIM; d++)
+                    {
+                        x_init[d] = rnd[d]*state->box[d][d];
+                    }
                 }
                 if (inputrec->nstlist == 1)
                 {
@@ -491,9 +541,12 @@ double do_tpi(FILE *fplog, t_commrec *cr,
                     /* Generate coordinates within |dx|=drmax of x_init */
                     do
                     {
-                        dx[XX] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax;
-                        dx[YY] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax;
-                        dx[ZZ] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax;
+                        gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd);
+                        gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2);
+                        for (d = 0; d < DIM; d++)
+                        {
+                            dx[d] = (2*rnd[d] - 1)*drmax;
+                        }
                     }
                     while (norm2(dx) > drmax*drmax);
                     rvec_add(x_init, dx, x_tp);
@@ -534,9 +587,12 @@ double do_tpi(FILE *fplog, t_commrec *cr,
                 /* Generate coordinates within |dx|=drmax of x_init */
                 do
                 {
-                    dx[XX] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax;
-                    dx[YY] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax;
-                    dx[ZZ] = (2*gmx_rng_uniform_real(tpi_rand) - 1)*drmax;
+                    gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd);
+                    gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2);
+                    for (d = 0; d < DIM; d++)
+                    {
+                        dx[d] = (2*rnd[d] - 1)*drmax;
+                    }
                 }
                 while (norm2(dx) > drmax*drmax);
                 rvec_add(x_init, dx, x_tp);
@@ -555,10 +611,12 @@ double do_tpi(FILE *fplog, t_commrec *cr,
                     copy_rvec(x_mol[i-a_tp0], state->x[i]);
                 }
                 /* Rotate the molecule randomly */
+                gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd);
+                gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2);
                 rotate_conf(a_tp1-a_tp0, state->x+a_tp0, NULL,
-                            2*M_PI*gmx_rng_uniform_real(tpi_rand),
-                            2*M_PI*gmx_rng_uniform_real(tpi_rand),
-                            2*M_PI*gmx_rng_uniform_real(tpi_rand));
+                            2*M_PI*rnd[0],
+                            2*M_PI*rnd[1],
+                            2*M_PI*rnd[2]);
                 /* Shift to the insertion location */
                 for (i = a_tp0; i < a_tp1; i++)
                 {
@@ -566,188 +624,176 @@ double do_tpi(FILE *fplog, t_commrec *cr,
                 }
             }
 
-            /* Check if this insertion belongs to this node */
-            bOurStep = TRUE;
-            if (PAR(cr))
+            /* Clear some matrix variables  */
+            clear_mat(force_vir);
+            clear_mat(shake_vir);
+            clear_mat(vir);
+            clear_mat(pres);
+
+            /* Set the charge group center of mass of the test particle */
+            copy_rvec(x_init, fr->cg_cm[top->cgs.nr-1]);
+
+            /* Calc energy (no forces) on new positions.
+             * Since we only need the intermolecular energy
+             * and the RF exclusion terms of the inserted molecule occur
+             * within a single charge group we can pass NULL for the graph.
+             * This also avoids shifts that would move charge groups
+             * out of the box.
+             *
+             * Some checks above ensure than we can not have
+             * twin-range interactions together with nstlist > 1,
+             * therefore we do not need to remember the LR energies.
+             */
+            /* Make do_force do a single node force calculation */
+            cr->nnodes = 1;
+            do_force(fplog, cr, inputrec,
+                     step, nrnb, wcycle, top, &top_global->groups,
+                     state->box, state->x, &state->hist,
+                     f, force_vir, mdatoms, enerd, fcd,
+                     state->lambda,
+                     NULL, fr, NULL, mu_tot, t, NULL, NULL, FALSE,
+                     GMX_FORCE_NONBONDED | GMX_FORCE_ENERGY |
+                     (bNS ? GMX_FORCE_DYNAMICBOX | GMX_FORCE_NS | GMX_FORCE_DO_LR : 0) |
+                     (bStateChanged ? GMX_FORCE_STATECHANGED : 0));
+            cr->nnodes    = nnodes;
+            bStateChanged = FALSE;
+            bNS           = FALSE;
+
+            /* Calculate long range corrections to pressure and energy */
+            calc_dispcorr(fplog, inputrec, fr, step, top_global->natoms, state->box,
+                          lambda, pres, vir, &prescorr, &enercorr, &dvdlcorr);
+            /* figure out how to rearrange the next 4 lines MRS 8/4/2009 */
+            enerd->term[F_DISPCORR]  = enercorr;
+            enerd->term[F_EPOT]     += enercorr;
+            enerd->term[F_PRES]     += prescorr;
+            enerd->term[F_DVDL_VDW] += dvdlcorr;
+
+            epot               = enerd->term[F_EPOT];
+            bEnergyOutOfBounds = FALSE;
+#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
+            /* With SSE the energy can overflow, check for this */
+            if (gmx_mm_check_and_reset_overflow())
             {
-                switch (inputrec->eI)
+                if (debug)
                 {
-                    case eiTPI:
-                        bOurStep = ((step / inputrec->nstlist) % nnodes == cr->nodeid);
-                        break;
-                    case eiTPIC:
-                        bOurStep = (step % nnodes == cr->nodeid);
-                        break;
-                    default:
-                        gmx_fatal(FARGS, "Unknown integrator %s", ei_names[inputrec->eI]);
+                    fprintf(debug, "Found an SSE overflow, assuming the energy is out of bounds\n");
                 }
+                bEnergyOutOfBounds = TRUE;
             }
-            if (bOurStep)
-            {
-                /* Clear some matrix variables  */
-                clear_mat(force_vir);
-                clear_mat(shake_vir);
-                clear_mat(vir);
-                clear_mat(pres);
-
-                /* Set the charge group center of mass of the test particle */
-                copy_rvec(x_init, fr->cg_cm[top->cgs.nr-1]);
-
-                /* Calc energy (no forces) on new positions.
-                 * Since we only need the intermolecular energy
-                 * and the RF exclusion terms of the inserted molecule occur
-                 * within a single charge group we can pass NULL for the graph.
-                 * This also avoids shifts that would move charge groups
-                 * out of the box.
-                 *
-                 * Some checks above ensure than we can not have
-                 * twin-range interactions together with nstlist > 1,
-                 * therefore we do not need to remember the LR energies.
-                 */
-                /* Make do_force do a single node force calculation */
-                cr->nnodes = 1;
-                do_force(fplog, cr, inputrec,
-                         step, nrnb, wcycle, top, &top_global->groups,
-                         state->box, state->x, &state->hist,
-                         f, force_vir, mdatoms, enerd, fcd,
-                         state->lambda,
-                         NULL, fr, NULL, mu_tot, t, NULL, NULL, FALSE,
-                         GMX_FORCE_NONBONDED | GMX_FORCE_ENERGY |
-                         (bNS ? GMX_FORCE_DYNAMICBOX | GMX_FORCE_NS | GMX_FORCE_DO_LR : 0) |
-                         (bStateChanged ? GMX_FORCE_STATECHANGED : 0));
-                cr->nnodes    = nnodes;
-                bStateChanged = FALSE;
-                bNS           = FALSE;
-
-                /* Calculate long range corrections to pressure and energy */
-                calc_dispcorr(fplog, inputrec, fr, step, top_global->natoms, state->box,
-                              lambda, pres, vir, &prescorr, &enercorr, &dvdlcorr);
-                /* figure out how to rearrange the next 4 lines MRS 8/4/2009 */
-                enerd->term[F_DISPCORR]  = enercorr;
-                enerd->term[F_EPOT]     += enercorr;
-                enerd->term[F_PRES]     += prescorr;
-                enerd->term[F_DVDL_VDW] += dvdlcorr;
-
-                epot               = enerd->term[F_EPOT];
-                bEnergyOutOfBounds = FALSE;
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-                /* With SSE the energy can overflow, check for this */
-                if (gmx_mm_check_and_reset_overflow())
-                {
-                    if (debug)
-                    {
-                        fprintf(debug, "Found an SSE overflow, assuming the energy is out of bounds\n");
-                    }
-                    bEnergyOutOfBounds = TRUE;
-                }
 #endif
-                /* If the compiler doesn't optimize this check away
-                 * we catch the NAN energies.
-                 * The epot>GMX_REAL_MAX check catches inf values,
-                 * which should nicely result in embU=0 through the exp below,
-                 * but it does not hurt to check anyhow.
-                 */
-                /* Non-bonded Interaction usually diverge at r=0.
-                 * With tabulated interaction functions the first few entries
-                 * should be capped in a consistent fashion between
-                 * repulsion, dispersion and Coulomb to avoid accidental
-                 * negative values in the total energy.
-                 * The table generation code in tables.c does this.
-                 * With user tbales the user should take care of this.
-                 */
-                if (epot != epot || epot > GMX_REAL_MAX)
+            /* If the compiler doesn't optimize this check away
+             * we catch the NAN energies.
+             * The epot>GMX_REAL_MAX check catches inf values,
+             * which should nicely result in embU=0 through the exp below,
+             * but it does not hurt to check anyhow.
+             */
+            /* Non-bonded Interaction usually diverge at r=0.
+             * With tabulated interaction functions the first few entries
+             * should be capped in a consistent fashion between
+             * repulsion, dispersion and Coulomb to avoid accidental
+             * negative values in the total energy.
+             * The table generation code in tables.c does this.
+             * With user tbales the user should take care of this.
+             */
+            if (epot != epot || epot > GMX_REAL_MAX)
+            {
+                bEnergyOutOfBounds = TRUE;
+            }
+            if (bEnergyOutOfBounds)
+            {
+                if (debug)
                 {
-                    bEnergyOutOfBounds = TRUE;
+                    fprintf(debug, "\n  time %.3f, step %d: non-finite energy %f, using exp(-bU)=0\n", t, (int)step, epot);
                 }
-                if (bEnergyOutOfBounds)
+                embU = 0;
+            }
+            else
+            {
+                embU      = exp(-beta*epot);
+                sum_embU += embU;
+                /* Determine the weighted energy contributions of each energy group */
+                e                = 0;
+                sum_UgembU[e++] += epot*embU;
+                if (fr->bBHAM)
                 {
-                    if (debug)
+                    for (i = 0; i < ngid; i++)
                     {
-                        fprintf(debug, "\n  time %.3f, step %d: non-finite energy %f, using exp(-bU)=0\n", t, step, epot);
+                        sum_UgembU[e++] +=
+                            (enerd->grpp.ener[egBHAMSR][GID(i, gid_tp, ngid)] +
+                             enerd->grpp.ener[egBHAMLR][GID(i, gid_tp, ngid)])*embU;
                     }
-                    embU = 0;
                 }
                 else
                 {
-                    embU      = exp(-beta*epot);
-                    sum_embU += embU;
-                    /* Determine the weighted energy contributions of each energy group */
-                    e                = 0;
-                    sum_UgembU[e++] += epot*embU;
-                    if (fr->bBHAM)
-                    {
-                        for (i = 0; i < ngid; i++)
-                        {
-                            sum_UgembU[e++] +=
-                                (enerd->grpp.ener[egBHAMSR][GID(i, gid_tp, ngid)] +
-                                 enerd->grpp.ener[egBHAMLR][GID(i, gid_tp, ngid)])*embU;
-                        }
-                    }
-                    else
-                    {
-                        for (i = 0; i < ngid; i++)
-                        {
-                            sum_UgembU[e++] +=
-                                (enerd->grpp.ener[egLJSR][GID(i, gid_tp, ngid)] +
-                                 enerd->grpp.ener[egLJLR][GID(i, gid_tp, ngid)])*embU;
-                        }
-                    }
-                    if (bDispCorr)
+                    for (i = 0; i < ngid; i++)
                     {
-                        sum_UgembU[e++] += enerd->term[F_DISPCORR]*embU;
-                    }
-                    if (bCharge)
-                    {
-                        for (i = 0; i < ngid; i++)
-                        {
-                            sum_UgembU[e++] +=
-                                (enerd->grpp.ener[egCOULSR][GID(i, gid_tp, ngid)] +
-                                 enerd->grpp.ener[egCOULLR][GID(i, gid_tp, ngid)])*embU;
-                        }
-                        if (bRFExcl)
-                        {
-                            sum_UgembU[e++] += enerd->term[F_RF_EXCL]*embU;
-                        }
-                        if (EEL_FULL(fr->eeltype))
-                        {
-                            sum_UgembU[e++] += enerd->term[F_COUL_RECIP]*embU;
-                        }
+                        sum_UgembU[e++] +=
+                            (enerd->grpp.ener[egLJSR][GID(i, gid_tp, ngid)] +
+                             enerd->grpp.ener[egLJLR][GID(i, gid_tp, ngid)])*embU;
                     }
                 }
-
-                if (embU == 0 || beta*epot > bU_bin_limit)
+                if (bDispCorr)
                 {
-                    bin[0]++;
+                    sum_UgembU[e++] += enerd->term[F_DISPCORR]*embU;
                 }
-                else
+                if (bCharge)
                 {
-                    i = (int)((bU_logV_bin_limit
-                               - (beta*epot - logV + refvolshift))*invbinw
-                              + 0.5);
-                    if (i < 0)
+                    for (i = 0; i < ngid; i++)
+                    {
+                        sum_UgembU[e++] +=
+                            (enerd->grpp.ener[egCOULSR][GID(i, gid_tp, ngid)] +
+                             enerd->grpp.ener[egCOULLR][GID(i, gid_tp, ngid)])*embU;
+                    }
+                    if (bRFExcl)
                     {
-                        i = 0;
+                        sum_UgembU[e++] += enerd->term[F_RF_EXCL]*embU;
                     }
-                    if (i >= nbin)
+                    if (EEL_FULL(fr->eeltype))
                     {
-                        realloc_bins(&bin, &nbin, i+10);
+                        sum_UgembU[e++] += enerd->term[F_COUL_RECIP]*embU;
                     }
-                    bin[i]++;
                 }
+            }
 
-                if (debug)
+            if (embU == 0 || beta*epot > bU_bin_limit)
+            {
+                bin[0]++;
+            }
+            else
+            {
+                i = (int)((bU_logV_bin_limit
+                           - (beta*epot - logV + refvolshift))*invbinw
+                          + 0.5);
+                if (i < 0)
                 {
-                    fprintf(debug, "TPI %7d %12.5e %12.5f %12.5f %12.5f\n",
-                            step, epot, x_tp[XX], x_tp[YY], x_tp[ZZ]);
+                    i = 0;
                 }
-
-                if (dump_pdb && epot <= dump_ener)
+                if (i >= nbin)
                 {
-                    sprintf(str, "t%g_step%d.pdb", t, step);
-                    sprintf(str2, "t: %f step %d ener: %f", t, step, epot);
-                    write_sto_conf_mtop(str, str2, top_global, state->x, state->v,
-                                        inputrec->ePBC, state->box);
+                    realloc_bins(&bin, &nbin, i+10);
                 }
+                bin[i]++;
+            }
+
+            if (debug)
+            {
+                fprintf(debug, "TPI %7d %12.5e %12.5f %12.5f %12.5f\n",
+                        (int)step, epot, x_tp[XX], x_tp[YY], x_tp[ZZ]);
+            }
+
+            if (dump_pdb && epot <= dump_ener)
+            {
+                sprintf(str, "t%g_step%d.pdb", t, (int)step);
+                sprintf(str2, "t: %f step %d ener: %f", t, (int)step, epot);
+                write_sto_conf_mtop(str, str2, top_global, state->x, state->v,
+                                    inputrec->ePBC, state->box);
+            }
+
+            step++;
+            if ((step/stepblocksize) % cr->nnodes != cr->nodeid)
+            {
+                /* Skip all steps assigned to the other MPI ranks */
+                step += (cr->nnodes - 1)*stepblocksize;
             }
         }
 
diff --git a/src/gromacs/mdlib/update.c b/src/gromacs/mdlib/update.c
index 2d904cba90..07dff61a03 100644
--- a/src/gromacs/mdlib/update.c
+++ b/src/gromacs/mdlib/update.c
@@ -91,12 +91,6 @@ typedef struct {
 } gmx_sd_sigma_t;
 
 typedef struct {
-    /* The random state for ngaussrand threads.
-     * Normal thermostats need just 1 random number generator,
-     * but SD and BD with OpenMP parallelization need 1 for each thread.
-     */
-    int             ngaussrand;
-    gmx_rng_t      *gaussrand;
     /* BD stuff */
     real           *bd_rf;
     /* SD stuff */
@@ -116,11 +110,6 @@ typedef struct gmx_update
     rvec         *xp;
     int           xp_nalloc;
 
-    /* variable size arrays for andersen */
-    gmx_bool *randatom;
-    int      *randatom_list;
-    gmx_bool  randatom_list_init;
-
     /* Variables for the deform algorithm */
     gmx_int64_t     deformref_step;
     matrix          deformref_box;
@@ -476,43 +465,7 @@ static void do_update_visc(int start, int nrend, double dt,
     }
 }
 
-/* Allocates and initializes sd->gaussrand[i] for i=1, i<sd->ngaussrand,
- * Using seeds generated from sd->gaussrand[0].
- */
-static void init_multiple_gaussrand(gmx_stochd_t *sd)
-{
-    int           ngr, i;
-    unsigned int *seed;
-
-    ngr = sd->ngaussrand;
-    snew(seed, ngr);
-
-    for (i = 1; i < ngr; i++)
-    {
-        seed[i] = gmx_rng_uniform_uint32(sd->gaussrand[0]);
-    }
-
-    if (ngr != gmx_omp_nthreads_get(emntUpdate))
-    {
-        gmx_incons("The number of Gaussian number generators should be equal to gmx_omp_nthreads_get(emntUpdate)");
-    }
-
-#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate))
-    {
-        int th;
-
-        th = gmx_omp_get_thread_num();
-        if (th > 0)
-        {
-            /* Initialize on each thread to get memory allocated thread-local */
-            sd->gaussrand[th] = gmx_rng_init(seed[th]);
-        }
-    }
-
-    sfree(seed);
-}
-
-static gmx_stochd_t *init_stochd(t_inputrec *ir, int nthreads)
+static gmx_stochd_t *init_stochd(t_inputrec *ir)
 {
     gmx_stochd_t   *sd;
     gmx_sd_const_t *sdc;
@@ -521,30 +474,6 @@ static gmx_stochd_t *init_stochd(t_inputrec *ir, int nthreads)
 
     snew(sd, 1);
 
-    /* Initiate random number generator for langevin type dynamics,
-     * for BD, SD or velocity rescaling temperature coupling.
-     */
-    if (ir->eI == eiBD || EI_SD(ir->eI))
-    {
-        sd->ngaussrand = nthreads;
-    }
-    else
-    {
-        sd->ngaussrand = 1;
-    }
-    snew(sd->gaussrand, sd->ngaussrand);
-
-    /* Initialize the first random generator */
-    sd->gaussrand[0] = gmx_rng_init(ir->ld_seed);
-
-    if (sd->ngaussrand > 1)
-    {
-        /* Initialize the rest of the random number generators,
-         * using the first one to generate seeds.
-         */
-        init_multiple_gaussrand(sd);
-    }
-
     ngtc = ir->opts.ngtc;
 
     if (ir->eI == eiBD)
@@ -628,42 +557,6 @@ static gmx_stochd_t *init_stochd(t_inputrec *ir, int nthreads)
     return sd;
 }
 
-void get_stochd_state(gmx_update_t upd, t_state *state)
-{
-    /* Note that we only get the state of the first random generator,
-     * even if there are multiple. This avoids repetition.
-     */
-    gmx_rng_get_state(upd->sd->gaussrand[0], state->ld_rng, state->ld_rngi);
-}
-
-void set_stochd_state(gmx_update_t upd, t_state *state)
-{
-    gmx_stochd_t *sd;
-    int           i;
-
-    sd = upd->sd;
-
-    gmx_rng_set_state(sd->gaussrand[0], state->ld_rng, state->ld_rngi[0]);
-
-    if (sd->ngaussrand > 1)
-    {
-        /* We only end up here with SD or BD with OpenMP.
-         * Destroy and reinitialize the rest of the random number generators,
-         * using seeds generated from the first one.
-         * Although this doesn't recover the previous state,
-         * it at least avoids repetition, which is most important.
-         * Exaclty restoring states with all MPI+OpenMP setups is difficult
-         * and as the integrator is random to start with, doesn't gain us much.
-         */
-        for (i = 1; i < sd->ngaussrand; i++)
-        {
-            gmx_rng_destroy(sd->gaussrand[i]);
-        }
-
-        init_multiple_gaussrand(sd);
-    }
-}
-
 gmx_update_t init_update(t_inputrec *ir)
 {
     t_gmx_update *upd;
@@ -672,27 +565,24 @@ gmx_update_t init_update(t_inputrec *ir)
 
     if (ir->eI == eiBD || EI_SD(ir->eI) || ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc))
     {
-        upd->sd = init_stochd(ir, gmx_omp_nthreads_get(emntUpdate));
+        upd->sd    = init_stochd(ir);
     }
 
-    upd->xp                 = NULL;
-    upd->xp_nalloc          = 0;
-    upd->randatom           = NULL;
-    upd->randatom_list      = NULL;
-    upd->randatom_list_init = FALSE; /* we have not yet cleared the data structure at this point */
+    upd->xp        = NULL;
+    upd->xp_nalloc = 0;
 
     return upd;
 }
 
 static void do_update_sd1(gmx_stochd_t *sd,
-                          gmx_rng_t gaussrand,
                           int start, int nrend, double dt,
                           rvec accel[], ivec nFreeze[],
                           real invmass[], unsigned short ptype[],
                           unsigned short cFREEZE[], unsigned short cACC[],
                           unsigned short cTC[],
                           rvec x[], rvec xprime[], rvec v[], rvec f[],
-                          int ngtc, real tau_t[], real ref_t[])
+                          int ngtc, real tau_t[], real ref_t[],
+                          gmx_int64_t step, int seed, int* gatindex)
 {
     gmx_sd_const_t *sdc;
     gmx_sd_sigma_t *sig;
@@ -713,6 +603,8 @@ static void do_update_sd1(gmx_stochd_t *sd,
 
     for (n = start; n < nrend; n++)
     {
+        real rnd[3];
+        int  ng  = gatindex ? gatindex[n] : n;
         ism = sqrt(invmass[n]);
         if (cFREEZE)
         {
@@ -727,11 +619,12 @@ static void do_update_sd1(gmx_stochd_t *sd,
             gt  = cTC[n];
         }
 
+        gmx_rng_cycle_3gaussian_table(step, ng, seed, RND_SEED_UPDATE, rnd);
         for (d = 0; d < DIM; d++)
         {
             if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
             {
-                sd_V = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand);
+                sd_V = ism*sig[gt].V*rnd[d];
 
                 v[n][d] = v[n][d]*sdc[gt].em
                     + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em)
@@ -783,7 +676,6 @@ static void do_update_sd2_Tconsts(gmx_stochd_t *sd,
 }
 
 static void do_update_sd2(gmx_stochd_t *sd,
-                          gmx_rng_t gaussrand,
                           gmx_bool bInitStep,
                           int start, int nrend,
                           rvec accel[], ivec nFreeze[],
@@ -793,7 +685,8 @@ static void do_update_sd2(gmx_stochd_t *sd,
                           rvec x[], rvec xprime[], rvec v[], rvec f[],
                           rvec sd_X[],
                           const real tau_t[],
-                          gmx_bool bFirstHalf)
+                          gmx_bool bFirstHalf, gmx_int64_t step, int seed,
+                          int* gatindex)
 {
     gmx_sd_const_t *sdc;
     gmx_sd_sigma_t *sig;
@@ -805,7 +698,7 @@ static void do_update_sd2(gmx_stochd_t *sd,
     int    gf = 0, ga = 0, gt = 0;
     real   vn = 0, Vmh, Xmh;
     real   ism;
-    int    n, d;
+    int    n, d, ng;
 
     sdc  = sd->sdc;
     sig  = sd->sdsig;
@@ -813,6 +706,8 @@ static void do_update_sd2(gmx_stochd_t *sd,
 
     for (n = start; n < nrend; n++)
     {
+        real rnd[6], rndi[3];
+        ng  = gatindex ? gatindex[n] : n;
         ism = sqrt(invmass[n]);
         if (cFREEZE)
         {
@@ -827,6 +722,11 @@ static void do_update_sd2(gmx_stochd_t *sd,
             gt  = cTC[n];
         }
 
+        gmx_rng_cycle_6gaussian_table(step*2+(bFirstHalf ? 1 : 2), ng, seed, RND_SEED_UPDATE, rnd);
+        if (bInitStep)
+        {
+            gmx_rng_cycle_3gaussian_table(step*2, ng, seed, RND_SEED_UPDATE, rndi);
+        }
         for (d = 0; d < DIM; d++)
         {
             if (bFirstHalf)
@@ -839,11 +739,11 @@ static void do_update_sd2(gmx_stochd_t *sd,
                 {
                     if (bInitStep)
                     {
-                        sd_X[n][d] = ism*sig[gt].X*gmx_rng_gaussian_table(gaussrand);
+                        sd_X[n][d] = ism*sig[gt].X*rndi[d];
                     }
                     Vmh = sd_X[n][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c)
-                        + ism*sig[gt].Yv*gmx_rng_gaussian_table(gaussrand);
-                    sd_V[n][d] = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand);
+                        + ism*sig[gt].Yv*rnd[d*2];
+                    sd_V[n][d] = ism*sig[gt].V*rnd[d*2+1];
 
                     v[n][d] = vn*sdc[gt].em
                         + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em)
@@ -853,7 +753,6 @@ static void do_update_sd2(gmx_stochd_t *sd,
                 }
                 else
                 {
-
                     /* Correct the velocities for the constraints.
                      * This operation introduces some inaccuracy,
                      * since the velocity is determined from differences in coordinates.
@@ -862,8 +761,8 @@ static void do_update_sd2(gmx_stochd_t *sd,
                         (xprime[n][d] - x[n][d])/(tau_t[gt]*(sdc[gt].eph - sdc[gt].emh));
 
                     Xmh = sd_V[n][d]*tau_t[gt]*sdc[gt].d/(sdc[gt].em-1)
-                        + ism*sig[gt].Yx*gmx_rng_gaussian_table(gaussrand);
-                    sd_X[n][d] = ism*sig[gt].X*gmx_rng_gaussian_table(gaussrand);
+                        + ism*sig[gt].Yx*rnd[d*2];
+                    sd_X[n][d] = ism*sig[gt].X*rnd[d*2+1];
 
                     xprime[n][d] += sd_X[n][d] - Xmh;
 
@@ -910,7 +809,8 @@ static void do_update_bd(int start, int nrend, double dt,
                          unsigned short cFREEZE[], unsigned short cTC[],
                          rvec x[], rvec xprime[], rvec v[],
                          rvec f[], real friction_coefficient,
-                         real *rf, gmx_rng_t gaussrand)
+                         real *rf, gmx_int64_t step, int seed,
+                         int* gatindex)
 {
     /* note -- these appear to be full step velocities . . .  */
     int    gf = 0, gt = 0;
@@ -925,6 +825,9 @@ static void do_update_bd(int start, int nrend, double dt,
 
     for (n = start; (n < nrend); n++)
     {
+        real rnd[3];
+        int  ng  = gatindex ? gatindex[n] : n;
+
         if (cFREEZE)
         {
             gf = cFREEZE[n];
@@ -933,19 +836,20 @@ static void do_update_bd(int start, int nrend, double dt,
         {
             gt = cTC[n];
         }
+        gmx_rng_cycle_3gaussian_table(step, ng, seed, RND_SEED_UPDATE, rnd);
         for (d = 0; (d < DIM); d++)
         {
             if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
             {
                 if (friction_coefficient != 0)
                 {
-                    vn = invfr*f[n][d] + rf[gt]*gmx_rng_gaussian_table(gaussrand);
+                    vn = invfr*f[n][d] + rf[gt]*rnd[d];
                 }
                 else
                 {
                     /* NOTE: invmass = 2/(mass*friction_constant*dt) */
                     vn = 0.5*invmass[n]*f[n][d]*dt
-                        + sqrt(0.5*invmass[n])*rf[gt]*gmx_rng_gaussian_table(gaussrand);
+                        + sqrt(0.5*invmass[n])*rf[gt]*rnd[d];
                 }
 
                 v[n][d]      = vn;
@@ -1376,7 +1280,6 @@ void update_tcouple(gmx_int64_t       step,
                     t_inputrec       *inputrec,
                     t_state          *state,
                     gmx_ekindata_t   *ekind,
-                    gmx_update_t      upd,
                     t_extmass        *MassQ,
                     t_mdatoms        *md)
 
@@ -1421,8 +1324,8 @@ void update_tcouple(gmx_int64_t       step,
                                   state->nosehoover_xi, state->nosehoover_vxi, MassQ);
                 break;
             case etcVRESCALE:
-                vrescale_tcoupl(inputrec, ekind, dttc,
-                                state->therm_integral, upd->sd->gaussrand[0]);
+                vrescale_tcoupl(inputrec, step, ekind, dttc,
+                                state->therm_integral);
                 break;
         }
         /* rescale in place here */
@@ -1652,14 +1555,15 @@ void update_constraints(FILE             *fplog,
             end_th   = start + ((nrend-start)*(th+1))/nth;
 
             /* The second part of the SD integration */
-            do_update_sd2(upd->sd, upd->sd->gaussrand[th],
+            do_update_sd2(upd->sd,
                           FALSE, start_th, end_th,
                           inputrec->opts.acc, inputrec->opts.nFreeze,
                           md->invmass, md->ptype,
                           md->cFREEZE, md->cACC, md->cTC,
                           state->x, xprime, state->v, force, state->sd_X,
                           inputrec->opts.tau_t,
-                          FALSE);
+                          FALSE, step, inputrec->ld_seed,
+                          DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
         }
         inc_nrnb(nrnb, eNR_UPDATE, homenr);
 
@@ -1951,26 +1855,28 @@ void update_coords(FILE             *fplog,
                 }
                 break;
             case (eiSD1):
-                do_update_sd1(upd->sd, upd->sd->gaussrand[th],
+                do_update_sd1(upd->sd,
                               start_th, end_th, dt,
                               inputrec->opts.acc, inputrec->opts.nFreeze,
                               md->invmass, md->ptype,
                               md->cFREEZE, md->cACC, md->cTC,
                               state->x, xprime, state->v, force,
-                              inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t);
+                              inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t,
+                              step, inputrec->ld_seed, DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
                 break;
             case (eiSD2):
                 /* The SD update is done in 2 parts, because an extra constraint step
                  * is needed
                  */
-                do_update_sd2(upd->sd, upd->sd->gaussrand[th],
+                do_update_sd2(upd->sd,
                               bInitStep, start_th, end_th,
                               inputrec->opts.acc, inputrec->opts.nFreeze,
                               md->invmass, md->ptype,
                               md->cFREEZE, md->cACC, md->cTC,
                               state->x, xprime, state->v, force, state->sd_X,
                               inputrec->opts.tau_t,
-                              TRUE);
+                              TRUE, step, inputrec->ld_seed,
+                              DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
                 break;
             case (eiBD):
                 do_update_bd(start_th, end_th, dt,
@@ -1978,7 +1884,8 @@ void update_coords(FILE             *fplog,
                              md->cFREEZE, md->cTC,
                              state->x, xprime, state->v, force,
                              inputrec->bd_fric,
-                             upd->sd->bd_rf, upd->sd->gaussrand[th]);
+                             upd->sd->bd_rf,
+                             step, inputrec->ld_seed, DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
                 break;
             case (eiVV):
             case (eiVVAK):
@@ -2066,31 +1973,23 @@ void correct_ekin(FILE *log, int start, int end, rvec v[], rvec vcm, real mass[]
             mv[XX], mv[YY], mv[ZZ]);
 }
 
-extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_int64_t step, t_mdatoms *md, t_state *state, gmx_update_t upd, t_idef *idef, gmx_constr_t constr)
+extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_int64_t step, const t_commrec *cr,
+                                            t_mdatoms *md, t_state *state, gmx_update_t upd, gmx_constr_t constr)
 {
 
     int  i;
     real rate = (ir->delta_t)/ir->opts.tau_t[0];
+
+    if (ir->etc == etcANDERSEN && constr != NULL)
+    {
+        gmx_fatal(FARGS, "Normal Andersen is currently not supported with constraints, use massive Andersen instead");
+    }
+
     /* proceed with andersen if 1) it's fixed probability per
        particle andersen or 2) it's massive andersen and it's tau_t/dt */
     if ((ir->etc == etcANDERSEN) || do_per_step(step, (int)(1.0/rate)))
     {
-        srenew(upd->randatom, state->nalloc);
-        srenew(upd->randatom_list, state->nalloc);
-        if (upd->randatom_list_init == FALSE)
-        {
-            for (i = 0; i < state->nalloc; i++)
-            {
-                upd->randatom[i]      = FALSE;
-                upd->randatom_list[i] = 0;
-            }
-            upd->randatom_list_init = TRUE;
-        }
-        andersen_tcoupl(ir, md, state, upd->sd->gaussrand[0], rate,
-                        (ir->etc == etcANDERSEN) ? idef : NULL,
-                        constr ? get_nblocks(constr) : 0,
-                        constr ? get_sblock(constr) : NULL,
-                        upd->randatom, upd->randatom_list,
+        andersen_tcoupl(ir, step, cr, md, state, rate,
                         upd->sd->randomize_group, upd->sd->boltzfac);
         return TRUE;
     }
diff --git a/src/gromacs/random/random.c b/src/gromacs/random/random.c
index ab103a9d7a..b70fddfcde 100644
--- a/src/gromacs/random/random.c
+++ b/src/gromacs/random/random.c
@@ -51,6 +51,8 @@
 #include <process.h>
 #endif
 
+#include "external/Random123-1.08/include/Random123/threefry.h"
+
 #include "gromacs/math/utilities.h"
 #include "random_gausstable.h"
 
@@ -60,15 +62,15 @@
 #define RNG_UPPER_MASK 0x80000000UL /* most significant w-r bits */
 #define RNG_LOWER_MASK 0x7fffffffUL /* least significant r bits */
 
-/* Note that if you change the size of the Gaussian table you will also
- * have to generate new initialization data for the table in
+/* Note that if you change the size of the Gaussian table you will
+ * also have to generate new initialization data for the table in
  * gmx_random_gausstable.h
  *
  * A routine print_gaussian_table() is in contrib/random.c
  * for convenience - use it if you need a different size of the table.
  */
 #define GAUSS_TABLE 14 /* the size of the gauss table is 2^GAUSS_TABLE */
-#define GAUSS_SHIFT (32 - GAUSS_TABLE)
+#define GAUSS_MASK  ((1 << GAUSS_TABLE) - 1)
 
 
 struct gmx_rng {
@@ -387,10 +389,8 @@ gmx_rng_uniform_real(gmx_rng_t rng)
      * we are limited to an accuracy of 1e-7.
      */
 }
-
-
-
 real
+
 gmx_rng_gaussian_table(gmx_rng_t rng)
 {
     unsigned int i;
@@ -398,5 +398,52 @@ gmx_rng_gaussian_table(gmx_rng_t rng)
     i = gmx_rng_uniform_uint32(rng);
 
     /* The Gaussian table is a static constant in this file */
-    return gaussian_table[i >> GAUSS_SHIFT];
+    return gaussian_table[i >> (32 - GAUSS_TABLE)];
+}
+
+void
+gmx_rng_cycle_2uniform(gmx_int64_t ctr1, gmx_int64_t ctr2,
+                       gmx_int64_t key1, gmx_int64_t key2,
+                       double* rnd)
+{
+    const gmx_int64_t  mask_53bits     = 0x1FFFFFFFFFFFFF;
+    const double       two_power_min53 = 1.0/9007199254740992.0;
+
+    threefry2x64_ctr_t ctr  = {{ctr1, ctr2}};
+    threefry2x64_key_t key  = {{key1, key2}};
+    threefry2x64_ctr_t rand = threefry2x64(ctr, key);
+
+    rnd[0] = (rand.v[0] & mask_53bits)*two_power_min53;
+    rnd[1] = (rand.v[1] & mask_53bits)*two_power_min53;
+}
+
+void
+gmx_rng_cycle_3gaussian_table(gmx_int64_t ctr1, gmx_int64_t ctr2,
+                              gmx_int64_t key1, gmx_int64_t key2,
+                              real* rnd)
+{
+    threefry2x64_ctr_t ctr  = {{ctr1, ctr2}};
+    threefry2x64_key_t key  = {{key1, key2}};
+    threefry2x64_ctr_t rand = threefry2x64(ctr, key);
+
+    rnd[0] = gaussian_table[(rand.v[0] >> 48) & GAUSS_MASK];
+    rnd[1] = gaussian_table[(rand.v[0] >> 32) & GAUSS_MASK];
+    rnd[2] = gaussian_table[(rand.v[0] >> 16) & GAUSS_MASK];
+}
+
+void
+gmx_rng_cycle_6gaussian_table(gmx_int64_t ctr1, gmx_int64_t ctr2,
+                              gmx_int64_t key1, gmx_int64_t key2,
+                              real* rnd)
+{
+    threefry2x64_ctr_t ctr  = {{ctr1, ctr2}};
+    threefry2x64_key_t key  = {{key1, key2}};
+    threefry2x64_ctr_t rand = threefry2x64(ctr, key);
+
+    rnd[0] = gaussian_table[(rand.v[0] >> 48) & GAUSS_MASK];
+    rnd[1] = gaussian_table[(rand.v[0] >> 32) & GAUSS_MASK];
+    rnd[2] = gaussian_table[(rand.v[0] >> 16) & GAUSS_MASK];
+    rnd[3] = gaussian_table[(rand.v[1] >> 48) & GAUSS_MASK];
+    rnd[4] = gaussian_table[(rand.v[1] >> 32) & GAUSS_MASK];
+    rnd[5] = gaussian_table[(rand.v[1] >> 16) & GAUSS_MASK];
 }
diff --git a/src/gromacs/random/random.h b/src/gromacs/random/random.h
index 739aaf71c4..88dad71fcf 100644
--- a/src/gromacs/random/random.h
+++ b/src/gromacs/random/random.h
@@ -45,6 +45,14 @@
 extern "C" {
 #endif
 
+/*! Fixed random number seeds for different types of RNG */
+#define RND_SEED_UPDATE    1 /*!< For coordinate update (sd, bd, ..) */
+#define RND_SEED_REPLEX    2 /*!< For replica exchange */
+#define RND_SEED_VRESCALE  3 /*!< For V-rescale thermostat */
+#define RND_SEED_ANDERSEN  4 /*!< For Andersen thermostat */
+#define RND_SEED_TPI       5 /*!< For test particle insertion */
+#define RND_SEED_EXPANDED  6 /*!< For expanded emseble methods */
+
 /*! \brief Abstract datatype for a random number generator
  *
  * This is a handle to the full state of a random number generator.
@@ -260,6 +268,54 @@ gmx_rng_gaussian_real(gmx_rng_t rng);
 real
 gmx_rng_gaussian_table(gmx_rng_t rng);
 
+
+/* The stateless cycle based random number generators below,
+ * which all use threefry2x64, take the following arguments:
+ *
+ * ctr1: In mdrun the step counter, in tools the frame(-step)
+ *       counter, so we can ensure reproducible results, even
+ *       we starting at different steps/frames. Might need to be
+ *       multiplied by a constant if we need more random numbers.
+ * ctr2: A local counter, in mdrun often a global atom index.
+ *       If any algorithm needs a variable number of random numbers,
+ *       the second counter is usually a function of the local
+ *       counter.
+ * key1: A user provided random seed.
+ * key2: A fixed seed which is particular for the algorithm,
+ *       as defined at the top of this file, to ensure different
+ *       random sequences when the same user seed is used for
+ *       different algorithms.
+ */
+
+/* Return two uniform random numbers with 2^53 equally
+ * probable values between 0 and 1 - 2^-53.
+ * It uses a stateless counter based random number generator
+ * (threefry2x64).
+ */
+void
+gmx_rng_cycle_2uniform(gmx_int64_t ctr1, gmx_int64_t ctr2,
+                       gmx_int64_t key1, gmx_int64_t key2,
+                       double* rnd);
+
+/* Return three Gaussian random numbers with expectation value
+ * 0.0 and standard deviation 1.0. This routine uses a table
+ * lookup for maximum speed. It uses a stateless counter
+ * based random number generator (threefry2x64). See
+ * gmx_rng_gaussian_table for a warning about accuracy of the table.
+ *
+ * threadsafe: yes
+ */
+void
+gmx_rng_cycle_3gaussian_table(gmx_int64_t ctr1, gmx_int64_t ctr2,
+                              gmx_int64_t key1, gmx_int64_t key2,
+                              real* rnd);
+
+/* As gmx_rng_3gaussian_table, but returns 6 Gaussian numbers. */
+void
+gmx_rng_cycle_6gaussian_table(gmx_int64_t ctr1, gmx_int64_t ctr2,
+                              gmx_int64_t key1, gmx_int64_t key2,
+                              real* rnd);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gromacs/tools/convert_tpr.c b/src/gromacs/tools/convert_tpr.c
index 423639082e..c3297ed7cc 100644
--- a/src/gromacs/tools/convert_tpr.c
+++ b/src/gromacs/tools/convert_tpr.c
@@ -51,7 +51,6 @@
 #include "gromacs/commandline/pargs.h"
 #include "vec.h"
 #include "mtop_util.h"
-#include "gromacs/random/random.h"
 #include "checkpoint.h"
 #include "gromacs/fileio/tpxio.h"
 #include "gromacs/fileio/trnio.h"
@@ -464,13 +463,6 @@ int gmx_convert_tpr(int argc, char *argv[])
                     "If you want that, supply a checkpoint file to mdrun\n\n");
         }
 
-        if (EI_SD(ir->eI) || ir->eI == eiBD)
-        {
-            fprintf(stderr, "\nChanging ld-seed from %d ", ir->ld_seed);
-            ir->ld_seed = (int)gmx_rng_make_seed();
-            fprintf(stderr, "to %d\n\n", ir->ld_seed);
-        }
-
         frame_fn = ftp2fn(efTRN, NFILE, fnm);
 
         if (fn2ftp(frame_fn) == efCPT)
diff --git a/src/programs/mdrun/md.c b/src/programs/mdrun/md.c
index 04a4f7bb63..3ae70f5038 100644
--- a/src/programs/mdrun/md.c
+++ b/src/programs/mdrun/md.c
@@ -182,7 +182,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     gmx_update_t      upd   = NULL;
     t_graph          *graph = NULL;
     globsig_t         gs;
-    gmx_rng_t         mcrng = NULL;
     gmx_groups_t     *groups;
     gmx_ekindata_t   *ekind, *ekind_save;
     gmx_shellfc_t     shellfc;
@@ -413,7 +412,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 
     if (ir->bExpanded)
     {
-        init_expanded_ensemble(bStateFromCP, ir, &mcrng, &state->dfhist);
+        init_expanded_ensemble(bStateFromCP, ir, &state->dfhist);
     }
 
     if (MASTER(cr))
@@ -438,17 +437,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         update_energyhistory(&state_global->enerhist, mdebin);
     }
 
-    if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG))
-    {
-        /* Set the random state if we read a checkpoint file */
-        set_stochd_state(upd, state);
-    }
-
-    if (state->flags & (1<<estMC_RNG))
-    {
-        set_mc_state(mcrng, state);
-    }
-
     /* Initialize constraints */
     if (constr && !DOMAINDECOMP(cr))
     {
@@ -1287,7 +1275,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                statistics, but if performing simulated tempering, we
                do update the velocities and the tau_t. */
 
-            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, &state->dfhist, step, mcrng, state->v, mdatoms);
+            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, &state->dfhist, step, state->v, mdatoms);
             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
             copy_df_history(&state_global->dfhist, &state->dfhist);
         }
@@ -1297,9 +1285,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
          * the update.
          */
         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
-                                 ir, state, state_global, top_global, fr, upd,
+                                 ir, state, state_global, top_global, fr,
                                  outf, mdebin, ekind, f, f_global,
-                                 wcycle, mcrng, &nchkpt,
+                                 wcycle, &nchkpt,
                                  bCPT, bRerunMD, bLastStep, (Flags & MD_CONFOUT),
                                  bSumEkinhOld);
 
@@ -1408,12 +1396,12 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         {
             if (!bInitStep)
             {
-                update_tcouple(step, ir, state, ekind, upd, &MassQ, mdatoms);
+                update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
             }
             if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
             {
                 gmx_bool bIfRandomize;
-                bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr);
+                bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state, upd, constr);
                 /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
                 if (constr && bIfRandomize)
                 {
@@ -1486,7 +1474,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                 }
                 else
                 {
-                    update_tcouple(step, ir, state, ekind, upd, &MassQ, mdatoms);
+                    update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
                     update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep);
                 }
 
diff --git a/src/programs/mdrun/repl_ex.c b/src/programs/mdrun/repl_ex.c
index 409f525ab0..21a5426c09 100644
--- a/src/programs/mdrun/repl_ex.c
+++ b/src/programs/mdrun/repl_ex.c
@@ -49,6 +49,7 @@
 #include "vec.h"
 #include "names.h"
 #include "domdec.h"
+#include "gromacs/random/random.h"
 
 #define PROBABILITYCUTOFF 100
 /* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */
@@ -974,18 +975,22 @@ test_for_replica_exchange(FILE                 *fplog,
     if (bMultiEx)
     {
         /* multiple random switch exchange */
-        for (i = 0; i < re->nex; i++)
+        int nself = 0;
+        for (i = 0; i < re->nex + nself; i++)
         {
+            double rnd[2];
+
+            gmx_rng_cycle_2uniform(step, i*2, re->seed, RND_SEED_REPLEX, rnd);
             /* randomly select a pair  */
             /* in theory, could reduce this by identifying only which switches had a nonneglibible
                probability of occurring (log p > -100) and only operate on those switches */
             /* find out which state it is from, and what label that state currently has. Likely
                more work that useful. */
-            i0 = (int)(re->nrepl*gmx_rng_uniform_real(re->rng));
-            i1 = (int)(re->nrepl*gmx_rng_uniform_real(re->rng));
+            i0 = (int)(re->nrepl*rnd[0]);
+            i1 = (int)(re->nrepl*rnd[1]);
             if (i0 == i1)
             {
-                i--;
+                nself++;
                 continue;  /* self-exchange, back up and do it again */
             }
 
@@ -1021,7 +1026,8 @@ test_for_replica_exchange(FILE                 *fplog,
                     prob[0] = exp(-delta);
                 }
                 /* roll a number to determine if accepted */
-                bEx[0] = (gmx_rng_uniform_real(re->rng) < prob[0]);
+                gmx_rng_cycle_2uniform(step, i*2+1, re->seed, RND_SEED_REPLEX, rnd);
+                bEx[0] = rnd[0] < prob[0];
             }
             re->prob_sum[0] += prob[0];
 
@@ -1039,6 +1045,7 @@ test_for_replica_exchange(FILE                 *fplog,
     else
     {
         /* standard nearest neighbor replica exchange */
+
         m = (step / re->nst) % 2;
         for (i = 1; i < re->nrepl; i++)
         {
@@ -1057,6 +1064,8 @@ test_for_replica_exchange(FILE                 *fplog,
                 }
                 else
                 {
+                    double rnd[2];
+
                     if (delta > PROBABILITYCUTOFF)
                     {
                         prob[i] = 0;
@@ -1066,7 +1075,8 @@ test_for_replica_exchange(FILE                 *fplog,
                         prob[i] = exp(-delta);
                     }
                     /* roll a number to determine if accepted */
-                    bEx[i] = (gmx_rng_uniform_real(re->rng) < prob[i]);
+                    gmx_rng_cycle_2uniform(step, i, re->seed, RND_SEED_REPLEX, rnd);
+                    bEx[i] = rnd[0] < prob[i];
                 }
                 re->prob_sum[i] += prob[i];
 
diff --git a/src/programs/mdrun/runner.c b/src/programs/mdrun/runner.c
index 9d6211b036..767d0bb875 100644
--- a/src/programs/mdrun/runner.c
+++ b/src/programs/mdrun/runner.c
@@ -1097,7 +1097,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     int                       i, m, nChargePerturbed = -1, nTypePerturbed = 0, status, nalloc;
     char                     *gro;
     gmx_wallcycle_t           wcycle;
-    gmx_bool                  bReadRNG, bReadEkin;
+    gmx_bool                  bReadEkin;
     int                       list;
     gmx_walltime_accounting_t walltime_accounting = NULL;
     int                       rc;
@@ -1299,7 +1299,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     }
 
     /* now make sure the state is initialized and propagated */
-    set_state_entries(state, inputrec, cr->nnodes);
+    set_state_entries(state, inputrec);
 
     /* A parallel command line option consistency check that we can
        only do after any threads have started. */
@@ -1397,14 +1397,10 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
         {
             load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
                             cr, ddxyz,
-                            inputrec, state, &bReadRNG, &bReadEkin,
+                            inputrec, state, &bReadEkin,
                             (Flags & MD_APPENDFILES),
                             (Flags & MD_APPENDFILESSET));
 
-            if (bReadRNG)
-            {
-                Flags |= MD_READ_RNG;
-            }
             if (bReadEkin)
             {
                 Flags |= MD_READ_EKIN;
diff --git a/tests/CppCheck.cmake b/tests/CppCheck.cmake
index dbb33e2e82..918b6304d4 100644
--- a/tests/CppCheck.cmake
+++ b/tests/CppCheck.cmake
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2013, by the GROMACS development team, led by
+# Copyright (c) 2013,2014, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -84,6 +84,7 @@ if (CPPCHECK_EXECUTABLE AND UNIX)
         --suppress=invalidscanf
         --suppress=sizeofCalculation
         --suppress=missingInclude:src/programs/mdrun/gmx_gpu_utils/gmx_gpu_utils.cu
+	--suppress=*:src/external/Random123-1.08/include/Random123/features/compilerfeatures.h
         --inline-suppr)
     set(_cxx_flags
         -D__cplusplus
-- 
2.22.0