2 Copyright 2010-2011, D. E. Shaw Research.
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions, and the following disclaimer.
12 * Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions, and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
16 * Neither the name of D. E. Shaw Research nor the names of its
17 contributors may be used to endorse or promote products derived from
18 this software without specific prior written permission.
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 #ifndef _Random123_sse_dot_h__
33 #define _Random123_sse_dot_h__
37 #if R123_USE_X86INTRIN_H
38 #include <x86intrin.h>
40 #if R123_USE_IA32INTRIN_H
41 #include <ia32intrin.h>
43 #if R123_USE_XMMINTRIN_H
44 #include <xmmintrin.h>
46 #if R123_USE_EMMINTRIN_H
47 #include <emmintrin.h>
49 #if R123_USE_SMMINTRIN_H
50 #include <smmintrin.h>
52 #if R123_USE_WMMINTRIN_H
53 #include <wmmintrin.h>
66 /* Bit 25 of ECX tells us whether AES is enabled. */
67 R123_STATIC_INLINE int haveAESNI(){
68 unsigned int eax, ebx, ecx, edx;
69 __asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) :
73 #elif R123_USE_CPUID_MSVC
74 R123_STATIC_INLINE int haveAESNI(){
77 return (CPUInfo[2]>>25)&1;
79 #else /* R123_USE_CPUID_??? */
80 #warning "No R123_USE_CPUID_XXX method chosen. haveAESNI will always return false"
81 R123_STATIC_INLINE int haveAESNI(){
84 #endif /* R123_USE_ASM_GNU || R123_USE_CPUID_MSVC */
86 // There is a lot of annoying and inexplicable variation in the
87 // SSE intrinsics available in different compilation environments.
88 // The details seem to depend on the compiler, the version and
89 // the target architecture. Rather than insisting on
90 // R123_USE_feature tests for each of these in each of the
91 // compilerfeatures.h files we just keep the complexity localized
93 #if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64))
94 /* Is there an intrinsic to assemble an __m128i from two 64-bit words?
95 If not, use the 4x32-bit intrinsic instead. N.B. It looks like Intel
96 added _mm_set_epi64x to icc version 12.1 in Jan 2012.
98 R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){
105 return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]);
108 /* _mm_extract_lo64 abstracts the task of extracting the low 64-bit
109 word from an __m128i. The _mm_cvtsi128_si64 intrinsic does the job
110 on 64-bit platforms. Unfortunately, both MSVC and Open64 fail
111 assertions in ut_M128.cpp and ut_carray.cpp when we use the
112 _mm_cvtsi128_si64 intrinsic. (See
113 https://bugs.open64.net/show_bug.cgi?id=873 for the Open64 bug).
114 On 32-bit platforms, there's no MOVQ, so there's no intrinsic.
115 Finally, even if the intrinsic exists, it may be spelled with or
118 #if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
119 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
124 _mm_store_si128(&u.m, si);
127 #elif defined(__llvm__) || defined(__ICC)
128 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
129 return (uint64_t)_mm_cvtsi128_si64(si);
131 #else /* GNUC, others */
132 /* FWIW, gcc's emmintrin.h has had the 'x' spelling
133 since at least gcc-3.4.4. The no-'x' spelling showed up
135 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
136 return (uint64_t)_mm_cvtsi128_si64x(si);
139 #if defined(__GNUC__) && __GNUC__ < 4
140 /* the cast builtins showed up in gcc4. */
141 R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){
150 #if R123_USE_CXX11_UNRESTRICTED_UNIONS
151 // C++98 forbids a union member from having *any* constructors.
152 // C++11 relaxes this, and allows union members to have constructors
153 // as long as there is a "trivial" default constructor. So in C++11
154 // we can provide a r123m128i constructor with an __m128i argument, and still
155 // have the defaulted (and hence trivial) default constructor.
// Default constructor, explicitly defaulted (no user-provided body).
156 r123m128i() = default;
// Converting constructor: initializes member m from the given __m128i value.
157 r123m128i(__m128i _m): m(_m){}
// Assign from a raw __m128i: overwrites m with rhs and returns *this.
159 r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;}
// Assign from an integer: m is rebuilt with _mm_set_epi64x(0, n), i.e. n in
// the low 64-bit word and zero in the high 64-bit word. Returns *this.
160 r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;}
161 #if R123_USE_CXX11_EXPLICIT_CONVERSIONS
162 // With C++0x we can attach explicit to the bool conversion operator
163 // to disambiguate undesired promotions. For g++, this works
164 // only in 4.5 and above.
// Explicit conversion to bool; the result is whatever _bool() reports.
165 explicit operator bool() const {return _bool();}
167 // Pre-C++0x, we have to do something else. Google for the "safe bool"
168 // idiom for other ideas...
// Truth test via pointer conversion: yields this when _bool() is true and a
// null pointer otherwise.
169 operator const void*() const{return _bool()?this:0;}
// Implicit conversion back to the raw register type: returns a copy of m.
171 operator __m128i() const {return m;}
// True iff m has at least one bit set: _mm_testz_si128(m,m) reports 1 only
// when (m & m) is all zeros, and the result is negated here.
175 bool _bool() const{ return !_mm_testz_si128(m,m); }
// True unless every 32-bit lane of m equals zero: each lane is compared with
// zero, the four per-lane results are gathered into a 4-bit mask, and the
// mask is 0xf only when all four lanes matched.
177 bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
181 R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){
183 __m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1));
184 c = _mm_add_epi64(c, zeroone);
187 __m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0)));
188 if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){
189 __m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0));
190 c = _mm_add_epi64(c, onezero);
193 unsigned mask = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128())));
194 // The low two bits of mask are 11 iff the low 64 bits of
196 if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){
197 __m128i onezero = _mm_set_epi64x(1,0);
198 c = _mm_add_epi64(c, onezero);
204 R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){
206 __m128i incr128 = _mm_set_epi64x(0, n);
207 c = _mm_add_epi64(c, incr128);
208 // return c; // NO CARRY!
210 int64_t lo64 = _mm_extract_lo64(c);
211 if((uint64_t)lo64 < n)
212 c = _mm_add_epi64(c, _mm_set_epi64x(1,0));
217 // We need this one because it's present, but never used in r123array1xm128i::incr
218 R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i &){
219 throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");}
221 // The comparisons aren't implemented, but if we leave them out, and
222 // somebody writes, e.g., M1 < M2, the compiler will do an implicit
223 // conversion through void*. Sigh...
224 R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&){
225 throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");}
226 R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&){
227 throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");}
228 R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&){
229 throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");}
230 R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&){
231 throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");}
233 R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){
234 return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); }
235 R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){
237 R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){
238 r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; }
239 R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){
241 R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){
246 _mm_storeu_si128(&u.m, m.m);
247 return os << u.u64[0] << " " << u.u64[1];
250 R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){
252 is >> u64[0] >> u64[1];
253 m.m = _mm_set_epi64x(u64[1], u64[0]);
257 template<typename T> inline T assemble_from_u32(uint32_t *p32); // forward declaration
260 inline r123m128i assemble_from_u32<r123m128i>(uint32_t *p32){
262 ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]);
272 #endif /* __cplusplus */
274 #else /* !R123_USE_SSE */
275 R123_STATIC_INLINE int haveAESNI(){
278 #endif /* R123_USE_SSE */
280 #endif /* _Random123_sse_dot_h__ */