/*
   This source code file is part of thread_mpi.
   Written by Sander Pronk, Erik Lindahl, and possibly others.

   Copyright (c) 2009, Sander Pronk, Erik Lindahl.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
   1) Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
   2) Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
   3) Neither the name of the copyright holders nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY US ''AS IS'' AND ANY
   EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   DISCLAIMED. IN NO EVENT SHALL WE BE LIABLE FOR ANY
   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   If you want to redistribute modifications, please consider that
   scientific software is very special. Version control is crucial -
   bugs must be traceable. We will be happy to consider code for
   inclusion in the official distribution, but derived work should not
   be called official thread_mpi. Details are found in the README & COPYING
   files.
 */
/* This code is used for x86 and x86-64, with compilers that support
 * GCC-style inline assembly (such as gcc and the Intel compiler).
 * We also use this section for the documentation.
 */
/* Only gcc and Intel support this check; otherwise set it to true (skip doc) */
#if (!defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined DOXYGEN)
#define __builtin_constant_p(i) (1)
#endif
/* we put all of these on their own cache line by padding the data structure
   to the size of a cache line on x86 (64 bytes): */
#define TMPI_SIZEOF_X86_CACHE_LINE 64
typedef struct tMPI_Atomic
{
    int  value;
    char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(int)];
} tMPI_Atomic_t;

typedef struct tMPI_Atomic_ptr
{
    void *value;
    char  padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(void*)];
} tMPI_Atomic_ptr_t;

typedef struct tMPI_Spinlock
{
    unsigned int lock;
    char         padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(unsigned int)];
} tMPI_Spinlock_t;

#define TMPI_SPINLOCK_INITIALIZER { 0 }

#define TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
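/* Illustration only (not part of the original header): a compile-time check
 * that the padded atomic type really spans a whole x86 cache line. The
 * negative-array-size trick stands in for C11 static_assert, which this
 * header predates; the typedef name below is hypothetical. */
typedef char tMPI_example_atomic_fills_cache_line_check
    [(sizeof(tMPI_Atomic_t) == TMPI_SIZEOF_X86_CACHE_LINE) ? 1 : -1];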
/* these are guaranteed to be atomic on x86 and x86_64 */
#define tMPI_Atomic_get(a) ((a)->value)
#define tMPI_Atomic_set(a, i) (((a)->value) = (i))

#define tMPI_Atomic_ptr_get(a) ((a)->value)
#define tMPI_Atomic_ptr_set(a, i) (((a)->value) = (void*)(i))
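/* Usage sketch (illustration only; the functions below are hypothetical, not
 * part of the thread_mpi API): a shared flag read and written through the
 * macros above. On x86 these are plain aligned loads and stores, which are
 * atomic but carry no memory barrier by themselves. */
static inline void tMPI_example_raise_flag(tMPI_Atomic_t *flag)
{
    tMPI_Atomic_set(flag, 1);       /* plain aligned store */
}

static inline int tMPI_example_check_flag(tMPI_Atomic_t *flag)
{
    return tMPI_Atomic_get(flag);   /* plain aligned load */
}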
/* We disable this for 32-bit builds because the target may be 80386,
   which didn't have cmpxchg, etc. (those were introduced only as 'recently'
   as the 486, and gcc on some Linux versions still targets 80386 by default).

   We also specifically check for icc, because the intrinsics are not always
   supported there.

   llvm has issues with inline assembly, but it does support the gcc
   intrinsics, also in 32-bit builds. */
#if ( ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) && \
        !defined(__INTEL_COMPILER) ) || defined(__llvm__) )
#include "gcc_intrinsics.h"

#else
/* older versions of gcc don't support atomic intrinsics */
#ifndef __MIC__
#define tMPI_Atomic_memory_barrier() __asm__ __volatile__("sfence;" : : : "memory")
#else
/* MIC is in-order and does not need nor support sfence */
#define tMPI_Atomic_memory_barrier() __asm__ __volatile__("" ::: "memory")
#endif
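/* Usage sketch (illustration only; the function below is hypothetical, not
 * part of the thread_mpi API): publish a payload before raising a flag, with
 * the barrier keeping the payload store from becoming visible after the flag
 * store. */
static inline void tMPI_example_publish(int *payload, tMPI_Atomic_t *flag,
                                        int value)
{
    *payload = value;                 /* write the data                 */
    tMPI_Atomic_memory_barrier();     /* make sure the data is visible  */
    tMPI_Atomic_set(flag, 1);         /* before the flag is raised      */
}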
#define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
{
    volatile int res = i;
    /* volatile because we read and write back to the same variable in the
       asm section. some compilers require this to be volatile */
    __asm__ __volatile__("lock ; xaddl %0, %1;"      /* swap-add */
                         : "=r" (res)                /* with register as
                                                        output */
                         : "m" (a->value), "0" (res) /* and memory as input */
                         : "memory");
    return res;                                      /* the value before the add */
}
#define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
{
    volatile int res  = i;
    volatile int orig = i;
    __asm__ __volatile__("lock ; xaddl %0, %1;"      /* swap-add */
                         : "=r" (res)                /* with register as
                                                        output */
                         : "m" (a->value), "0" (res) /* and memory as input */
                         : "memory");
    return res + orig; /* then add again from the right value */
}
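/* Usage sketch (illustration only; the function below is hypothetical, not
 * part of the thread_mpi API): drop a reference and learn the new count.
 * Unlike fetch_add, add_return reports the value after the addition, so the
 * caller that sees 0 knows it released the last reference. */
static inline int tMPI_example_release_ref(tMPI_Atomic_t *refcount)
{
    return tMPI_Atomic_add_return(refcount, -1);
}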
static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
{
    int prev;
    __asm__ __volatile__("lock ; cmpxchgl %1,%2"
                         : "=a" (prev)
                         : "q" (newval), "m" (a->value), "0" (oldval)
                         : "memory");
    return prev == oldval;
}
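/* Usage sketch (illustration only; the function below is hypothetical, not
 * part of the thread_mpi API): a compare-and-swap retry loop that atomically
 * raises a stored maximum. tMPI_Atomic_cas() returns non-zero when the swap
 * succeeded. */
static inline void tMPI_example_store_max(tMPI_Atomic_t *a, int candidate)
{
    int seen;
    do
    {
        seen = tMPI_Atomic_get(a);
        if (candidate <= seen)
        {
            return;                 /* already large enough; nothing to do */
        }
    }
    while (!tMPI_Atomic_cas(a, seen, candidate));  /* retry if another thread
                                                      changed the value */
}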
static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t *a, void *oldval,
                                      void *newval)
{
    void *prev;
#ifndef __x86_64__
    __asm__ __volatile__("lock ; cmpxchgl %1,%2"
                         : "=a" (prev)
                         : "q" (newval), "m" (a->value), "0" (oldval)
                         : "memory");
#else
    __asm__ __volatile__("lock ; cmpxchgq %1,%2"
                         : "=a" (prev)
                         : "q" (newval), "m" (a->value), "0" (oldval)
                         : "memory");
#endif
    return prev == oldval;
}
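/* Usage sketch (illustration only; the node type and function below are
 * hypothetical, not part of the thread_mpi API): push onto a Treiber-style
 * lock-free stack whose head lives in a tMPI_Atomic_ptr_t. */
typedef struct tMPI_example_node
{
    struct tMPI_example_node *next;
} tMPI_example_node_t;

static inline void tMPI_example_stack_push(tMPI_Atomic_ptr_t   *head,
                                           tMPI_example_node_t *node)
{
    void *old_head;
    do
    {
        old_head   = tMPI_Atomic_ptr_get(head);
        node->next = (tMPI_example_node_t*)old_head;
    }
    while (!tMPI_Atomic_ptr_cas(head, old_head, node)); /* retry on contention */
}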
#endif /* end of check for gcc intrinsics */
#define TMPI_ATOMIC_HAVE_NATIVE_SWAP
/* do the swap fns; we told the intrinsics that we have them. */
static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
{
    volatile int ret = b;
    __asm__ __volatile__("\txchgl %0, %1;"
                         : "+r" (ret), "+m" (a->value)
                         :
                         : "memory");
    return (int)ret;
}
static inline void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b)
{
    void *volatile *ret = (void* volatile*)b;
#ifndef __x86_64__
    __asm__ __volatile__("\txchgl %0, %1;"
                         : "+r" (ret), "+m" (a->value)
                         :
                         : "memory");
#else
    __asm__ __volatile__("\txchgq %0, %1;"
                         : "+r" (ret), "+m" (a->value)
                         :
                         : "memory");
#endif
    return (void*)ret;
}
static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
{
    x->lock = 0;
}
static inline void tMPI_Spinlock_lock(tMPI_Spinlock_t *x)
{
    /* this is a spinlock with a double loop, as recommended by Intel:
       it pauses in the outer loop (the one that just checks for the
       availability of the lock), and thereby reduces bus contention and
       prevents the pipeline from flushing. */
    __asm__ __volatile__("1:\tcmpl $0, %0\n"    /* check the lock */
                         "\tje 2f\n"            /* try to lock if it is
                                                   free by jumping forward */
                         "\tpause\n"            /* otherwise: small pause
                                                   as recommended by Intel */
                         "\tjmp 1b\n"           /* and jump back */

                         "2:\tmovl $1, %%eax\n" /* set eax to 1, the locked
                                                   value of the lock */
                         "\txchgl %%eax, %0\n"  /* atomically exchange
                                                   eax with the lock value */
                         "\tcmpl $0, %%eax\n"   /* compare the exchanged
                                                   value with 0 */
                         "\tjne 1b"             /* jump backward if we didn't
                                                   just obtain the lock */
                         : "+m" (x->lock)       /* input & output var */
                         :
                         : "%eax", "memory"     /* we changed memory */
                         );
}
static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *x)
{
    /* a plain store of 0 is apparently all that is needed for unlocking
       a lock on x86 */
    __asm__ __volatile__("\tmovl $0, %0\n"
                         : "=m" (x->lock) : : "memory");
}
static inline int tMPI_Spinlock_trylock(tMPI_Spinlock_t *x)
{
    int old_value = 1;
    __asm__ __volatile__("\tmovl %2, %0\n"  /* load 1, the locked value of
                                               the lock, into a register */
                         "\txchgl %0, %1\n" /* atomically exchange that
                                               register with the lock value
                                               in memory */
                         : "+r" (old_value), "+m" (x->lock)
                         : "i" (1)
                         : "memory");
    return old_value; /* 0 if the lock was free (and is now ours), 1 otherwise */
}
static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
{
    return ( (*((volatile int*)(&(x->lock)))) != 0);
}
static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
{
    /* this is the spinlock wait loop, without the xchg. */
    __asm__ __volatile__("1:\tcmpl $0, %0\n" /* check the lock */
                         "\tje 2f\n"         /* stop waiting if it is
                                                free by jumping forward */
                         "\tpause\n"         /* otherwise: small pause
                                                as recommended by Intel */
                         "\tjmp 1b\n"        /* and jump back */
                         "2:\tnop\n"         /* jump target for end
                                                of wait */
                         : "+m" (x->lock)    /* input & output var */
                         :
                         : "memory"          /* we changed memory */
                         );
#if 0
    /* equivalent C-level loop, kept for reference: */
    do
    {
        tMPI_Atomic_memory_barrier();
    }
    while (tMPI_Spinlock_islocked(x));
#endif
}
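/* Usage sketch (illustration only; the function below is hypothetical, not
 * part of the thread_mpi API): use a spinlock as a one-shot gate. Workers
 * wait until an initializer thread unlocks the gate, without ever acquiring
 * the lock themselves. */
static inline void tMPI_example_wait_for_gate(tMPI_Spinlock_t *gate)
{
    tMPI_Spinlock_wait(gate);        /* spin (with pause) until unlocked */
    tMPI_Atomic_memory_barrier();    /* then make the initializer's writes
                                        visible to this thread */
}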