/*
   This source code file is part of thread_mpi.
   Written by Sander Pronk, Erik Lindahl, and possibly others.

   Copyright (c) 2009, Sander Pronk, Erik Lindahl.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
   1) Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
   2) Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
   3) Neither the name of the copyright holders nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY US ''AS IS'' AND ANY
   EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   DISCLAIMED. IN NO EVENT SHALL WE BE LIABLE FOR ANY
   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   If you want to redistribute modifications, please consider that
   scientific software is very special. Version control is crucial -
   bugs must be traceable. We will be happy to consider code for
   inclusion in the official distribution, but derived work should not
   be called official thread_mpi. Details are found in the README & COPYING
   files.
 */
/* This code is executed for x86 and x86-64, with compilers that
 * support GCC-style inline assembly (such as gcc and icc).
 * We also use this section for the documentation.
 */
/* Only gcc and Intel support this check, otherwise set it to true (skip doc) */
#if (!defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(DOXYGEN))
#define __builtin_constant_p(i) (1)
#endif
/* we put all of these on their own cache line by padding the data structure
   to the size of a cache line on x86 (64 bytes): */
#define TMPI_SIZEOF_X86_CACHE_LINE 64

typedef struct tMPI_Atomic
{
    int  value;
    char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(int)];
} tMPI_Atomic_t;

typedef struct tMPI_Atomic_ptr
{
    void *value;
    char  padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(void*)];
} tMPI_Atomic_ptr_t;

typedef struct tMPI_Spinlock
{
    unsigned int lock;
    char         padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(unsigned int)];
} tMPI_Spinlock_t;

#define TMPI_SPINLOCK_INITIALIZER { 0 }
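
/* Illustrative sketch, not part of the original header: how the padded types
   above are typically declared and statically initialized. The variable
   names (example_counter, example_lock) are hypothetical. */
static tMPI_Atomic_t   example_counter = { 0 };                      /* value 0, alone on its cache line */
static tMPI_Spinlock_t example_lock    = TMPI_SPINLOCK_INITIALIZER;  /* starts unlocked */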
/* aligned loads and stores of int and pointer values are guaranteed to be
   atomic on x86 and x86_64, so plain reads and writes suffice here: */
#define tMPI_Atomic_get(a)        ((a)->value)
#define tMPI_Atomic_set(a, i)     (((a)->value) = (i))

#define tMPI_Atomic_ptr_get(a)    ((a)->value)
#define tMPI_Atomic_ptr_set(a, i) (((a)->value) = (void*)(i))
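
/* Illustrative sketch (hypothetical helpers, not in the original header):
   because aligned stores and loads are atomic here, a simple flag can be
   published and polled with plain set/get; no locked instruction is needed
   for the access itself. */
static inline void example_set_flag(tMPI_Atomic_t *flag)
{
    tMPI_Atomic_set(flag, 1);       /* atomic store of the int member */
}

static inline int example_read_flag(tMPI_Atomic_t *flag)
{
    return tMPI_Atomic_get(flag);   /* atomic load of the int member */
}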
/* Decide whether we can simply use gcc's atomic intrinsics.

   We disable this for 32-bit builds because the target may be 80386,
   which didn't have cmpxchg, etc. (those were introduced only as 'recently'
   as the 486, and gcc on some Linux versions still targets 80386 by default).

   We also specifically check for icc, because the intrinsics are not always
   supported there.

   llvm has issues with inline assembly, but it does support the gcc
   intrinsics even in 32 bits. */
#if ( ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) &&  \
        !defined(__INTEL_COMPILER) )  || defined(__llvm__) )
#include "gcc_intrinsics.h"
#else
/* older versions of gcc don't support atomic intrinsics, so fall back to
   inline assembly: */

#define tMPI_Atomic_memory_barrier() __asm__ __volatile__("sfence;": : :"memory")
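
/* Illustrative sketch (hypothetical helper, not in the original header):
   a typical publish pattern. The store fence forces the payload store to
   become globally visible before the flag is set. */
static inline void example_publish(int *data, tMPI_Atomic_t *ready)
{
    *data = 42;                     /* write the payload */
    tMPI_Atomic_memory_barrier();   /* order the payload store before the flag store */
    tMPI_Atomic_set(ready, 1);      /* then publish the flag */
}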
static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
{
    int orig = i;

    /* xaddl atomically adds i to a->value and leaves the old value in i */
    __asm__ __volatile__("lock ; xaddl %0, %1;"
                         :"=r"(i) :"m"(a->value), "0"(i) : "memory");
    return i + orig;   /* the new value */
}

static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
{
    __asm__ __volatile__("lock ; xaddl %0, %1;"
                         :"=r"(i) :"m"(a->value), "0"(i) : "memory");
    return i;          /* the old value */
}
static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
{
    int prev;

    /* cmpxchgl compares eax (oldval) with a->value; if they are equal,
       a->value is set to newval. Either way the original a->value ends
       up in eax (prev). */
    __asm__ __volatile__("lock ; cmpxchgl %1,%2"
                         : "=a"(prev)
                         : "q"(newval), "m"(a->value), "0"(oldval)
                         : "memory");
    return prev == oldval;   /* non-zero if the swap took place */
}

static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t *a,
                                      void *oldval, void *newval)
{
    void *prev;

#ifndef __x86_64__
    __asm__ __volatile__("lock ; cmpxchgl %1,%2"
                         : "=a"(prev)
                         : "q"(newval), "m"(a->value), "0"(oldval)
                         : "memory");
#else
    __asm__ __volatile__("lock ; cmpxchgq %1,%2"
                         : "=a"(prev)
                         : "q"(newval), "m"(a->value), "0"(oldval)
                         : "memory");
#endif
    return prev == oldval;   /* non-zero if the swap took place */
}
#endif /* end of check for gcc intrinsics */
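
/* Illustrative sketch (hypothetical helper, not in the original header):
   the classic compare-and-swap retry loop, here updating a shared maximum.
   It assumes tMPI_Atomic_cas returns non-zero on success, as above: the CAS
   only succeeds if nobody changed the value since we read it, otherwise we
   re-read and try again. */
static inline void example_atomic_max(tMPI_Atomic_t *a, int b)
{
    int cur;

    do
    {
        cur = tMPI_Atomic_get(a);
        if (b <= cur)
        {
            return;                        /* current maximum is already at least b */
        }
    }
    while (!tMPI_Atomic_cas(a, cur, b));   /* retry if another thread intervened */
}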

#define TMPI_HAVE_SWAP
/* do the swap functions; we told the intrinsics header that we have them. */
static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
{
    volatile int ret = b;

    /* xchgl with a memory operand is implicitly locked: it atomically
       exchanges ret and a->value */
    __asm__ __volatile__("\txchgl %0, %1;"
                         :"+r"(ret), "+m"(a->value)
                         :
                         :"memory");
    return (int)ret;   /* the previous a->value */
}
static inline void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b)
{
    void *volatile *ret = (void* volatile*)b;

#ifndef __x86_64__
/*  __asm__ __volatile__("\txchgl %0, %1;"
                         :"=m"(a->value),"=q"(b)
                         :"m"(a->value),"q"(b)); */
    __asm__ __volatile__("\txchgl %0, %1;"
                         :"+r"(ret), "+m"(a->value)
                         :
                         :"memory");
#else
    __asm__ __volatile__("\txchgq %0, %1;"
                         :"+r"(ret), "+m"(a->value)
                         :
                         :"memory");
#endif
    return (void*)ret;   /* the previous a->value */
}
static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
{
    x->lock = 0;
}
static inline void tMPI_Spinlock_lock(tMPI_Spinlock_t *x)
{
    /* this is a spinlock with a double loop, as recommended by Intel:
       it pauses in the outer loop (the one that just checks for the
       availability of the lock), and thereby reduces bus contention and
       prevents the pipeline from flushing. */
    __asm__ __volatile__("1:\tcmpl $0, %0\n"     /* check the lock */
                         "\tje 2f\n"             /* try to lock if it is
                                                    free by jumping forward */
                         "\tpause\n"             /* otherwise: small pause
                                                    as recommended by Intel */
                         "\tjmp 1b\n"            /* and jump back */

                         "2:\tmovl $1, %%eax\n"  /* set eax to 1, the locked
                                                    value of the lock */
                         "\txchgl %%eax, %0\n"   /* atomically exchange
                                                    eax with the lock value */
                         "\tcmpl $0, %%eax\n"    /* compare the exchanged
                                                    value with 0 */
                         "\tjne 1b"              /* jump backward if we didn't
                                                    just lock */
                         : "+m" (x->lock)        /* input & output var */
                         :
                         : "%eax", "memory"      /* we changed memory */
                         );
}
static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *x)
{
    /* this is apparently all that is needed for unlocking a lock:
       a plain store of 0 releases it */
    __asm__ __volatile__(
        "\n\tmovl $0, %0\n"
        : "=m"(x->lock) : : "memory" );
}
static inline int tMPI_Spinlock_trylock(tMPI_Spinlock_t *x)
{
    int old_value = 1;

    __asm__ __volatile__("\tmovl %2, %0\n"     /* load 1, the locked
                                                  value of the lock */
                         "\txchgl %0, %1\n"    /* atomically exchange it
                                                  with the lock value */
                         : "+r"(old_value), "+m" (x->lock)
                         : "i"(1)
                         : "memory");
    /* 0 means the lock was free and is now ours; non-zero means it was
       already taken */
    return old_value;
}
static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
{
    /* read through a volatile pointer so the compiler re-reads the value */
    return ( (*((volatile int*)(&(x->lock)))) != 0);
}
static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
{
    /* this is the spinlock loop without the xchg: it only waits until the
       lock becomes free, without trying to take it. */
    __asm__ __volatile__("1:\tcmpl $0, %0\n" /* check the lock */
                         "\tje 2f\n"         /* done waiting if it is
                                                free: jump forward */
                         "\tpause\n"         /* otherwise: small pause
                                                as recommended by Intel */
                         "\tjmp 1b\n"        /* and jump back */
                         "2:\tnop\n"         /* jump target for end
                                                of wait */
                         : "+m"(x->lock)     /* input & output var */
                         :
                         : "memory"          /* we changed memory */
                         );
#if 0
    /* equivalent plain-C alternative, kept disabled: */
    do
    {
        tMPI_Atomic_memory_barrier();
    }
    while (tMPI_Spinlock_islocked(x));
#endif
}
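
/* Illustrative sketch (hypothetical helper, not in the original header):
   tMPI_Spinlock_wait blocks until the lock is released without acquiring it,
   which can be used to wait for another thread to finish a guarded task. */
static inline void example_wait_for_completion(tMPI_Spinlock_t *task_lock)
{
    tMPI_Spinlock_wait(task_lock);    /* returns once the worker has unlocked it */
    tMPI_Atomic_memory_barrier();     /* make the worker's results visible before reading them */
}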