/* we put all of these on their own cache line by padding the data structure
to the size of a cache line on x86 (64 bytes): */
+#define TMPI_SIZEOF_X86_CACHE_LINE 64
typedef struct tMPI_Atomic
{
int value;
- char padding[64-sizeof(int)];
+ char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(int)];
} tMPI_Atomic_t;
typedef struct tMPI_Atomic_ptr
{
void* value;
- char padding[64-sizeof(void*)];
+ char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(void*)];
} tMPI_Atomic_ptr_t;
typedef struct tMPI_Spinlock
{
unsigned int lock;
- char padding[64-sizeof(unsigned int)];
+ char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(unsigned int)];
} tMPI_Spinlock_t;
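A quick compile-time sanity check of the padding arithmetic can be useful; this is a hypothetical sketch, not part of the patch (the typedef names are illustrative), using the classic negative-array-size trick so it also works in C89:

/* hypothetical compile-time checks: a negative array size is a compile
   error, so each line fails to build if the padded type is not exactly
   one cache line */
typedef char tMPI_check_atomic_size[
    (sizeof(tMPI_Atomic_t)     == TMPI_SIZEOF_X86_CACHE_LINE) ? 1 : -1];
typedef char tMPI_check_atomic_ptr_size[
    (sizeof(tMPI_Atomic_ptr_t) == TMPI_SIZEOF_X86_CACHE_LINE) ? 1 : -1];
typedef char tMPI_check_spinlock_size[
    (sizeof(tMPI_Spinlock_t)   == TMPI_SIZEOF_X86_CACHE_LINE) ? 1 : -1];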
as the 486, and gcc on some Linux versions still targets 80386 by default).
We also specifically check for icc, because intrinsics are not always
- supported there. */
-#if ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) && \
- !defined(__INTEL_COMPILER) )
+ supported there.
+
+   llvm has issues with the inline assembly, but it does support the gcc
+   intrinsics, even in 32 bits */
+#if ( ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) && \
+ !defined(__INTEL_COMPILER) ) || defined(__llvm__) )
#include "gcc_intrinsics.h"
#else
__asm__ __volatile__("lock ; xaddl %0, %1;"
:"=r"(i) :"m"(a->value), "0"(i) : "memory");
return i + __i;
-}
+}
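The xaddl above leaves the previous value of a->value in i, so adding __i (the amount that was added) back on yields the value after the addition; tMPI_Atomic_fetch_add, whose definition starts below, instead returns the value from before the addition. A hypothetical illustration of the difference (assumes both functions are in scope; the direct .value access relies on the struct layout from the padding hunk above):

/* hypothetical illustration of the two return conventions */
static void tMPI_example_return_values(void)
{
    tMPI_Atomic_t count;
    int before, after;

    count.value = 3;
    before = tMPI_Atomic_fetch_add(&count, -1);  /* before == 3, count.value is now 2 */
    after  = tMPI_Atomic_add_return(&count, -1); /* after  == 1, count.value is now 1 */
    (void)before;
    (void)after;
}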
static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
{
static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
{
- unsigned int prev;
+ int prev;
__asm__ __volatile__("lock ; cmpxchgl %1,%2"
: "=a"(prev)
/* We tried again, and this time there was a copied buffer.
We use that, and indicate that we're not reading from the
regular buf. This case should be pretty rare. */
- tMPI_Atomic_fetch_add(&(cev->met[rank].buf_readcount),-1);
+ tMPI_Atomic_add_return(&(cev->met[rank].buf_readcount),-1);
tMPI_Atomic_memory_barrier_acq();
srcbuf=try_again_srcbuf;
}
{
/* we decrement the read count; potentially releasing the buffer. */
tMPI_Atomic_memory_barrier_rel();
- tMPI_Atomic_fetch_add( &(cev->met[rank].buf_readcount), -1);
+ tMPI_Atomic_add_return( &(cev->met[rank].buf_readcount), -1);
}
#endif
}
else
{
/* wait until everybody else is done copying the original buffer.
- We use fetch_add because we want to be sure of coherency.
+ We use atomic add-return because we want to be sure of coherency.
This wait is bound to be very short (otherwise it wouldn't
be double-buffering) so we always spin here. */
/*tMPI_Atomic_memory_barrier_rel();*/
-100000))
#endif
#if 0
- while (tMPI_Atomic_fetch_add( &(cev->met[myrank].buf_readcount), 0)
+ while (tMPI_Atomic_add_return( &(cev->met[myrank].buf_readcount), 0)
!= 0)
#endif
#if 1
tMPI_Atomic_memory_barrier_rel();
/* signal that we're done */
- tMPI_Atomic_fetch_add(&(cev->coll.current_sync), 1);
+ tMPI_Atomic_add_return(&(cev->coll.current_sync), 1);
/* we need to keep being in sync */
csync->syncs++;
}
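The buf_readcount handling above amounts to a reader reference count with release/acquire barriers around the buffer accesses. A stripped-down sketch of that pattern, with hypothetical helper names that are not thread_mpi API:

/* hypothetical helpers illustrating the readcount protocol */
static void reader_enter(tMPI_Atomic_t *readcount)
{
    tMPI_Atomic_add_return(readcount, 1);   /* announce ourselves as a reader */
    tMPI_Atomic_memory_barrier_acq();       /* before we start reading the buffer */
}

static void reader_leave(tMPI_Atomic_t *readcount)
{
    tMPI_Atomic_memory_barrier_rel();       /* all our buffer accesses are done */
    tMPI_Atomic_add_return(readcount, -1);  /* potentially releases the buffer */
}
/* the buffer's owner waits until the count is back at zero (as in the
   spin-wait variants above) before reusing the buffer */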
tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
}
}
- /* the main thread now also runs start_fn if we don't want
+ /* the main thread also runs start_fn if we don't want
it to return */
if (!main_returns)
tMPI_Thread_starter((void*)&(threads[0]));
tMPI_Trace_print("tMPI_Init(%p, %p, %p)", argc, argv, start_function);
#endif
-
if (TMPI_COMM_WORLD==0) /* we're the main process */
{
int N=0;
tMPI_Get_N(argc, argv, "-nt", &N);
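        /* presumably main_returns must be TRUE here because tMPI_Init's
           caller has to get control back and continue as rank 0 itself,
           rather than having the main thread run start_function through
           tMPI_Thread_starter; rationale inferred from the !main_returns
           branch above */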
- tMPI_Start_threads(FALSE, N, TMPI_AFFINITY_ALL_CORES, argc, argv,
+ tMPI_Start_threads(TRUE, N, TMPI_AFFINITY_ALL_CORES, argc, argv,
NULL, NULL, start_function);
}
else