src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h

   1 /*
   2    This source code file is part of thread_mpi.
   3    Written by Sander Pronk, Erik Lindahl, and possibly others.
   4
   5    Copyright (c) 2009, Sander Pronk, Erik Lindahl.
   6    All rights reserved.
   7
   8    Redistribution and use in source and binary forms, with or without
   9    modification, are permitted provided that the following conditions are met:
  10    1) Redistributions of source code must retain the above copyright
  11    notice, this list of conditions and the following disclaimer.
  12    2) Redistributions in binary form must reproduce the above copyright
  13    notice, this list of conditions and the following disclaimer in the
  14    documentation and/or other materials provided with the distribution.
  15    3) Neither the name of the copyright holders nor the
  16    names of its contributors may be used to endorse or promote products
  17    derived from this software without specific prior written permission.
  18
  19    THIS SOFTWARE IS PROVIDED BY US ''AS IS'' AND ANY
  20    EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  21    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22    DISCLAIMED. IN NO EVENT SHALL WE BE LIABLE FOR ANY
  23    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  24    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  25    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  26    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  28    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30    If you want to redistribute modifications, please consider that
  31    scientific software is very special. Version control is crucial -
  32    bugs must be traceable. We will be happy to consider code for
  33    inclusion in the official distribution, but derived work should not
  34    be called official thread_mpi. Details are found in the README & COPYING
  35    files.
  36  */
  37
  38
  39
  40 #include <limits.h>
  41 #include <stdint.h>
  42 /* This code is executed for x86 and x86-64, with these compilers:
  43  * GNU
  44  * Intel
  45  * Pathscale
  46  * All these support GCC-style inline assembly.
  47  * We also use this section for the documentation.
  48  */
  49
  50
/* Fallback definition of __builtin_constant_p for compilers without it.
   The whole region is switched off by the enclosing "#if 0", so nothing in
   it is currently compiled. */
#if 0
/* Only gcc and Intel support this check, otherwise set it to true (skip doc) */
#if (!defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined DOXYGEN)
#define __builtin_constant_p(i) (1)
#endif
#endif
  57
/* Every atomic type in this file is padded to the size of a cache line on
   x86 (64 bytes), with the aim of putting each one on its own cache line. */
#define TMPI_SIZEOF_X86_CACHE_LINE 64
/* Atomic integer: the int payload plus filler up to
   TMPI_SIZEOF_X86_CACHE_LINE bytes. */
typedef struct tMPI_Atomic
{
    int  value;                                           /* the payload */
    char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(int)]; /* filler only */
} tMPI_Atomic_t;
  66
/* Atomic pointer: the void* payload plus filler up to
   TMPI_SIZEOF_X86_CACHE_LINE bytes. */
typedef struct tMPI_Atomic_ptr
{
    void* value;                                             /* the payload */
    char  padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(void*)]; /* filler only */
} tMPI_Atomic_ptr_t;
  72
/* Spinlock: the lock word plus filler up to TMPI_SIZEOF_X86_CACHE_LINE
   bytes. The spinlock functions in this file store 0 in the lock word for
   "free" and 1 for "locked". */
typedef struct tMPI_Spinlock
{
    unsigned int lock;                                                     /* 0 = free */
    char         padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(unsigned int)]; /* filler only */
} tMPI_Spinlock_t;
  78
  79
/* Static initializer for a tMPI_Spinlock_t: sets the lock word to 0, the
   same value tMPI_Spinlock_init() stores. */
#define TMPI_SPINLOCK_INITIALIZER   { 0 }

/* Announces that this header supplies its own spinlock implementation.
   NOTE(review): presumably tested by the generic atomic header to skip a
   fallback implementation - confirm against the including file. */
#define TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
  83
  84
  85
/* Plain loads and stores of the value member; these are guaranteed to be
   atomic on x86 and x86_64.
   NOTE(review): the accesses are not volatile-qualified, so nothing here
   forces the compiler to re-read the value on each use - confirm that
   polling callers add their own barrier. */
/* read the int held by a tMPI_Atomic_t* */
#define tMPI_Atomic_get(a)  ((a)->value)
/* store i into a tMPI_Atomic_t*; the expression yields the assigned value */
#define tMPI_Atomic_set(a, i)  (((a)->value) = (i))

/* read the pointer held by a tMPI_Atomic_ptr_t* */
#define tMPI_Atomic_ptr_get(a)  ((a)->value)
/* store i, converted to void*, into a tMPI_Atomic_ptr_t* */
#define tMPI_Atomic_ptr_set(a, i)  (((a)->value) = (void*)(i))
  92
  93
  94 /* do the intrinsics.
  95
   We disable this for 32-bit builds because the target may be 80386,
   which didn't have cmpxchg, etc. (they were introduced only as 'recently'
   as the 486, and gcc on some Linux versions still targets 80386 by default).
  99
 100    We also specifically check for icc, because intrinsics are not always
 101    supported there.
 102
 103    llvm has issues with inline assembly and also in 32 bits has support for
 104    the gcc intrinsics */
 105 #if ( ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) &&  \
 106     !defined(__INTEL_COMPILER) )  || defined(__llvm__) )
 107 #include "gcc_intrinsics.h"
 108
 109 #else
 110 /* older versions of gcc don't support atomic intrinsics */
 111
/* Memory barrier. Outside MIC builds it issues an sfence instruction; in
   both variants the "memory" clobber tells the compiler that memory may
   have changed across the statement. */
#ifndef __MIC__
#define tMPI_Atomic_memory_barrier() __asm__ __volatile__("sfence;" : : : "memory")
#else
/* MIC is in-order and neither needs nor supports sfence, so only the
   compiler-level barrier is emitted */
#define tMPI_Atomic_memory_barrier() __asm__ __volatile__("" ::: "memory")
#endif
 118
 119 #define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
 120 static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
 121 {
 122     volatile int res = i;
 123     /* volatile because we read and write back to the same variable in the
 124        asm section.  some compilers requires this to be volatile */
 125     __asm__ __volatile__("lock ; xaddl %0, %1;"      /* swap-add */
 126                          : "=r" (res)                /* with register as
 127                                                         output*/
 128                          : "m" (a->value), "0" (res) /* and memory as input */
 129                          : "memory");
 130     return res;
 131 }
 132
 133 #define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
 134 static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
 135 {
 136     int          orig = i;
 137     volatile int res  = i;
 138
 139     __asm__ __volatile__("lock ; xaddl %0, %1;"
 140                          : "=r" (res)
 141                          : "m" (a->value), "0" (res)
 142                          :  "memory");
 143     return res + orig; /* then add again from the right value */
 144 }
 145
 146
 147
 148 static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
 149 {
 150     int prev;
 151
 152     __asm__ __volatile__("lock ; cmpxchgl %1,%2"
 153                          : "=a" (prev)
 154                          : "q" (newval), "m" (a->value), "0" (oldval)
 155                          : "memory");
 156
 157     return prev == oldval;
 158 }
 159
 160 static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t *a,
 161                                       void              *oldval,
 162                                       void              *newval)
 163 {
 164     void* prev;
 165 #ifndef __x86_64__
 166     __asm__ __volatile__("lock ; cmpxchgl %1,%2"
 167                          : "=a" (prev)
 168                          : "q" (newval), "m" (a->value), "0" (oldval)
 169                          : "memory");
 170 #else
 171     __asm__ __volatile__("lock ; cmpxchgq %1,%2"
 172                          : "=a" (prev)
 173                          : "q" (newval), "m" (a->value), "0" (oldval)
 174                          : "memory");
 175 #endif
 176     return prev == oldval;
 177 }
 178
 179 #endif /* end of check for gcc intrinsics */
 180
 181
 182 #define TMPI_ATOMIC_HAVE_NATIVE_SWAP
 183 /* do the swap fns; we told the intrinsics that we have them. */
 184 static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
 185 {
 186     volatile int ret = b;
 187     __asm__ __volatile__("\txchgl %0, %1;"
 188                          : "+r" (ret), "+m" (a->value)
 189                          :
 190                          : "memory");
 191     return (int)ret;
 192 }
 193
 194 static inline void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b)
 195 {
 196     void *volatile *ret = (void* volatile*)b;
 197 #ifndef __x86_64__
 198     __asm__ __volatile__("\txchgl %0, %1;"
 199                          : "+r" (ret), "+m" (a->value)
 200                          :
 201                          : "memory");
 202
 203 #else
 204     __asm__ __volatile__("\txchgq %0, %1;"
 205                          : "+r" (ret), "+m" (a->value)
 206                          :
 207                          : "memory");
 208 #endif
 209     return (void*)ret;
 210 }
 211
 212
 213
 214 /* spinlocks : */
 215
 216 static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
 217 {
 218     x->lock = 0;
 219 }
 220
 221
 222
/* Acquire the spinlock x, spinning until the lock has been taken.
 *
 * x: the spinlock to acquire; its lock word is compared against 0 (free)
 *    and exchanged with 1 (locked).
 */
static inline void tMPI_Spinlock_lock(tMPI_Spinlock_t *x)
{
    /* this is a spinlock with a double loop, as recommended by Intel
       it pauses in the outer loop (the one that just checks for the
       availability of the lock), and thereby reduces bus contention and
       prevents the pipeline from flushing. */
    __asm__ __volatile__("1:\tcmpl $0, %0\n"    /* check the lock */
                         "\tje 2f\n"            /* try to lock if it is
                                                   free by jumping forward */
                         "\tpause\n"            /* otherwise: small pause
                                                   as recommended by Intel */
                         "\tjmp 1b\n"           /* and jump back */

                         "2:\tmovl $1, %%eax\n" /* set eax to 1, the locked
                                                   value of the lock */
                         "\txchgl %%eax, %0\n"  /* atomically exchange
                                                   eax with the lock value */
                         "\tcmpl $0, %%eax\n"   /* compare the exchanged
                                                   value with 0 */
                         "\tjne 1b"             /* jump backward if we didn't
                                                   just lock */
                         : "+m" (x->lock)       /* input & output var */
                         :
                         : "%eax", "memory"     /* eax is used as scratch;
                                                   we changed memory */
                         );
}
 249
 250
 251
/* Release the spinlock x by storing 0 into its lock word.
 *
 * x: the spinlock to release
 */
static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *x)
{
    /* this is apparently all that is needed for unlocking a lock */
    __asm__ __volatile__(
        "\n\tmovl $0, %0\n"             /* plain store of the free value */
        : "=m" (x->lock) : : "memory" ); /* lock word is output-only here */
}
 259
 260
 261
 262 static inline int tMPI_Spinlock_trylock(tMPI_Spinlock_t *x)
 263 {
 264     int old_value = 1;
 265
 266     __asm__ __volatile__("\tmovl %2, %0\n"     /* set eax to 1, the locked
 267                                                   value of the lock */
 268                          "\txchgl %0, %1\n"    /* atomically exchange
 269                                                   eax with the address in
 270                                                   rdx. */
 271                          : "+r" (old_value), "+m" (x->lock)
 272                          : "i" (1)
 273                          : "memory");
 274     return (old_value);
 275 }
 276
 277
 278
 279 static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
 280 {
 281     return ( (*((volatile int*)(&(x->lock)))) != 0);
 282 }
 283
 284
/* Spin until the lock word of x reads 0. The lock is only observed; this
 * function never takes it.
 *
 * x: the spinlock to wait on
 */
static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
{
    /* this is the spinlock without the xchg.  */
    __asm__ __volatile__("1:\tcmpl $0, %0\n" /* check the lock */
                         "\tje 2f\n"         /* leave the loop if it is
                                                free by jumping forward */
                         "\tpause\n"         /* otherwise: small pause
                                                as recommended by Intel */
                         "\tjmp 1b\n"        /* and jump back */
                         "2:\tnop\n"         /* jump target for end
                                                of wait */
                         : "+m" (x->lock)    /* input & output var */
                         :
                         : "memory"          /* memory clobber; the asm
                                                itself only reads the lock */
                         );
    /* disabled C variant of the same wait loop */
#if 0
    do
    {
        tMPI_Atomic_memory_barrier();
    }
    while (tMPI_Spinlock_islocked(x));
#endif
}