src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h

   1 /*
   2 This source code file is part of thread_mpi.
   3 Written by Sander Pronk, Erik Lindahl, and possibly others.
   4
   5 Copyright (c) 2009, Sander Pronk, Erik Lindahl.
   6 All rights reserved.
   7
   8 Redistribution and use in source and binary forms, with or without
   9 modification, are permitted provided that the following conditions are met:
  10 1) Redistributions of source code must retain the above copyright
  11    notice, this list of conditions and the following disclaimer.
  12 2) Redistributions in binary form must reproduce the above copyright
  13    notice, this list of conditions and the following disclaimer in the
  14    documentation and/or other materials provided with the distribution.
  15 3) Neither the name of the copyright holders nor the
  16    names of its contributors may be used to endorse or promote products
  17    derived from this software without specific prior written permission.
  18
  19 THIS SOFTWARE IS PROVIDED BY US ''AS IS'' AND ANY
  20 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22 DISCLAIMED. IN NO EVENT SHALL WE BE LIABLE FOR ANY
  23 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  26 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30 If you want to redistribute modifications, please consider that
  31 scientific software is very special. Version control is crucial -
  32 bugs must be traceable. We will be happy to consider code for
  33 inclusion in the official distribution, but derived work should not
  34 be called official thread_mpi. Details are found in the README & COPYING
  35 files.
  36 */
  37
  38
  39
  40 #include <limits.h>
  41 #include <stdint.h>
  42 /* This code is executed for x86 and x86-64, with these compilers:
  43  * GNU
  44  * Intel
  45  * Pathscale
  46  * All these support GCC-style inline assembly.
  47  * We also use this section for the documentation.
  48  */
  49
  50
  51 #if 0
  52 /* Only gcc and Intel support this check, otherwise set it to true (skip doc) */
  53 #if (!defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined DOXYGEN)
  54 #define __builtin_constant_p(i) (1)
  55 #endif
  56 #endif
  57
  58 /* we put all of these on their own cache line by padding the data structure
  59    to the size of a cache line on x86 (64 bytes): */
  60 #define TMPI_SIZEOF_X86_CACHE_LINE 64
  61 typedef struct tMPI_Atomic
  62 {
  63     int value;
  64     char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(int)];
  65 } tMPI_Atomic_t;
  66
  67 typedef struct tMPI_Atomic_ptr
  68 {
  69     void* value;
  70     char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(void*)];
  71 } tMPI_Atomic_ptr_t;
  72
  73 typedef struct tMPI_Spinlock
  74 {
  75     unsigned int lock;
  76     char padding[TMPI_SIZEOF_X86_CACHE_LINE-sizeof(unsigned int)];
  77 } tMPI_Spinlock_t;
  78
  79
  80 #define TMPI_SPINLOCK_INITIALIZER   { 0 }
  81
  82
  83
  84 /* these are guaranteed to be  atomic on x86 and x86_64 */
  85 #define tMPI_Atomic_get(a)  ((a)->value)
  86 #define tMPI_Atomic_set(a,i)  (((a)->value) = (i))
  87
  88 #define tMPI_Atomic_ptr_get(a)  ((a)->value)
  89 #define tMPI_Atomic_ptr_set(a,i)  (((a)->value) = (void*)(i))
  90
  91
  92 /* do the intrinsics.
  93
   We disable this for 32-bit builds because the target may be 80386,
   which didn't have cmpxchg, etc. (they were introduced only as 'recently'
   as the 486, and gcc on some Linux versions still targets 80386 by default).
  97
  98    We also specifically check for icc, because intrinsics are not always
  99    supported there.
 100
   llvm has issues with inline assembly, and it supports the gcc intrinsics
   even in 32-bit builds */
 103 #if ( ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) &&  \
 104       !defined(__INTEL_COMPILER) )  || defined(__llvm__) )
 105 #include "gcc_intrinsics.h"
 106
 107 #else
 108 /* older versions of gcc don't support atomic intrinsics */
 109
 110
 111 #define tMPI_Atomic_memory_barrier() __asm__ __volatile__("sfence;": : :"memory")
 112
 113 static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
 114 {
 115     int __i;
 116
 117     __i = i;
 118     __asm__ __volatile__("lock ; xaddl %0, %1;"
 119                          :"=r"(i) :"m"(a->value), "0"(i) : "memory");
 120     return i + __i;
 121 }
 122
 123 static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
 124 {
 125     __asm__ __volatile__("lock ; xaddl %0, %1;"
 126                          :"=r"(i) :"m"(a->value), "0"(i) : "memory");
 127     return i;
 128 }
 129
 130 static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
 131 {
 132     int prev;
 133
 134     __asm__ __volatile__("lock ; cmpxchgl %1,%2"
 135                          : "=a"(prev)
 136                          : "q"(newval), "m"(a->value), "0"(oldval)
 137                          : "memory");
 138
 139     return prev==oldval;
 140 }
 141
 142 static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t *a,
 143                                       void *oldval,
 144                                       void *newval)
 145 {
 146     void* prev;
 147 #ifndef __x86_64__
 148     __asm__ __volatile__("lock ; cmpxchgl %1,%2"
 149                          : "=a"(prev)
 150                          : "q"(newval), "m"(a->value), "0"(oldval)
 151                          : "memory");
 152 #else
 153     __asm__ __volatile__("lock ; cmpxchgq %1,%2"
 154                          : "=a"(prev)
 155                          : "q"(newval), "m"(a->value), "0"(oldval)
 156                          : "memory");
 157 #endif
 158     return prev==oldval;
 159 }
 160
 161 #endif /* end of check for gcc intrinsics */
 162
 163 #define TMPI_HAVE_SWAP
 164 /* do the swap fns; we told the intrinsics that we have them. */
 165 static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
 166 {
 167     volatile int ret=b;
 168     __asm__ __volatile__("\txchgl %0, %1;"
 169                          :"+r"(ret), "+m"(a->value)
 170                          :
 171                          :"memory");
 172     return (int)ret;
 173 }
 174
 175 static inline void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b)
 176 {
 177     void *volatile *ret=(void* volatile*)b;
 178 #ifndef __x86_64__
 179 /*    __asm__ __volatile__("\txchgl %0, %1;"
 180                          :"=m"(a->value),"=q"(b)
 181                          :"q"(b)
 182                          :"memory");
 183 */
 184     __asm__ __volatile__("\txchgl %0, %1;"
 185                          :"+r"(ret), "+m"(a->value)
 186                          :
 187                          :"memory");
 188
 189 #else
 190     __asm__ __volatile__("\txchgq %0, %1;"
 191                          :"+r"(ret), "+m"(a->value)
 192                          :
 193                          :"memory");
 194 #endif
 195     return (void*)ret;
 196 }
 197
 198
 199
 200 /* spinlocks : */
 201
 202 static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
 203 {
 204     x->lock = 0;
 205 }
 206
 207
 208
 209 static inline void tMPI_Spinlock_lock(tMPI_Spinlock_t *x)
 210 {
 211     /* this is a spinlock with a double loop, as recommended by Intel
 212        it pauses in the outer loop (the one that just checks for the
 213        availability of the lock), and thereby reduces bus contention and
 214        prevents the pipeline from flushing. */
 215     __asm__ __volatile__("1:\tcmpl $0, %0\n"      /* check the lock */
 216                          "\tje 2f\n"              /* try to lock if it is
 217                                                      free by jumping forward */
 218                          "\tpause\n"              /* otherwise: small pause
 219                                                      as recommended by Intel */
 220                          "\tjmp 1b\n"             /* and jump back */
 221
 222                          "2:\tmovl $1, %%eax\n"   /* set eax to 1, the locked
 223                                                      value of the lock */
 224                          "\txchgl %%eax, %0\n"    /* atomically exchange
 225                                                      eax with the lock value */
 226                          "\tcmpl $0, %%eax\n"     /* compare the exchanged
 227                                                      value with 0 */
 228                          "\tjne 1b"               /* jump backward if we didn't
 229                                                      just lock */
 230                          : "+m" (x->lock)         /* input & output var */
 231                          :
 232                          : "%eax", "memory"/* we changed memory */
 233                         );
 234 }
 235
 236
 237
 238 static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *  x)
 239 {
 240     /* this is apparently all that is needed for unlocking a lock */
 241     __asm__ __volatile__(
 242                      "\n\tmovl $0, %0\n"
 243                      : "=m"(x->lock) : : "memory" );
 244 }
 245
 246
 247
 248 static inline int tMPI_Spinlock_trylock(tMPI_Spinlock_t *x)
 249 {
 250     int old_value=1;
 251
 252     __asm__ __volatile__("\tmovl %2, %0\n"     /* set eax to 1, the locked
 253                                                   value of the lock */
 254                          "\txchgl %0, %1\n"    /* atomically exchange
 255                                                   eax with the address in
 256                                                   rdx. */
 257                          : "+r"(old_value), "+m" (x->lock)
 258                          : "i" (1)
 259                          : "memory");
 260     return (old_value);
 261 }
 262
 263
 264
 265 static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
 266 {
 267     return ( (*((volatile int*)(&(x->lock)))) != 0);
 268 }
 269
 270
 271 static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
 272 {
 273     /* this is the spinlock without the xchg.  */
 274     __asm__ __volatile__("1:\tcmpl $0, %0\n"      /* check the lock */
 275                          "\tje 2f\n"              /* try to lock if it is
 276                                                      free by jumping forward */
 277                          "\tpause\n"              /* otherwise: small pause
 278                                                      as recommended by Intel */
 279                          "\tjmp 1b\n"             /* and jump back */
 280                          "2:\tnop\n"              /* jump target for end
 281                                                      of wait */
 282                          : "+m"(x->lock)         /* input & output var */
 283                          :
 284                          : "memory"/* we changed memory */
 285                         );
 286 #if 0
 287     do
 288     {
 289         tMPI_Atomic_memory_barrier();
 290     }
 291     while(tMPI_Spinlock_islocked(x));
 292 #endif
 293 }
 294