3 #ifdef HAVE_TMPI_CONFIG_H
4 #include "tmpi_config.h"
26 #if !(defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) || defined (__CYGWIN__) || defined (__CYGWIN32__)
29 /* We don't have specific NUMA aware allocators: */
31 void *tMPI_Malloc_local(size_t size)
36 void *tMPI_Calloc_local(size_t nmemb, size_t size)
38 return calloc(nmemb, size);
41 void *tMPI_Realloc_local(void *ptr, size_t size)
43 return realloc(ptr, size);
46 int tMPI_Free_numa(void *ptr)
49 return 0; /* we don't detect errors here */
54 #define TMPI_NUMA_MALLOC
57 Windows NUMA memory allocation support.
59 NUMA support is implemented to maximize the chance that memory access
60 patterns remain Local to the NUMA node. This avoids penalties accessing
62 An important assumption here is that code paths which allocate and
63 reallocate heap blocks are likely to be accessing that allocated memory
64 on the same NUMA node.
65 Testing has shown the above criteria to be met, yielding gains of > 15%
66 when on Windows with NUMA hardware.
68 The high level approach is:
69 1. Use a separate heap per NUMA node. This reduces heap contention, steers
70 allocations to the local NUMA node, and avoids re-use of freed heap
71 blocks across (remote) NUMA nodes.
72 2. Allocate each heap locally to each NUMA node, such that heap control
73 structures are on the NUMA node accessing the heap.
74 3. During realloc operations, transfer the new block to the local NUMA
75 node, if appropriate. This is a rare case when thread affinity and
76 access patterns are correct.
77 4. Use GetProcAddress() to obtain function pointers to functions that are
78 operating system version dependent, to allow maximum binary
81 Scott Field (sfield@microsoft.com) Jan-2011
84 //#define _WIN32_WINNT 0x0601
90 __declspec(align()) may not be supported by all compilers, so define the
91 size of the structure manually to force alignment.
92 Note that HeapAlloc() already returns aligned blocks.
93 typedef __declspec(align(32)) struct ...
98 size_t cbAllocationSize; /* 8 */
99 ULONG ProcessorNumber; /* processor number at time of allocation
100 (development/testing) */
101 USHORT NodeNumber; /* NUMA node number at time of allocation
102 (development/testing) */
103 } HEAPHEADER, *PHEAPHEADER;
105 #define HEAP_NUMA_MAGIC 0x05AF0777
106 #define HEAPHEADER_SIZE (32)
109 /* fail compile if size of HEAPHEADER exceeds pre-defined value */
110 C_ASSERT(sizeof(HEAPHEADER) <= HEAPHEADER_SIZE);
113 /* function prototypes and variables to support obtaining function
114 addresses dynamically -- supports down-level operating systems */
116 typedef BOOL (WINAPI *func_GetNumaHighestNodeNumber_t)( PULONG HighestNodeNumber );
117 typedef BOOL (WINAPI *func_GetNumaProcessorNodeEx_t)( PPROCESSOR_NUMBER Processor, PUSHORT NodeNumber );
118 typedef VOID (WINAPI *func_GetCurrentProcessorNumberEx_t)( PPROCESSOR_NUMBER ProcNumber );
120 func_GetNumaHighestNodeNumber_t smalloc_GetNumaHighestNodeNumber; /* WinXP SP2, WinXP64, WinSrv 2003 */
121 func_GetNumaProcessorNodeEx_t smalloc_GetNumaProcessorNodeEx; /* Windows 7, WinSrv 2008R2 */
122 func_GetCurrentProcessorNumberEx_t smalloc_GetCurrentProcessorNumberEx; /* Windows 7, WinSrv 2008R2 */
124 #define NUMA_STATUS_UNKNOWN (0)
125 #define NUMA_STATUS_NOT_NUMA (1)
126 #define NUMA_STATUS_NUMA (2)
128 DWORD g_dwTlsHeap; /* TLS slot used for preferred heap handle */
129 HANDLE *g_hHeap; /* array of heap handles */
130 ULONG g_ulNumaStatus; /* 0 = unknown, 1 = not NUMA, 2 = NUMA */
137 HMODULE hModKernel32; /* module handle to kernel32.dll -- we already
138 reference it, so it's already loaded */
139 ULONG ulNumaHighestNodeNumber;
141 /* grab the addresses for the NUMA functions.
142 It's fine if there is a race condition reaching this routine */
143 hModKernel32 = GetModuleHandleA("kernel32.dll");
144 if(hModKernel32 == NULL)
146 g_ulNumaStatus = NUMA_STATUS_NOT_NUMA;
150 smalloc_GetNumaHighestNodeNumber = (func_GetNumaHighestNodeNumber_t)GetProcAddress( hModKernel32, "GetNumaHighestNodeNumber" );
151 smalloc_GetCurrentProcessorNumberEx = (func_GetCurrentProcessorNumberEx_t)GetProcAddress( hModKernel32, "GetCurrentProcessorNumberEx" );
152 smalloc_GetNumaProcessorNodeEx = (func_GetNumaProcessorNodeEx_t)GetProcAddress( hModKernel32, "GetNumaProcessorNodeEx" );
154 if( (smalloc_GetNumaHighestNodeNumber == NULL) ||
155 (smalloc_GetCurrentProcessorNumberEx == NULL) ||
156 (smalloc_GetNumaProcessorNodeEx == NULL) )
158 g_ulNumaStatus = NUMA_STATUS_NOT_NUMA;
162 /* determine how many NUMA nodes are present */
164 if(!smalloc_GetNumaHighestNodeNumber(&ulNumaHighestNodeNumber) ||
165 (ulNumaHighestNodeNumber == 0) )
167 g_ulNumaStatus = NUMA_STATUS_NOT_NUMA;
171 /* handle deferred creation of TLS slot.
172 note: this could be moved to one-time init path.
173 failures here result in assuming the system is not NUMA.
176 if( g_dwTlsHeap == 0 )
178 DWORD dwTlsHeap = TlsAlloc();
181 if( dwTlsHeap == TLS_OUT_OF_INDEXES )
183 g_ulNumaStatus = NUMA_STATUS_NOT_NUMA;
187 dwPriorValue = (DWORD)InterlockedCompareExchange(
188 (LONG volatile *)&g_dwTlsHeap,
193 if( dwPriorValue != 0 )
195 TlsFree( dwTlsHeap );
199 /* handle deferred creation of heap handle array.
200 note: this could be moved to one-time init path.
203 if( g_hHeap == NULL )
208 /* allocate an array to contain a heap handle for each NUMA node */
209 hHeapNew = (HANDLE*)HeapAlloc(
212 sizeof(HANDLE) * (ulNumaHighestNodeNumber+1)
215 if( hHeapNew == NULL )
217 g_ulNumaStatus = NUMA_STATUS_NOT_NUMA;
221 hPriorValue = (HANDLE *)InterlockedCompareExchange(
222 (LONG volatile *)&g_hHeap,
227 if( hPriorValue != NULL )
229 HeapFree(GetProcessHeap(), 0, hHeapNew);
233 /* indicate system is NUMA */
234 g_ulNumaStatus = NUMA_STATUS_NUMA;
244 HANDLE hHeap; /* preferred heap handle to
246 PROCESSOR_NUMBER CurrentProcessorNumber; /* processor number associated
247 with calling thread */
248 USHORT CurrentNumaNodeNumber; /* NUMA node number associated
249 with calling thread */
251 /* determine NUMA status of system. */
253 if( g_ulNumaStatus == NUMA_STATUS_UNKNOWN )
255 InitNumaHeapSupport();
256 if( g_ulNumaStatus == NUMA_STATUS_NOT_NUMA )
258 return GetProcessHeap();
260 } else if( g_ulNumaStatus == NUMA_STATUS_NOT_NUMA )
262 /* not NUMA, return the process heap handle */
263 return GetProcessHeap();
267 /* return the preferred heap handle from the TLS slot, if set.
268 This is the commonly taken path. */
270 hHeap = (HANDLE)TlsGetValue( g_dwTlsHeap );
278 /* preferred heap handle not yet set.
279 determine the numa node we're executing on, and create a heap which
280 is assigned to this node.
281 one (soft) assumption that is made here is that thread affinity has
282 been set such that threads do not move between NUMA nodes.
285 smalloc_GetCurrentProcessorNumberEx(&CurrentProcessorNumber);
287 if(!smalloc_GetNumaProcessorNodeEx(&CurrentProcessorNumber, &CurrentNumaNodeNumber))
289 /* GetNumaProcessorNodeEx() can fail on WOW64/32bit if invoked
290 against processor numbers > 32.
291 this should never be reached for 64bit builds.
293 CurrentNumaNodeNumber = 0;
297 /* check if the NUMA node array slot already contains a heap */
298 /* CurrentNumaNodeNumber cannot exceed count of heaps, as NUMA nodes
301 hHeap = g_hHeap[ CurrentNumaNodeNumber ];
305 HANDLE hHeapPrior = NULL;
306 ULONG ulOption = 2; /* HEAP_LFH */
308 /* create a heap for this numa node
309 defer creating the heap - while running on each node - to ensure
310 the heap control structures get created on the local NUMA node.
313 hHeap = HeapCreate(0, 0, 0);
317 /* just return the process heap. We'll try to create a heap
319 return GetProcessHeap();
322 /* make the new heap a low-fragmentation heap */
326 0, /* HeapCompatibilityInformation */
331 /* set the array slot entry for this NUMA node to contain the newly
334 hHeapPrior = (HANDLE)InterlockedCompareExchangePointer(&(g_hHeap[CurrentNumaNodeNumber]), hHeap, NULL);
335 if( hHeapPrior != NULL )
337 HeapDestroy( hHeap );
342 /* we reached here since there was no heap assigned to the TLS slot.
344 TlsSetValue(g_dwTlsHeap, hHeap);
352 void *tMPI_Malloc_local(size_t size)
359 hHeap = ReturnHeapHandle();
361 new_size = size + HEAPHEADER_SIZE;
363 ptr = (unsigned char *)HeapAlloc( hHeap, 0, new_size );
370 phdr = (HEAPHEADER*)ptr;
372 phdr->dwMagic = HEAP_NUMA_MAGIC;
373 phdr->hHeap = hHeap; /* track the heap handle for realloc
375 phdr->cbAllocationSize = new_size; /* track the allocation size for
376 realloc and debugging */
378 return ( ptr + HEAPHEADER_SIZE );
382 void *tMPI_Calloc_local(size_t nelem, size_t elsize)
385 size_t size = nelem * elsize;
387 ptr = tMPI_Malloc_local(size);
391 memset(ptr, 0, size);
398 void *tMPI_Realloc_local(void *ptr, size_t size)
402 unsigned char *new_ptr;
405 /* calculate the allocation address and check for presence of the hint
406 which indicates this was allocated by our allocator.
409 phdr = (HEAPHEADER*)((unsigned char*)ptr - HEAPHEADER_SIZE);
411 if( phdr->dwMagic != HEAP_NUMA_MAGIC )
413 /* TODO: call tMPI_Error() */
414 /*gmx_fatal(errno,__FILE__,__LINE__,
415 "Invalid Heap header during realloc. %p", ptr);*/
420 /* calculate size of new/realloc'd block.
423 new_size = size + HEAPHEADER_SIZE;
425 /* if the NUMA Node changed between the initial allocation and the
426 reallocation, copy the memory to an allocation on the new local node:
427 we assume the new realloc'd block is more likely to be manipulated by
428 the current thread which is calling realloc.
429 the simple way to detect this condition is to compare the preferred heap
430 handle with the heap handle stored in the current memory block.
433 hHeap = ReturnHeapHandle();
435 if( hHeap != phdr->hHeap )
437 new_ptr = HeapAlloc( hHeap, 0, new_size );
439 /* if the new allocation succeeded, copy the buffer and free the
443 if( new_ptr != NULL )
447 /* the realloc can be larger or smaller than the original
451 if( new_size > phdr->cbAllocationSize )
453 copy_size = phdr->cbAllocationSize;
455 copy_size = new_size;
458 /* copy the current memory block contents into the newly allocated
459 buffer, and then free the original buffer.
462 memcpy( new_ptr, phdr, copy_size );
464 HeapFree( phdr->hHeap, 0, phdr );
469 /* NodeNumber of existing allocation matches current node number.
470 realloc from the heap associated with the existing allocation.
475 new_ptr = HeapReAlloc(
483 if( new_ptr == NULL )
488 phdr = (HEAPHEADER*)new_ptr;
489 phdr->cbAllocationSize = new_size; /* update allocation size to match
493 return ( new_ptr + HEAPHEADER_SIZE );
497 int tMPI_Free_numa(void *ptr)
501 /* caller doesn't call us on ptr == NULL case, so we don't need to
504 phdr = (HEAPHEADER*)((unsigned char*)ptr - HEAPHEADER_SIZE);
506 /* this check should happen in __try / __except block until the
507 mis-matched free/sfree calls are fixed, but this is here primarily
508 for debugging and catching mis-matched memory alloc and free references.
511 if(phdr->dwMagic != HEAP_NUMA_MAGIC)
513 /* ptr is leaked here, rather than faulting in the allocator.
514 this is done in order to track mis-matched alloc/free calls.
521 HeapFree( phdr->hHeap, 0, phdr );
526 #endif /* NUMA allocation functions for (_WIN32 || _WIN64) */