3 #ifdef HAVE_TMPI_CONFIG_H
4 #include "tmpi_config.h"
26 #if !(defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) || defined (__CYGWIN__) || defined (__CYGWIN32__)
29 /* We don't have specific NUMA aware allocators: */
/* Allocate size bytes.
   No NUMA-aware allocator is available in this configuration, so the
   request is forwarded to malloc().
   Returns the new block, or NULL if the allocation failed. */
void *tMPI_Malloc_local(size_t size)
{
    return malloc(size);
}
/* Allocate a zero-initialized array of nmemb elements of size bytes each.
   Forwards to calloc(); there is no NUMA-aware allocator in this
   configuration.
   Returns the new block, or NULL if the allocation failed. */
void *tMPI_Calloc_local(size_t nmemb, size_t size)
{
    return calloc(nmemb, size);
}
/* Resize the block at ptr to size bytes.
   Forwards to realloc(); there is no NUMA-aware allocator in this
   configuration.
   Returns the (possibly moved) block, or NULL on failure, in which case
   the original block is left untouched. */
void *tMPI_Realloc_local(void *ptr, size_t size)
{
    return realloc(ptr, size);
}
/* Release a block obtained from tMPI_Malloc_local(), tMPI_Calloc_local()
   or tMPI_Realloc_local().
   Forwards to free(), so a NULL ptr is accepted and ignored.
   Always returns 0. */
int tMPI_Free_numa(void *ptr)
{
    free(ptr);
    return 0; /* we don't detect errors here */
}
54 #define TMPI_NUMA_MALLOC
/*
    Windows NUMA memory allocation support.

    NUMA support is implemented to maximize the chance that memory access
    patterns remain local to the NUMA node.  This avoids penalties for
    accessing "remote" memory.
    An important assumption here is that code paths which allocate and
    reallocate heap blocks are likely to be accessing that allocated memory
    on the same NUMA node.
    Testing has shown the above criteria to be met, yielding gains of > 15%
    when on Windows with NUMA hardware.

    The high level approach is:
    1. Use a separate heap per NUMA node.  This reduces heap contention, steers
       allocations to the local NUMA node, and avoids re-use of freed heap
       blocks across (remote) NUMA nodes.
    2. Allocate each heap locally to each NUMA node, such that heap control
       structures are on the NUMA node accessing the heap.
    3. During realloc operations, transfer the new block to the local NUMA
       node, if appropriate.  This is a rare case when thread affinity and
       access patterns are correct.
    4. Use GetProcAddress() to obtain function pointers to functions that are
       operating system version dependent, to allow maximum binary
       compatibility.

    Scott Field (sfield@microsoft.com)      Jan-2011
 */
84 //#define _WIN32_WINNT 0x0601
/*
    __declspec(align()) may not be supported by all compilers, so define the
    size of the structure manually to force alignment.
    Note that HeapAlloc() already returns aligned blocks.
    typedef __declspec(align(32)) struct ...
 */
98 size_t cbAllocationSize; /* 8 */
99 ULONG ProcessorNumber; /* processor number at time of allocation
100 (development/testing) */
101 USHORT NodeNumber; /* NUMA node number at time of allocation
102 (development/testing) */
103 } HEAPHEADER, *PHEAPHEADER;
105 #define HEAP_NUMA_MAGIC 0x05AF0777
106 #define HEAPHEADER_SIZE (32)
109 /* fail compile if size of HEAPHEADER exceeds pre-defined value */
110 C_ASSERT(sizeof(HEAPHEADER) <= HEAPHEADER_SIZE);
113 /* function prototypes and variables to support obtaining function
114 addresses dynamically -- supports down-level operating systems */
116 typedef BOOL (WINAPI *func_GetNumaHighestNodeNumber_t)( PULONG HighestNodeNumber );
117 typedef BOOL (WINAPI *func_GetNumaProcessorNodeEx_t)( PPROCESSOR_NUMBER Processor, PUSHORT NodeNumber );
118 typedef VOID (WINAPI *func_GetCurrentProcessorNumberEx_t)( PPROCESSOR_NUMBER ProcNumber );
120 func_GetNumaHighestNodeNumber_t smalloc_GetNumaHighestNodeNumber; /* WinXP SP2, WinXP64, WinSrv 2003 */
121 func_GetNumaProcessorNodeEx_t smalloc_GetNumaProcessorNodeEx; /* Windows 7, WinSrv 2008R2 */
122 func_GetCurrentProcessorNumberEx_t smalloc_GetCurrentProcessorNumberEx; /* Windows 7, WinSrv 2008R2 */
124 #define NUMA_STATUS_UNKNOWN (0)
125 #define NUMA_STATUS_NOT_NUMA (1)
126 #define NUMA_STATUS_NUMA (2)
128 DWORD g_dwTlsHeap; /* TLS slot used for preferred heap handle */
129 HANDLE *g_hHeap; /* array of heap handles */
130 ULONG g_ulNumaStatus; /* 0 = unknown, 1 = not NUMA, 2 = NUMA */
137 HMODULE hModKernel32; /* module handle to kernel32.dll -- we already
138 reference it, so it's already loaded */
139 ULONG ulNumaHighestNodeNumber;
141 /* grab the addresses for the NUMA functions.
142 It's fine if there is a race condition reaching this routine */
143 hModKernel32 = GetModuleHandleA("kernel32.dll");
144 if (hModKernel32 == NULL)
146 g_ulNumaStatus = NUMA_STATUS_NOT_NUMA;
150 smalloc_GetNumaHighestNodeNumber = (func_GetNumaHighestNodeNumber_t)GetProcAddress( hModKernel32, "GetNumaHighestNodeNumber" );
151 smalloc_GetCurrentProcessorNumberEx = (func_GetCurrentProcessorNumberEx_t)GetProcAddress( hModKernel32, "GetCurrentProcessorNumberEx" );
152 smalloc_GetNumaProcessorNodeEx = (func_GetNumaProcessorNodeEx_t)GetProcAddress( hModKernel32, "GetNumaProcessorNodeEx" );
154 if ( (smalloc_GetNumaHighestNodeNumber == NULL) ||
155 (smalloc_GetCurrentProcessorNumberEx == NULL) ||
156 (smalloc_GetNumaProcessorNodeEx == NULL) )
158 g_ulNumaStatus = NUMA_STATUS_NOT_NUMA;
162 /* determine how many NUMA nodes are present */
164 if (!smalloc_GetNumaHighestNodeNumber(&ulNumaHighestNodeNumber) ||
165 (ulNumaHighestNodeNumber == 0) )
167 g_ulNumaStatus = NUMA_STATUS_NOT_NUMA;
171 /* handle deferred creation of TLS slot.
172 note: this could be moved to one-time init path.
173 failures here result in assuming the system is not NUMA.
176 if (g_dwTlsHeap == 0)
178 DWORD dwTlsHeap = TlsAlloc();
181 if (dwTlsHeap == TLS_OUT_OF_INDEXES)
183 g_ulNumaStatus = NUMA_STATUS_NOT_NUMA;
187 dwPriorValue = (DWORD)InterlockedCompareExchange(
188 (LONG volatile *)&g_dwTlsHeap,
193 if (dwPriorValue != 0)
195 TlsFree( dwTlsHeap );
199 /* handle deferred creation of heap handle array.
200 note: this could be moved to one-time init path.
208 /* allocate an array to contain a heap handle for each NUMA node */
209 hHeapNew = (HANDLE*)HeapAlloc(
212 sizeof(HANDLE) * (ulNumaHighestNodeNumber+1)
215 if (hHeapNew == NULL)
217 g_ulNumaStatus = NUMA_STATUS_NOT_NUMA;
221 hPriorValue = (HANDLE *)InterlockedCompareExchange(
222 (LONG volatile *)&g_hHeap,
227 if (hPriorValue != NULL)
229 HeapFree(GetProcessHeap(), 0, hHeapNew);
233 /* indicate system is NUMA */
234 g_ulNumaStatus = NUMA_STATUS_NUMA;
244 HANDLE hHeap; /* preferred heap handle to
246 PROCESSOR_NUMBER CurrentProcessorNumber; /* processor number associated
247 with calling thread */
248 USHORT CurrentNumaNodeNumber; /* NUMA node number assocaited
249 with calling thread */
251 /* determine NUMA status of system. */
253 if (g_ulNumaStatus == NUMA_STATUS_UNKNOWN)
255 InitNumaHeapSupport();
256 if (g_ulNumaStatus == NUMA_STATUS_NOT_NUMA)
258 return GetProcessHeap();
261 else if (g_ulNumaStatus == NUMA_STATUS_NOT_NUMA)
263 /* not NUMA, return the process heap handle */
264 return GetProcessHeap();
268 /* return the preferred heap handle from the TLS slot, if set.
269 This is the commonly taken path. */
271 hHeap = (HANDLE)TlsGetValue( g_dwTlsHeap );
279 /* preferred heap handle not yet set.
280 determine the numa node we're executing on, and create a heap which
281 is assigned to this node.
282 one (soft) assumption that is made here is that thread affinity has
283 been set such that threads do not move between NUMA nodes.
286 smalloc_GetCurrentProcessorNumberEx(&CurrentProcessorNumber);
288 if (!smalloc_GetNumaProcessorNodeEx(&CurrentProcessorNumber, &CurrentNumaNodeNumber))
290 /* GetNumaProcessorNodeEx() can fail on WOW64/32bit if invoked
291 against processor numbers > 32.
292 this should never be reached for 64bit builds.
294 CurrentNumaNodeNumber = 0;
298 /* check if the NUMA node array slot already contains a heap */
299 /* CurrentNumaNodeNumber cannot execeed count of heaps, as NUMA nodes
302 hHeap = g_hHeap[ CurrentNumaNodeNumber ];
306 HANDLE hHeapPrior = NULL;
307 ULONG ulOption = 2; /* HEAP_LFH */
309 /* create a heap for this numa node
310 defer creating the heap - while running on each node - to ensure
311 the heap control structures get created on the local NUMA node.
314 hHeap = HeapCreate(0, 0, 0);
318 /* just return the process heap. We'll try to create a heap
320 return GetProcessHeap();
323 /* make the new heap a low-fragmentation heap */
327 0, /* HeapCompatibilityInformation */
332 /* set the array slot entry for this NUMA node to contain the newly
335 hHeapPrior = (HANDLE)InterlockedCompareExchangePointer(&(g_hHeap[CurrentNumaNodeNumber]), hHeap, NULL);
336 if (hHeapPrior != NULL)
338 HeapDestroy( hHeap );
343 /* we reached here since there was no heap assigned to the TLS slot.
345 TlsSetValue(g_dwTlsHeap, hHeap);
353 void *tMPI_Malloc_local(size_t size)
360 hHeap = ReturnHeapHandle();
362 new_size = size + HEAPHEADER_SIZE;
364 ptr = (unsigned char *)HeapAlloc( hHeap, 0, new_size );
371 phdr = (HEAPHEADER*)ptr;
373 phdr->dwMagic = HEAP_NUMA_MAGIC;
374 phdr->hHeap = hHeap; /* track the heap handle for realloc
376 phdr->cbAllocationSize = new_size; /* track the allocation size for
377 realloc and debugging */
379 return ( ptr + HEAPHEADER_SIZE );
/* Allocate a zero-filled array of nelem elements of elsize bytes each via
   tMPI_Malloc_local().
   Returns NULL if nelem * elsize does not fit in size_t or if the
   allocation fails. */
void *tMPI_Calloc_local(size_t nelem, size_t elsize)
{
    void  *ptr;
    size_t size;

    /* reject products that would wrap and yield an undersized block */
    if (elsize != 0 && nelem > (size_t)-1 / elsize)
    {
        return NULL;
    }

    size = nelem * elsize;

    ptr = tMPI_Malloc_local(size);

    if (ptr != NULL)
    {
        memset(ptr, 0, size);
    }

    return ptr;
}
399 void *tMPI_Realloc_local(void *ptr, size_t size)
403 unsigned char *new_ptr;
406 /* calculate the allocation address and check for presence of the hint
407 which indicates this was allocated by our allocator.
410 phdr = (HEAPHEADER*)((unsigned char*)ptr - HEAPHEADER_SIZE);
412 if (phdr->dwMagic != HEAP_NUMA_MAGIC)
414 /* TODO: call tMPI_Error() */
415 /*gmx_fatal(errno,__FILE__,__LINE__,
416 "Invalid Heap header during realloc. %p", ptr);*/
421 /* calculate size of new/realloc'd block.
424 new_size = size + HEAPHEADER_SIZE;
426 /* if the NUMA Node changed between the initial allocation and the
427 reallocation, copy the memory to an allocation on the new local node:
428 we assume the new realloc'd block is more likely to be manipulated by
429 the current thread which is calling realloc.
430 the simple way to detect this condition is to compare the preferred heap
431 handle with the heap handle stored in the current memory block.
434 hHeap = ReturnHeapHandle();
436 if (hHeap != phdr->hHeap)
438 new_ptr = HeapAlloc( hHeap, 0, new_size );
440 /* if the new allocation succeeded, copy the buffer and free the
448 /* the realloc can be larger or smaller than the original
452 if (new_size > phdr->cbAllocationSize)
454 copy_size = phdr->cbAllocationSize;
458 copy_size = new_size;
461 /* copy the current memory block contents into the newly allocated
462 buffer, and then free the original buffer.
465 memcpy( new_ptr, phdr, copy_size );
467 HeapFree( phdr->hHeap, 0, phdr );
474 /* NodeNumber of existing allocation matches current node number.
475 realloc from the heap associated with the existing allocation.
480 new_ptr = HeapReAlloc(
493 phdr = (HEAPHEADER*)new_ptr;
494 phdr->cbAllocationSize = new_size; /* update allocation size to match
498 return ( new_ptr + HEAPHEADER_SIZE );
502 int tMPI_Free_numa(void *ptr)
506 /* caller doesn't call us on ptr == NULL case, so we don't need to
509 phdr = (HEAPHEADER*)((unsigned char*)ptr - HEAPHEADER_SIZE);
511 /* this check should happen in __try / __except block until the
512 mis-matched free/sfree calls are fixed, but this is here primarilly
513 for debugging and catching mis-matched memory alloc and free references.
516 if (phdr->dwMagic != HEAP_NUMA_MAGIC)
518 /* ptr is leaked here, rather than faulting in the allocator.
519 this is done in order to track mis-matched alloc/free calls.
526 HeapFree( phdr->hHeap, 0, phdr );
531 #endif /* NUMA allocation functions for (_WIN32 || _WIN64) */