diff --git a/client/tracy_rpmalloc.cpp b/client/tracy_rpmalloc.cpp
new file mode 100644
index 00000000..c32f92c8
--- /dev/null
+++ b/client/tracy_rpmalloc.cpp
@@ -0,0 +1,1778 @@
+/* rpmalloc.c - Memory allocator - Public Domain - 2016 Mattias Jansson / Rampant Pixels
+ *
+ * This library provides a cross-platform lock free thread caching malloc implementation in C11.
+ * The latest source code is always available at
+ *
+ * https://github.com/rampantpixels/rpmalloc
+ *
+ * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions.
+ *
+ */
+
+#include "tracy_rpmalloc.hpp"
+
+// Build time configurable limits
+
+// Presets, if none is defined it will default to performance priority
+//#define ENABLE_UNLIMITED_CACHE
+//#define DISABLE_CACHE
+//#define ENABLE_SPACE_PRIORITY_CACHE
+
+// Presets for cache limits
+#if defined(ENABLE_UNLIMITED_CACHE)
+// Unlimited caches
+#define MIN_SPAN_CACHE_RELEASE 16
+#define MAX_SPAN_CACHE_DIVISOR 1
+#elif defined(DISABLE_CACHE)
+// Disable cache
+#define MIN_SPAN_CACHE_RELEASE 1
+#define MAX_SPAN_CACHE_DIVISOR 0
+#elif defined(ENABLE_SPACE_PRIORITY_CACHE)
+// Space priority cache limits
+#define MIN_SPAN_CACHE_SIZE 8
+#define MIN_SPAN_CACHE_RELEASE 8
+#define MAX_SPAN_CACHE_DIVISOR 16
+#define GLOBAL_SPAN_CACHE_MULTIPLIER 1
+#else
+// Default - performance priority cache limits
+//! Limit of thread cache in number of spans for each page count class (undefine for unlimited cache - i.e. never release spans to global cache unless thread finishes)
+//! Minimum cache size to remain after a release to global cache
+#define MIN_SPAN_CACHE_SIZE 8
+//! Minimum number of spans to transfer between thread and global cache
+#define MIN_SPAN_CACHE_RELEASE 16
+//! Maximum cache size divisor (max cache size will be max allocation count divided by this divisor)
+#define MAX_SPAN_CACHE_DIVISOR 8
+//! Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this)
+#define GLOBAL_SPAN_CACHE_MULTIPLIER 4
+#endif
+
+//! Size of heap hashmap
+#define HEAP_ARRAY_SIZE 79
+
+#ifndef ENABLE_VALIDATE_ARGS
+//! Enable validation of args to public entry points
+#define ENABLE_VALIDATE_ARGS 0
+#endif
+
+#ifndef ENABLE_STATISTICS
+//! Enable statistics collection
+#define ENABLE_STATISTICS 0
+#endif
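All three presets above feed the same adaptive formula: each class tracks its allocation high water mark, and the thread cache may hold max_allocations / MAX_SPAN_CACHE_DIVISOR spans (see _memory_counter_increase further down). A minimal standalone sketch of that arithmetic, using a hypothetical helper name:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring _memory_counter_increase's limit update:
   divisor 0 disables caching, divisor 1 approaches an unlimited cache. */
static uint32_t cache_limit(uint32_t max_allocations, uint32_t divisor) {
    return divisor ? (max_allocations / divisor) : 0;
}

int main(void) {
    /* a thread that peaked at 256 live spans in one span class */
    printf("performance (divisor 8): %u spans\n", cache_limit(256, 8));  /* 32 */
    printf("space (divisor 16):      %u spans\n", cache_limit(256, 16)); /* 16 */
    return 0;
}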
+
+#ifndef ENABLE_ASSERTS
+//! Enable asserts
+#define ENABLE_ASSERTS 0
+#endif
+
+// Platform and arch specifics
+
+#ifdef _MSC_VER
+# define ALIGNED_STRUCT(name, alignment) __declspec(align(alignment)) struct name
+# define FORCEINLINE __forceinline
+# define TLS_MODEL
+# define _Static_assert static_assert
+# define _Thread_local __declspec(thread)
+# define atomic_thread_fence_acquire() //_ReadWriteBarrier()
+# define atomic_thread_fence_release() //_ReadWriteBarrier()
+# if ENABLE_VALIDATE_ARGS
+# include <Intsafe.h>
+# endif
+#else
+# define ALIGNED_STRUCT(name, alignment) struct __attribute__((__aligned__(alignment))) name
+# define FORCEINLINE inline __attribute__((__always_inline__))
+# define TLS_MODEL __attribute__((tls_model("initial-exec")))
+# if !defined(__clang__) && defined(__GNUC__)
+# define _Thread_local __thread
+# endif
+# ifdef __arm__
+# define atomic_thread_fence_acquire() __asm volatile("dmb sy" ::: "memory")
+# define atomic_thread_fence_release() __asm volatile("dmb st" ::: "memory")
+# else
+# define atomic_thread_fence_acquire() //__asm volatile("" ::: "memory")
+# define atomic_thread_fence_release() //__asm volatile("" ::: "memory")
+# endif
+#endif
+
+#if defined( __x86_64__ ) || defined( _M_AMD64 ) || defined( _M_X64 ) || defined( _AMD64_ ) || defined( __arm64__ ) || defined( __aarch64__ )
+# define ARCH_64BIT 1
+#else
+# define ARCH_64BIT 0
+#endif
+
+#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 )
+# define PLATFORM_WINDOWS 1
+#else
+# define PLATFORM_POSIX 1
+#endif
+
+#include <stdint.h>
+#include <string.h>
+
+#if ENABLE_ASSERTS
+# include <assert.h>
+#else
+# define assert(x)
+#endif
+
+// Atomic access abstraction
+ALIGNED_STRUCT(atomic32_t, 4) {
+    int32_t nonatomic;
+};
+typedef struct atomic32_t atomic32_t;
+
+ALIGNED_STRUCT(atomic64_t, 8) {
+    int64_t nonatomic;
+};
+typedef struct atomic64_t atomic64_t;
+
+ALIGNED_STRUCT(atomicptr_t, 8) {
+    void* nonatomic;
+};
+typedef struct atomicptr_t atomicptr_t;
+
+static FORCEINLINE int32_t
+atomic_load32(atomic32_t* src) {
+    return src->nonatomic;
+}
+
+static FORCEINLINE void
+atomic_store32(atomic32_t* dst, int32_t val) {
+    dst->nonatomic = val;
+}
+
+#if PLATFORM_POSIX
+
+static FORCEINLINE void
+atomic_store64(atomic64_t* dst, int64_t val) {
+    dst->nonatomic = val;
+}
+
+static FORCEINLINE int64_t
+atomic_exchange_and_add64(atomic64_t* dst, int64_t add) {
+    return __sync_fetch_and_add(&dst->nonatomic, add);
+}
+
+#endif
+
+static FORCEINLINE int32_t
+atomic_incr32(atomic32_t* val) {
+#ifdef _MSC_VER
+    int32_t old = (int32_t)_InterlockedExchangeAdd((volatile long*)&val->nonatomic, 1);
+    return (old + 1);
+#else
+    return __sync_add_and_fetch(&val->nonatomic, 1);
+#endif
+}
+
+static FORCEINLINE int32_t
+atomic_add32(atomic32_t* val, int32_t add) {
+#ifdef _MSC_VER
+    int32_t old = (int32_t)_InterlockedExchangeAdd((volatile long*)&val->nonatomic, add);
+    return (old + add);
+#else
+    return __sync_add_and_fetch(&val->nonatomic, add);
+#endif
+}
+
+static FORCEINLINE void*
+atomic_load_ptr(atomicptr_t* src) {
+    return src->nonatomic;
+}
+
+static FORCEINLINE void
+atomic_store_ptr(atomicptr_t* dst, void* val) {
+    dst->nonatomic = val;
+}
+
+static FORCEINLINE int
+atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref);
+
+static void
+thread_yield(void);
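The wrappers above deliberately keep loads and stores as plain accesses and reserve compiler/CPU primitives for read-modify-write operations and fences. For comparison, a sketch of the same CAS contract in portable C11 (an illustration only; the patch itself avoids <stdatomic.h> so it can also build under MSVC):

#include <stdatomic.h>

typedef struct { _Atomic(void*) value; } c11_atomicptr_t;

/* Same contract as atomic_cas_ptr above: returns nonzero if *dst held `ref`
   and has been replaced by `val`. Relaxed ordering matches the explicit
   acquire/release fences used throughout the allocator. */
static int c11_cas_ptr(c11_atomicptr_t* dst, void* val, void* ref) {
    return atomic_compare_exchange_strong_explicit(&dst->value, &ref, val,
        memory_order_relaxed, memory_order_relaxed);
}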
+
+// Preconfigured limits and sizes
+
+//! Memory page size
+#define PAGE_SIZE 4096
+
+//! Granularity of all memory page spans for small & medium block allocations
+#define SPAN_ADDRESS_GRANULARITY 65536
+//! Maximum size of a span of memory pages
+#define SPAN_MAX_SIZE (SPAN_ADDRESS_GRANULARITY)
+//! Mask for getting the start of a span of memory pages
+#define SPAN_MASK (~((uintptr_t)SPAN_MAX_SIZE - 1))
+//! Maximum number of memory pages in a span
+#define SPAN_MAX_PAGE_COUNT (SPAN_MAX_SIZE / PAGE_SIZE)
+//! Span size class granularity
+#define SPAN_CLASS_GRANULARITY 4
+//! Number of size classes for spans
+#define SPAN_CLASS_COUNT (SPAN_MAX_PAGE_COUNT / SPAN_CLASS_GRANULARITY)
+
+//! Granularity of a small allocation block
+#define SMALL_GRANULARITY 16
+//! Small granularity shift count
+#define SMALL_GRANULARITY_SHIFT 4
+//! Number of small block size classes
+#define SMALL_CLASS_COUNT (((PAGE_SIZE - SPAN_HEADER_SIZE) >> 1) >> SMALL_GRANULARITY_SHIFT)
+//! Maximum size of a small block
+#define SMALL_SIZE_LIMIT (SMALL_CLASS_COUNT * SMALL_GRANULARITY)
+
+//! Granularity of a medium allocation block
+#define MEDIUM_GRANULARITY 512
+//! Medium granularity shift count
+#define MEDIUM_GRANULARITY_SHIFT 9
+//! Number of medium block size classes
+#define MEDIUM_CLASS_COUNT 60
+//! Maximum size of a medium block
+#define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT) - SPAN_HEADER_SIZE)
+
+//! Total number of small + medium size classes
+#define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT)
+
+//! Number of large block size classes
+#define LARGE_CLASS_COUNT 32
+//! Maximum number of memory pages in a large block
+#define LARGE_MAX_PAGES (SPAN_MAX_PAGE_COUNT * LARGE_CLASS_COUNT)
+//! Maximum size of a large block
+#define LARGE_SIZE_LIMIT ((LARGE_MAX_PAGES * PAGE_SIZE) - SPAN_HEADER_SIZE)
+
+#define SPAN_LIST_LOCK_TOKEN ((void*)1)
+
+#define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs))
+#define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second))
+
+//! Size of a span header
+#define SPAN_HEADER_SIZE 32
+
+#if ARCH_64BIT
+typedef int64_t offset_t;
+#else
+typedef int32_t offset_t;
+#endif
+typedef uint32_t count_t;
+
+#if ENABLE_VALIDATE_ARGS
+//! Maximum allocation size to avoid integer overflow
+#define MAX_ALLOC_SIZE (((size_t)-1) - PAGE_SIZE)
+#endif
+
+// Data types
+
+//! A memory heap, per thread
+typedef struct heap_t heap_t;
+//! Span of memory pages
+typedef struct span_t span_t;
+//! Size class definition
+typedef struct size_class_t size_class_t;
+//! Span block bookkeeping
+typedef struct span_block_t span_block_t;
+//! Span data union, usage depending on span state
+typedef union span_data_t span_data_t;
+//! Cache data
+typedef struct span_counter_t span_counter_t;
+
+struct span_block_t {
+    //! Free list
+    uint16_t free_list;
+    //! First autolinked block
+    uint16_t first_autolink;
+    //! Free count
+    uint16_t free_count;
+    //! Padding
+    uint16_t padding;
+};
+
+union span_data_t {
+    //! Span data
+    span_block_t block;
+    //! List size (used when span is part of a list)
+    uint32_t list_size;
+};
+
+struct span_t {
+    //! Heap ID
+    atomic32_t heap_id;
+    //! Size class
+    count_t size_class;
+    //! Span data
+    span_data_t data;
+    //! Next span
+    span_t* next_span;
+    //! Previous span
+    span_t* prev_span;
+};
+_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch");
+
+struct span_counter_t {
+    //! Allocation high water mark
+    uint32_t max_allocations;
+    //! Current number of allocations
+    uint32_t current_allocations;
+    //! Cache limit
+    uint32_t cache_limit;
+};
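Everything below leans on one invariant: spans are mapped on 64KiB boundaries, so SPAN_MASK recovers the owning span header from any interior block pointer with a single AND. A tiny self-contained demonstration (the addresses are hypothetical):

#include <assert.h>
#include <stdint.h>

#define DEMO_SPAN_SIZE 65536
#define DEMO_SPAN_MASK (~((uintptr_t)DEMO_SPAN_SIZE - 1))

int main(void) {
    uintptr_t span_base = 0x7f0000020000;     /* 64KiB-aligned span start */
    uintptr_t block     = span_base + 0x1234; /* any block inside the span */
    assert((block & DEMO_SPAN_MASK) == span_base); /* header found by masking */
    return 0;
}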
+
+struct heap_t {
+    //! Heap ID
+    int32_t id;
+    //! Deferred deallocation
+    atomicptr_t defer_deallocate;
+    //! Free count for each size class active span
+    span_block_t active_block[SIZE_CLASS_COUNT];
+    //! Active span for each size class
+    span_t* active_span[SIZE_CLASS_COUNT];
+    //! List of semi-used spans with free blocks for each size class (double linked list)
+    span_t* size_cache[SIZE_CLASS_COUNT];
+    //! List of free spans for each page count (single linked list)
+    span_t* span_cache[SPAN_CLASS_COUNT];
+    //! Allocation counters
+    span_counter_t span_counter[SPAN_CLASS_COUNT];
+    //! List of free spans for each large class count (single linked list)
+    span_t* large_cache[LARGE_CLASS_COUNT];
+    //! Allocation counters for large blocks
+    span_counter_t large_counter[LARGE_CLASS_COUNT];
+    //! Next heap in id list
+    heap_t* next_heap;
+    //! Next heap in orphan list
+    heap_t* next_orphan;
+#if ENABLE_STATISTICS
+    //! Number of bytes currently requested in allocations
+    size_t requested;
+    //! Number of bytes currently allocated
+    size_t allocated;
+    //! Number of bytes transitioned thread -> global
+    size_t thread_to_global;
+    //! Number of bytes transitioned global -> thread
+    size_t global_to_thread;
+#endif
+};
+_Static_assert(sizeof(heap_t) <= PAGE_SIZE*2, "heap size mismatch");
+
+struct size_class_t {
+    //! Size of blocks in this class
+    uint16_t size;
+    //! Number of pages to allocate for a chunk
+    uint16_t page_count;
+    //! Number of blocks in each chunk
+    uint16_t block_count;
+    //! Class index this class is merged with
+    uint16_t class_idx;
+};
+_Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch");
+
+//! Global size classes
+static size_class_t _memory_size_class[SIZE_CLASS_COUNT];
+
+//! Heap ID counter
+static atomic32_t _memory_heap_id;
+
+#ifdef PLATFORM_POSIX
+//! Virtual memory address counter
+static atomic64_t _memory_addr;
+#endif
+
+//! Global span cache
+static atomicptr_t _memory_span_cache[SPAN_CLASS_COUNT];
+
+//! Global large cache
+static atomicptr_t _memory_large_cache[LARGE_CLASS_COUNT];
+
+//! Current thread heap
+static _Thread_local heap_t* _memory_thread_heap TLS_MODEL;
+
+//! All heaps
+static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE];
+
+//! Orphaned heaps
+static atomicptr_t _memory_orphan_heaps;
+
+//! Active heap count
+static atomic32_t _memory_active_heaps;
+
+//! Adaptive cache max allocation count
+static uint32_t _memory_max_allocation[SPAN_CLASS_COUNT];
+
+//! Adaptive cache max allocation count for large allocations
+static uint32_t _memory_max_allocation_large[LARGE_CLASS_COUNT];
+
+#if ENABLE_STATISTICS
+//! Total number of mapped memory pages
+static atomic32_t _mapped_pages;
+//! Running counter of total number of mapped memory pages since start
+static atomic32_t _mapped_total;
+//! Running counter of total number of unmapped memory pages since start
+static atomic32_t _unmapped_total;
+#endif
+
+static void*
+_memory_map(size_t page_count);
+
+static void
+_memory_unmap(void* ptr, size_t page_count);
+
+static int
+_memory_deallocate_deferred(heap_t* heap, size_t size_class);
+
+//! Lookup a memory heap from heap ID
+static heap_t*
+_memory_heap_lookup(int32_t id) {
+    uint32_t list_idx = id % HEAP_ARRAY_SIZE;
+    heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]);
+    while (heap && (heap->id != id))
+        heap = heap->next_heap;
+    return heap;
+}
+
+//! Get the span size class from page count
+static size_t
+_span_class_from_page_count(size_t page_count) {
+    assert((page_count > 0) && (page_count <= 16));
+    return ((page_count + SPAN_CLASS_GRANULARITY - 1) / SPAN_CLASS_GRANULARITY) - 1;
+}
+
+//!
Increase an allocation counter +static void +_memory_counter_increase(span_counter_t* counter, uint32_t* global_counter) { + if (++counter->current_allocations > counter->max_allocations) { + counter->max_allocations = counter->current_allocations; +#if MAX_SPAN_CACHE_DIVISOR > 0 + counter->cache_limit = counter->max_allocations / MAX_SPAN_CACHE_DIVISOR; +#endif + if (counter->max_allocations > *global_counter) + *global_counter = counter->max_allocations; + } +} + +//! Insert the given list of memory page spans in the global cache for small/medium blocks +static void +_memory_global_cache_insert(span_t* first_span, size_t list_size, size_t page_count) { + assert((list_size == 1) || (first_span->next_span != 0)); +#if MAX_SPAN_CACHE_DIVISOR > 0 + while (1) { + size_t span_class_idx = _span_class_from_page_count(page_count); + void* global_span_ptr = atomic_load_ptr(&_memory_span_cache[span_class_idx]); + if (global_span_ptr != SPAN_LIST_LOCK_TOKEN) { + uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~SPAN_MASK; + span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); + +#ifdef GLOBAL_SPAN_CACHE_MULTIPLIER + size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation[span_class_idx] / MAX_SPAN_CACHE_DIVISOR); + if ((global_list_size >= cache_limit) && (global_list_size > MIN_SPAN_CACHE_SIZE)) + break; +#endif + //We only have 16 bits for size of list, avoid overflow + if ((global_list_size + list_size) > 0xFFFF) + break; + + //Use prev_span as skip pointer over this sublist range of spans + first_span->data.list_size = (uint32_t)list_size; + first_span->prev_span = global_span; + + //Insert sublist into global cache + global_list_size += list_size; + void* first_span_ptr = (void*)((uintptr_t)first_span | global_list_size); + if (atomic_cas_ptr(&_memory_span_cache[span_class_idx], first_span_ptr, global_span_ptr)) + return; + } + else { + //Atomic operation failed, yield timeslice and retry + thread_yield(); + atomic_thread_fence_acquire(); + } + } +#endif + //Global cache full, release pages + for (size_t ispan = 0; ispan < list_size; ++ispan) { + assert(first_span); + span_t* next_span = first_span->next_span; + _memory_unmap(first_span, page_count); + first_span = next_span; + } +} + +//! Extract a number of memory page spans from the global cache for small/medium blocks +static span_t* +_memory_global_cache_extract(size_t page_count) { + span_t* span = 0; + size_t span_class_idx = _span_class_from_page_count(page_count); + atomicptr_t* cache = &_memory_span_cache[span_class_idx]; + atomic_thread_fence_acquire(); + void* global_span_ptr = atomic_load_ptr(cache); + while (global_span_ptr) { + if ((global_span_ptr != SPAN_LIST_LOCK_TOKEN) && + atomic_cas_ptr(cache, SPAN_LIST_LOCK_TOKEN, global_span_ptr)) { + //Grab a number of thread cache spans, using the skip span pointer + //stored in prev_span to quickly skip ahead in the list to get the new head + uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~SPAN_MASK; + span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); + assert((span->data.list_size == 1) || (span->next_span != 0)); + + span_t* new_global_span = span->prev_span; + global_span_count -= span->data.list_size; + + //Set new head of global cache list + void* new_cache_head = global_span_count ? 
+ ((void*)((uintptr_t)new_global_span | global_span_count)) : + 0; + atomic_store_ptr(cache, new_cache_head); + atomic_thread_fence_release(); + break; + } + + //List busy, yield timeslice and retry + thread_yield(); + atomic_thread_fence_acquire(); + global_span_ptr = atomic_load_ptr(cache); + } + + return span; +} + +/*! Insert the given list of memory page spans in the global cache for large blocks, + similar to _memory_global_cache_insert */ +static void +_memory_global_cache_large_insert(span_t* span_list, size_t list_size, size_t span_count) { + assert((list_size == 1) || (span_list->next_span != 0)); + assert(span_list->size_class == (SIZE_CLASS_COUNT + (span_count - 1))); +#if MAX_SPAN_CACHE_DIVISOR > 0 + atomicptr_t* cache = &_memory_large_cache[span_count - 1]; + while (1) { + void* global_span_ptr = atomic_load_ptr(cache); + if (global_span_ptr != SPAN_LIST_LOCK_TOKEN) { + uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~SPAN_MASK; + span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); + +#ifdef GLOBAL_SPAN_CACHE_MULTIPLIER + size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation_large[span_count-1] / MAX_SPAN_CACHE_DIVISOR); + if ((global_list_size >= cache_limit) && (global_list_size > MIN_SPAN_CACHE_SIZE)) + break; +#endif + if ((global_list_size + list_size) > 0xFFFF) + break; + + span_list->data.list_size = (uint32_t)list_size; + span_list->prev_span = global_span; + + global_list_size += list_size; + void* new_global_span_ptr = (void*)((uintptr_t)span_list | global_list_size); + if (atomic_cas_ptr(cache, new_global_span_ptr, global_span_ptr)) + return; + } + else { + thread_yield(); + atomic_thread_fence_acquire(); + } + } +#endif + //Global cache full, release spans + for (size_t ispan = 0; ispan < list_size; ++ispan) { + assert(span_list); + span_t* next_span = span_list->next_span; + _memory_unmap(span_list, span_count * SPAN_MAX_PAGE_COUNT); + span_list = next_span; + } +} + +/*! Extract a number of memory page spans from the global cache for large blocks, + similar to _memory_global_cache_extract */ +static span_t* +_memory_global_cache_large_extract(size_t span_count) { + span_t* span = 0; + atomicptr_t* cache = &_memory_large_cache[span_count - 1]; + atomic_thread_fence_acquire(); + void* global_span_ptr = atomic_load_ptr(cache); + while (global_span_ptr) { + if ((global_span_ptr != SPAN_LIST_LOCK_TOKEN) && + atomic_cas_ptr(cache, SPAN_LIST_LOCK_TOKEN, global_span_ptr)) { + uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~SPAN_MASK; + span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); + assert((span->data.list_size == 1) || (span->next_span != 0)); + assert(span->size_class == (SIZE_CLASS_COUNT + (span_count - 1))); + + span_t* new_global_span = span->prev_span; + global_list_size -= span->data.list_size; + + void* new_global_span_ptr = global_list_size ? + ((void*)((uintptr_t)new_global_span | global_list_size)) : + 0; + atomic_store_ptr(cache, new_global_span_ptr); + atomic_thread_fence_release(); + break; + } + + thread_yield(); + atomic_thread_fence_acquire(); + global_span_ptr = atomic_load_ptr(cache); + } + return span; +} + +//! 
Allocate a small/medium sized memory block from the given heap
+static void*
+_memory_allocate_from_heap(heap_t* heap, size_t size) {
+#if ENABLE_STATISTICS
+    //For statistics we need to store the requested size in the memory block
+    size += sizeof(size_t);
+#endif
+
+    //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes)
+    const size_t class_idx = _memory_size_class[(size <= SMALL_SIZE_LIMIT) ?
+        ((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT) - 1 :
+        SMALL_CLASS_COUNT + ((size - SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY - 1)) >> MEDIUM_GRANULARITY_SHIFT) - 1].class_idx;
+
+    span_block_t* active_block = heap->active_block + class_idx;
+    size_class_t* size_class = _memory_size_class + class_idx;
+    const count_t class_size = size_class->size;
+
+#if ENABLE_STATISTICS
+    heap->allocated += class_size;
+    heap->requested += size;
+#endif
+
+    //Step 1: Try to get a block from the currently active span. The span block bookkeeping
+    //        data for the active span is stored in the heap for faster access
+use_active:
+    if (active_block->free_count) {
+        //Happy path, we have a span with at least one free block
+        span_t* span = heap->active_span[class_idx];
+        count_t offset = class_size * active_block->free_list;
+        uint32_t* block = pointer_offset(span, SPAN_HEADER_SIZE + offset);
+        assert(span);
+
+        --active_block->free_count;
+        if (!active_block->free_count) {
+            //Span is now completely allocated, set the bookkeeping data in the
+            //span itself and reset the active span pointer in the heap
+            span->data.block.free_count = 0;
+            span->data.block.first_autolink = (uint16_t)size_class->block_count;
+            heap->active_span[class_idx] = 0;
+        }
+        else {
+            //Get the next free block, either from linked list or from auto link
+            if (active_block->free_list < active_block->first_autolink) {
+                active_block->free_list = (uint16_t)(*block);
+            }
+            else {
+                ++active_block->free_list;
+                ++active_block->first_autolink;
+            }
+            assert(active_block->free_list < size_class->block_count);
+        }
+
+#if ENABLE_STATISTICS
+        //Store the requested size for statistics
+        *(size_t*)pointer_offset(block, class_size - sizeof(size_t)) = size;
+#endif
+
+        return block;
+    }
+
+    //Step 2: No active span, try executing deferred deallocations and try again if there
+    //        was at least one of the requested size class
+    if (_memory_deallocate_deferred(heap, class_idx)) {
+        if (active_block->free_count)
+            goto use_active;
+    }
+
+    //Step 3: Check if there is a semi-used span of the requested size class available
+    if (heap->size_cache[class_idx]) {
+        //Promote a pending semi-used span to be active, storing bookkeeping data in
+        //the heap structure for faster access
+        span_t* span = heap->size_cache[class_idx];
+        *active_block = span->data.block;
+        assert(active_block->free_count > 0);
+        span_t* next_span = span->next_span;
+        heap->size_cache[class_idx] = next_span;
+        heap->active_span[class_idx] = span;
+        goto use_active;
+    }
+
+    //Step 4: No semi-used span available, try grabbing a span from the thread cache
+    size_t span_class_idx = _span_class_from_page_count(size_class->page_count);
+    span_t* span = heap->span_cache[span_class_idx];
+    if (!span) {
+        //Step 5: No span available in the thread cache, try grabbing a list of spans from the global cache
+        span = _memory_global_cache_extract(size_class->page_count);
+#if ENABLE_STATISTICS
+        if (span)
+            heap->global_to_thread += (size_t)span->data.list_size * size_class->page_count * PAGE_SIZE;
+#endif
+    }
+    if (span) {
+        if (span->data.list_size > 
1) { + //We got a list of spans, we will use first as active and store remainder in thread cache + span_t* next_span = span->next_span; + assert(next_span); + next_span->data.list_size = span->data.list_size - 1; + heap->span_cache[span_class_idx] = next_span; + } + else { + heap->span_cache[span_class_idx] = 0; + } + } + else { + //Step 6: All caches empty, map in new memory pages + span = _memory_map(size_class->page_count); + } + + //Mark span as owned by this heap and set base data + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + + span->size_class = (count_t)class_idx; + + //If we only have one block we will grab it, otherwise + //set span as new span to use for next allocation + if (size_class->block_count > 1) { + //Reset block order to sequential auto linked order + active_block->free_count = (uint16_t)(size_class->block_count - 1); + active_block->free_list = 1; + active_block->first_autolink = 1; + heap->active_span[class_idx] = span; + } + else { + span->data.block.free_count = 0; + span->data.block.first_autolink = (uint16_t)size_class->block_count; + } + + //Track counters + _memory_counter_increase(&heap->span_counter[span_class_idx], &_memory_max_allocation[span_class_idx]); + +#if ENABLE_STATISTICS + //Store the requested size for statistics + *(size_t*)pointer_offset(span, SPAN_HEADER_SIZE + class_size - sizeof(size_t)) = size; +#endif + + //Return first block if memory page span + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Allocate a large sized memory block from the given heap +static void* +_memory_allocate_large_from_heap(heap_t* heap, size_t size) { + //Calculate number of needed max sized spans (including header) + size += SPAN_HEADER_SIZE; + size_t num_spans = size / SPAN_MAX_SIZE; + if (size % SPAN_MAX_SIZE) + ++num_spans; + size_t idx = num_spans - 1; + + if (!idx) { + size_t span_class_idx = _span_class_from_page_count(SPAN_MAX_PAGE_COUNT); + span_t* span = heap->span_cache[span_class_idx]; + if (!span) { + _memory_deallocate_deferred(heap, 0); + span = heap->span_cache[span_class_idx]; + } + if (!span) { + //Step 5: No span available in the thread cache, try grab a list of spans from the global cache + span = _memory_global_cache_extract(SPAN_MAX_PAGE_COUNT); +#if ENABLE_STATISTICS + if (span) + heap->global_to_thread += (size_t)span->data.list_size * SPAN_MAX_PAGE_COUNT * PAGE_SIZE; +#endif + } + if (span) { + if (span->data.list_size > 1) { + //We got a list of spans, we will use first as active and store remainder in thread cache + span_t* next_span = span->next_span; + assert(next_span); + next_span->data.list_size = span->data.list_size - 1; + heap->span_cache[span_class_idx] = next_span; + } + else { + heap->span_cache[span_class_idx] = 0; + } + } + else { + //Step 6: All caches empty, map in new memory pages + span = _memory_map(SPAN_MAX_PAGE_COUNT); + } + + //Mark span as owned by this heap and set base data + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + + span->size_class = SIZE_CLASS_COUNT; + + //Track counters + _memory_counter_increase(&heap->span_counter[span_class_idx], &_memory_max_allocation[span_class_idx]); + + return pointer_offset(span, SPAN_HEADER_SIZE); + } + +use_cache: + //Step 1: Check if cache for this large size class (or the following, unless first class) has a span + while (!heap->large_cache[idx] && (idx < LARGE_CLASS_COUNT) && (idx < num_spans + 1)) + ++idx; + span_t* span = heap->large_cache[idx]; + if (span) { + //Happy path, use from cache + if 
(span->data.list_size > 1) { + span_t* new_head = span->next_span; + assert(new_head); + new_head->data.list_size = span->data.list_size - 1; + heap->large_cache[idx] = new_head; + } + else { + heap->large_cache[idx] = 0; + } + + span->size_class = SIZE_CLASS_COUNT + (count_t)idx; + + //Increase counter + _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx]); + + return pointer_offset(span, SPAN_HEADER_SIZE); + } + + //Restore index, we're back to smallest fitting span count + idx = num_spans - 1; + + //Step 2: Process deferred deallocation + if (_memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx)) + goto use_cache; + assert(!heap->large_cache[idx]); + + //Step 3: Extract a list of spans from global cache + span = _memory_global_cache_large_extract(num_spans); + if (span) { +#if ENABLE_STATISTICS + heap->global_to_thread += (size_t)span->data.list_size * num_spans * SPAN_MAX_SIZE; +#endif + //We got a list from global cache, store remainder in thread cache + if (span->data.list_size > 1) { + span_t* new_head = span->next_span; + assert(new_head); + new_head->prev_span = 0; + new_head->data.list_size = span->data.list_size - 1; + heap->large_cache[idx] = new_head; + } + } + else { + //Step 4: Map in more memory pages + span = _memory_map(num_spans * SPAN_MAX_PAGE_COUNT); + } + //Mark span as owned by this heap + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + + span->size_class = SIZE_CLASS_COUNT + (count_t)idx; + + //Increase counter + _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx]); + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Allocate a new heap +static heap_t* +_memory_allocate_heap(void) { + heap_t* heap; + heap_t* next_heap; + //Try getting an orphaned heap + atomic_thread_fence_acquire(); + do { + heap = atomic_load_ptr(&_memory_orphan_heaps); + if (!heap) + break; + next_heap = heap->next_orphan; + } + while (!atomic_cas_ptr(&_memory_orphan_heaps, next_heap, heap)); + + if (heap) { + heap->next_orphan = 0; + return heap; + } + + //Map in pages for a new heap + heap = _memory_map(2); + memset(heap, 0, sizeof(heap_t)); + + //Get a new heap ID + do { + heap->id = atomic_incr32(&_memory_heap_id); + if (_memory_heap_lookup(heap->id)) + heap->id = 0; + } + while (!heap->id); + + //Link in heap in heap ID map + size_t list_idx = heap->id % HEAP_ARRAY_SIZE; + do { + next_heap = atomic_load_ptr(&_memory_heaps[list_idx]); + heap->next_heap = next_heap; + } + while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); + + return heap; +} + +//! Add a span to a double linked list +static void +_memory_list_add(span_t** head, span_t* span) { + if (*head) { + (*head)->prev_span = span; + span->next_span = *head; + } + else { + span->next_span = 0; + } + *head = span; +} + +//! Remove a span from a double linked list +static void +_memory_list_remove(span_t** head, span_t* span) { + if (*head == span) { + *head = span->next_span; + } + else { + if (span->next_span) + span->next_span->prev_span = span->prev_span; + span->prev_span->next_span = span->next_span; + } +} + +//! 
Insert span into thread cache, releasing to global cache if overflow
+static void
+_memory_heap_cache_insert(heap_t* heap, span_t* span, size_t page_count) {
+#if MAX_SPAN_CACHE_DIVISOR == 0
+    (void)sizeof(heap);
+    _memory_global_cache_insert(span, 1, page_count);
+#else
+    size_t span_class_idx = _span_class_from_page_count(page_count);
+    span_t** cache = &heap->span_cache[span_class_idx];
+    span->next_span = *cache;
+    if (*cache)
+        span->data.list_size = (*cache)->data.list_size + 1;
+    else
+        span->data.list_size = 1;
+    *cache = span;
+#if MAX_SPAN_CACHE_DIVISOR > 1
+    //Check if cache exceeds limit
+    if ((span->data.list_size >= (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE)) &&
+            (span->data.list_size > heap->span_counter[span_class_idx].cache_limit)) {
+        //Release to global cache
+        count_t list_size = 1;
+        span_t* next = span->next_span;
+        span_t* last = span;
+        while (list_size < MIN_SPAN_CACHE_RELEASE) {
+            last = next;
+            next = next->next_span;
+            ++list_size;
+        }
+        next->data.list_size = span->data.list_size - list_size;
+        last->next_span = 0; //Terminate list
+        *cache = next;
+        _memory_global_cache_insert(span, list_size, page_count);
+#if ENABLE_STATISTICS
+        heap->thread_to_global += list_size * page_count * PAGE_SIZE;
+#endif
+    }
+#endif
+#endif
+}
+
+//! Deallocate the given small/medium memory block from the given heap
+static void
+_memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) {
+    //Check if span is the currently active span in order to operate
+    //on the correct bookkeeping data
+    const count_t class_idx = span->size_class;
+    size_class_t* size_class = _memory_size_class + class_idx;
+    int is_active = (heap->active_span[class_idx] == span);
+    span_block_t* block_data = is_active ?
+        heap->active_block + class_idx :
+        &span->data.block;
+
+#if ENABLE_STATISTICS
+    heap->allocated -= size_class->size;
+    heap->requested -= *(size_t*)pointer_offset(p, size_class->size - sizeof(size_t));
+#endif
+
+    //Check if the span will become completely free
+    if (block_data->free_count == ((count_t)size_class->block_count - 1)) {
+        //Track counters
+        size_t span_class_idx = _span_class_from_page_count(size_class->page_count);
+        assert(heap->span_counter[span_class_idx].current_allocations > 0);
+        --heap->span_counter[span_class_idx].current_allocations;
+
+        //If it was active, reset counter. Otherwise, if not active, remove from
+        //partial free list if we had a previous free block (guard for classes with only 1 block)
+        if (is_active)
+            block_data->free_count = 0;
+        else if (block_data->free_count > 0)
+            _memory_list_remove(&heap->size_cache[class_idx], span);
+
+        //Add to span cache
+        _memory_heap_cache_insert(heap, span, size_class->page_count);
+        return;
+    }
+
+    //Check if first free block for this span (previously fully allocated)
+    if (block_data->free_count == 0) {
+        //Add to free list and disable autolink
+        _memory_list_add(&heap->size_cache[class_idx], span);
+        block_data->first_autolink = (uint16_t)size_class->block_count;
+    }
+    ++block_data->free_count;
+    //Span is not yet completely free, so add block to the linked list of free blocks
+    void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
+    count_t block_offset = (count_t)pointer_diff(p, blocks_start);
+    count_t block_idx = block_offset / (count_t)size_class->size;
+    uint32_t* block = pointer_offset(blocks_start, block_idx * size_class->size);
+    *block = block_data->free_list;
+    block_data->free_list = (uint16_t)block_idx;
+}
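_memory_deallocate_to_heap above threads freed blocks into an intrusive free list: the first 16 bits of a freed block store the index of the next free block, so bookkeeping costs no memory beyond the blocks themselves (blocks past first_autolink are implicitly free in sequential order). A toy model of just the list mechanism, with made-up sizes:

#include <stdint.h>
#include <stdio.h>

enum { BLOCK_SIZE = 32, BLOCK_COUNT = 4, END_OF_LIST = 0xFFFF };

int main(void) {
    uint8_t span[BLOCK_SIZE * BLOCK_COUNT];
    uint16_t free_list = END_OF_LIST;
    for (uint16_t i = 0; i < BLOCK_COUNT; ++i) {        /* "free" every block */
        *(uint16_t*)(span + i * BLOCK_SIZE) = free_list; /* link to old head */
        free_list = i;
    }
    while (free_list != END_OF_LIST) {                  /* pop in LIFO order: 3, 2, 1, 0 */
        printf("allocating block %u\n", free_list);
        free_list = *(uint16_t*)(span + free_list * BLOCK_SIZE);
    }
    return 0;
}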
+
+//! Deallocate the given large memory block from the given heap
+static void
+_memory_deallocate_large_to_heap(heap_t* heap, span_t* span) {
+    //Check if aliased with 64KiB small/medium spans
+    if (span->size_class == SIZE_CLASS_COUNT) {
+        //Track counters
+        size_t span_class_idx = _span_class_from_page_count(SPAN_MAX_PAGE_COUNT);
+        --heap->span_counter[span_class_idx].current_allocations;
+        //Add to span cache
+        _memory_heap_cache_insert(heap, span, SPAN_MAX_PAGE_COUNT);
+        return;
+    }
+
+    //Decrease counter
+    size_t idx = span->size_class - SIZE_CLASS_COUNT;
+    span_counter_t* counter = heap->large_counter + idx;
+    assert(counter->current_allocations > 0);
+    --counter->current_allocations;
+
+#if MAX_SPAN_CACHE_DIVISOR == 0
+    _memory_global_cache_large_insert(span, 1, idx + 1);
+#else
+    //Insert into cache list
+    span_t** cache = heap->large_cache + idx;
+    span->next_span = *cache;
+    if (*cache)
+        span->data.list_size = (*cache)->data.list_size + 1;
+    else
+        span->data.list_size = 1;
+    *cache = span;
+#if MAX_SPAN_CACHE_DIVISOR > 1
+    //Check if cache exceeds limit
+    if ((span->data.list_size >= (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE)) &&
+            (span->data.list_size > counter->cache_limit)) {
+        //Release to global cache
+        count_t list_size = 1;
+        span_t* next = span->next_span;
+        span_t* last = span;
+        while (list_size < MIN_SPAN_CACHE_RELEASE) {
+            last = next;
+            next = next->next_span;
+            ++list_size;
+        }
+        assert(next->next_span);
+        next->data.list_size = span->data.list_size - list_size;
+        last->next_span = 0; //Terminate list
+        *cache = next;
+        _memory_global_cache_large_insert(span, list_size, idx + 1);
+#if ENABLE_STATISTICS
+        heap->thread_to_global += list_size * (idx + 1) * SPAN_MAX_SIZE;
+#endif
+    }
+#endif
+#endif
+}
+
+//! Process pending deferred cross-thread deallocations
+static int
+_memory_deallocate_deferred(heap_t* heap, size_t size_class) {
+    //Grab the current list of deferred deallocations
+    atomic_thread_fence_acquire();
+    void* p = atomic_load_ptr(&heap->defer_deallocate);
+    if (!p)
+        return 0;
+    if (!atomic_cas_ptr(&heap->defer_deallocate, 0, p))
+        return 0;
+    //Keep track of whether we deallocate in the given size class
+    int got_class = 0;
+    do {
+        void* next = *(void**)p;
+        //Get span and check which type of block
+        span_t* span = (void*)((uintptr_t)p & SPAN_MASK);
+        if (span->size_class < SIZE_CLASS_COUNT) {
+            //Small/medium block
+            got_class |= (span->size_class == size_class);
+            _memory_deallocate_to_heap(heap, span, p);
+        }
+        else {
+            //Large block
+            got_class |= ((span->size_class >= size_class) && (span->size_class <= (size_class + 2)));
+            _memory_deallocate_large_to_heap(heap, span);
+        }
+        //Loop until all pending operations in the list are processed
+        p = next;
+    } while (p);
+    return got_class;
+}
+
+//! Defer deallocation of the given block to the given heap
+static void
+_memory_deallocate_defer(int32_t heap_id, void* p) {
+    //Get the heap and link in pointer in list of deferred operations
+    heap_t* heap = _memory_heap_lookup(heap_id);
+    void* last_ptr;
+    do {
+        last_ptr = atomic_load_ptr(&heap->defer_deallocate);
+        *(void**)p = last_ptr; //Safe to use block, it's being deallocated
+    } while (!atomic_cas_ptr(&heap->defer_deallocate, p, last_ptr));
+}
+
+//!
Allocate a block of the given size +static void* +_memory_allocate(size_t size) { + if (size <= MEDIUM_SIZE_LIMIT) + return _memory_allocate_from_heap(_memory_thread_heap, size); + else if (size <= LARGE_SIZE_LIMIT) + return _memory_allocate_large_from_heap(_memory_thread_heap, size); + + //Oversized, allocate pages directly + size += SPAN_HEADER_SIZE; + size_t num_pages = size / PAGE_SIZE; + if (size % PAGE_SIZE) + ++num_pages; + span_t* span = _memory_map(num_pages); + atomic_store32(&span->heap_id, 0); + //Store page count in next_span + span->next_span = (span_t*)((uintptr_t)num_pages); + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Deallocate the given block +static void +_memory_deallocate(void* p) { + if (!p) + return; + + //Grab the span (always at start of span, using 64KiB alignment) + span_t* span = (void*)((uintptr_t)p & SPAN_MASK); + int32_t heap_id = atomic_load32(&span->heap_id); + heap_t* heap = _memory_thread_heap; + //Check if block belongs to this heap or if deallocation should be deferred + if (heap_id == heap->id) { + if (span->size_class < SIZE_CLASS_COUNT) + _memory_deallocate_to_heap(heap, span, p); + else + _memory_deallocate_large_to_heap(heap, span); + } + else if (heap_id > 0) { + _memory_deallocate_defer(heap_id, p); + } + else { + //Oversized allocation, page count is stored in next_span + size_t num_pages = (size_t)span->next_span; + _memory_unmap(span, num_pages); + } +} + +//! Reallocate the given block to the given size +static void* +_memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { + if (p) { + //Grab the span (always at start of span, using 64KiB alignment) + span_t* span = (void*)((uintptr_t)p & SPAN_MASK); + int32_t heap_id = atomic_load32(&span->heap_id); + if (heap_id) { + if (span->size_class < SIZE_CLASS_COUNT) { + //Small/medium sized block + size_class_t* size_class = _memory_size_class + span->size_class; + if ((size_t)size_class->size >= size) + return p; //Still fits in block, never mind trying to save memory + if (!oldsize) + oldsize = size_class->size; + } + else { + //Large block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_spans = total_size / SPAN_MAX_SIZE; + if (total_size % SPAN_MAX_SIZE) + ++num_spans; + size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; + if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) + return p; //Still fits and less than half of memory would be freed + if (!oldsize) + oldsize = (current_spans * (size_t)SPAN_MAX_SIZE) - SPAN_HEADER_SIZE; + } + } + else { + //Oversized block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_pages = total_size / PAGE_SIZE; + if (total_size % PAGE_SIZE) + ++num_pages; + //Page count is stored in next_span + size_t current_pages = (size_t)span->next_span; + if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) + return p; //Still fits and less than half of memory would be freed + if (!oldsize) + oldsize = (current_pages * (size_t)PAGE_SIZE) - SPAN_HEADER_SIZE; + } + } + + //Size is greater than block size, need to allocate a new block and deallocate the old + //Avoid hysteresis by overallocating if increase is small (below 37%) + size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); + void* block = _memory_allocate(size > lower_bound ? size : lower_bound); + if (p) { + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, p, oldsize < size ? oldsize : size); + _memory_deallocate(p); + } + + return block; +} + +//! 
Get the usable size of the given block
+static size_t
+_memory_usable_size(void* p) {
+    //Grab the span (always at start of span, using 64KiB alignment)
+    span_t* span = (void*)((uintptr_t)p & SPAN_MASK);
+    int32_t heap_id = atomic_load32(&span->heap_id);
+    if (heap_id) {
+        if (span->size_class < SIZE_CLASS_COUNT) {
+            //Small/medium block
+            size_class_t* size_class = _memory_size_class + span->size_class;
+            return size_class->size;
+        }
+
+        //Large block
+        size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1;
+        return (current_spans * (size_t)SPAN_MAX_SIZE) - SPAN_HEADER_SIZE;
+    }
+
+    //Oversized block, page count is stored in next_span
+    size_t current_pages = (size_t)span->next_span;
+    return (current_pages * (size_t)PAGE_SIZE) - SPAN_HEADER_SIZE;
+}
+
+//! Adjust and optimize the size class properties for the given class
+static void
+_memory_adjust_size_class(size_t iclass) {
+    //Calculate how many pages are needed for 255 blocks
+    size_t block_size = _memory_size_class[iclass].size;
+    size_t page_count = (block_size * 255) / PAGE_SIZE;
+    //Cap to 16 pages (64KiB span granularity)
+    page_count = (page_count == 0) ? 1 : ((page_count > 16) ? 16 : page_count);
+    //Merge page counts to span size class granularity
+    page_count = ((page_count + (SPAN_CLASS_GRANULARITY - 1)) / SPAN_CLASS_GRANULARITY) * SPAN_CLASS_GRANULARITY;
+    if (page_count > 16)
+        page_count = 16;
+    size_t block_count = ((page_count * PAGE_SIZE) - SPAN_HEADER_SIZE) / block_size;
+    //Store the final configuration
+    _memory_size_class[iclass].page_count = (uint16_t)page_count;
+    _memory_size_class[iclass].block_count = (uint16_t)block_count;
+    _memory_size_class[iclass].class_idx = (uint16_t)iclass;
+
+    //Check if previous size classes can be merged
+    size_t prevclass = iclass;
+    while (prevclass > 0) {
+        --prevclass;
+        //A class can be merged if number of pages and number of blocks are equal
+        if ((_memory_size_class[prevclass].page_count == _memory_size_class[iclass].page_count) &&
+                (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count)) {
+            memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass]));
+        }
+        else {
+            break;
+        }
+    }
+}
+
+#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 )
+# include <windows.h>
+#else
+# include <sys/mman.h>
+# include <sched.h>
+# include <errno.h>
+# ifndef MAP_UNINITIALIZED
+# define MAP_UNINITIALIZED 0
+# endif
+#endif
+
+//!
Initialize the allocator and setup global data +int +rpmalloc_initialize(void) { +#ifdef PLATFORM_WINDOWS + SYSTEM_INFO system_info; + memset(&system_info, 0, sizeof(system_info)); + GetSystemInfo(&system_info); + if (system_info.dwAllocationGranularity < SPAN_ADDRESS_GRANULARITY) + return -1; +#else +#if ARCH_64BIT + atomic_store64(&_memory_addr, 0x1000000000ULL); +#else + atomic_store64(&_memory_addr, 0x1000000ULL); +#endif +#endif + + atomic_store32(&_memory_heap_id, 0); + + //Setup all small and medium size classes + size_t iclass; + for (iclass = 0; iclass < SMALL_CLASS_COUNT; ++iclass) { + size_t size = (iclass + 1) * SMALL_GRANULARITY; + _memory_size_class[iclass].size = (uint16_t)size; + _memory_adjust_size_class(iclass); + } + for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { + size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); + if (size > MEDIUM_SIZE_LIMIT) + size = MEDIUM_SIZE_LIMIT; + _memory_size_class[SMALL_CLASS_COUNT + iclass].size = (uint16_t)size; + _memory_adjust_size_class(SMALL_CLASS_COUNT + iclass); + } + + //Initialize this thread + rpmalloc_thread_initialize(); + return 0; +} + +//! Finalize the allocator +void +rpmalloc_finalize(void) { + atomic_thread_fence_acquire(); + + //Free all thread caches + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap) { + _memory_deallocate_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { + const size_t page_count = (iclass + 1) * SPAN_CLASS_GRANULARITY; + span_t* span = heap->span_cache[iclass]; + unsigned int span_count = span ? span->data.list_size : 0; + for (unsigned int ispan = 0; ispan < span_count; ++ispan) { + span_t* next_span = span->next_span; + _memory_unmap(span, page_count); + span = next_span; + } + } + + //Free large spans + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + const size_t span_count = iclass + 1; + span_t* span = heap->large_cache[iclass]; + while (span) { + span_t* next_span = span->next_span; + _memory_unmap(span, span_count * SPAN_MAX_PAGE_COUNT); + span = next_span; + } + } + + heap_t* next_heap = heap->next_heap; + _memory_unmap(heap, 2); + heap = next_heap; + } + + atomic_store_ptr(&_memory_heaps[list_idx], 0); + } + atomic_store_ptr(&_memory_orphan_heaps, 0); + + //Free global caches + for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { + void* span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); + size_t cache_count = (uintptr_t)span_ptr & ~SPAN_MASK; + span_t* span = (span_t*)((void*)((uintptr_t)span_ptr & SPAN_MASK)); + while (cache_count) { + span_t* skip_span = span->prev_span; + unsigned int span_count = span->data.list_size; + for (unsigned int ispan = 0; ispan < span_count; ++ispan) { + span_t* next_span = span->next_span; + _memory_unmap(span, (iclass + 1) * SPAN_CLASS_GRANULARITY); + span = next_span; + } + span = skip_span; + cache_count -= span_count; + } + atomic_store_ptr(&_memory_span_cache[iclass], 0); + } + + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + void* span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); + size_t cache_count = (uintptr_t)span_ptr & ~SPAN_MASK; + span_t* span = (span_t*)((void*)((uintptr_t)span_ptr & SPAN_MASK)); + while (cache_count) { + span_t* skip_span = span->prev_span; + unsigned int span_count = span->data.list_size; + for (unsigned int ispan = 0; ispan < span_count; ++ispan) { + span_t* next_span = span->next_span; + _memory_unmap(span, 
(iclass + 1) * SPAN_MAX_PAGE_COUNT); + span = next_span; + } + span = skip_span; + cache_count -= span_count; + } + atomic_store_ptr(&_memory_large_cache[iclass], 0); + } + + atomic_thread_fence_release(); +} + +//! Initialize thread, assign heap +void +rpmalloc_thread_initialize(void) { + if (!_memory_thread_heap) { + heap_t* heap = _memory_allocate_heap(); +#if ENABLE_STATISTICS + heap->thread_to_global = 0; + heap->global_to_thread = 0; +#endif + _memory_thread_heap = heap; + atomic_incr32(&_memory_active_heaps); + } +} + +//! Finalize thread, orphan heap +void +rpmalloc_thread_finalize(void) { + heap_t* heap = _memory_thread_heap; + if (!heap) + return; + + atomic_add32(&_memory_active_heaps, -1); + + _memory_deallocate_deferred(heap, 0); + + //Release thread cache spans back to global cache + for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { + const size_t page_count = (iclass + 1) * SPAN_CLASS_GRANULARITY; + span_t* span = heap->span_cache[iclass]; + while (span) { + if (span->data.list_size > MIN_SPAN_CACHE_RELEASE) { + count_t list_size = 1; + span_t* next = span->next_span; + span_t* last = span; + while (list_size < MIN_SPAN_CACHE_RELEASE) { + last = next; + next = next->next_span; + ++list_size; + } + last->next_span = 0; //Terminate list + next->data.list_size = span->data.list_size - list_size; + _memory_global_cache_insert(span, list_size, page_count); + span = next; + } + else { + _memory_global_cache_insert(span, span->data.list_size, page_count); + span = 0; + } + } + heap->span_cache[iclass] = 0; + } + + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + const size_t span_count = iclass + 1; + span_t* span = heap->large_cache[iclass]; + while (span) { + if (span->data.list_size > MIN_SPAN_CACHE_RELEASE) { + count_t list_size = 1; + span_t* next = span->next_span; + span_t* last = span; + while (list_size < MIN_SPAN_CACHE_RELEASE) { + last = next; + next = next->next_span; + ++list_size; + } + last->next_span = 0; //Terminate list + next->data.list_size = span->data.list_size - list_size; + _memory_global_cache_large_insert(span, list_size, span_count); + span = next; + } + else { + _memory_global_cache_large_insert(span, span->data.list_size, span_count); + span = 0; + } + } + heap->large_cache[iclass] = 0; + } + + //Reset allocation counters + memset(heap->span_counter, 0, sizeof(heap->span_counter)); + memset(heap->large_counter, 0, sizeof(heap->large_counter)); +#if ENABLE_STATISTICS + heap->requested = 0; + heap->allocated = 0; + heap->thread_to_global = 0; + heap->global_to_thread = 0; +#endif + + //Orphan the heap + heap_t* last_heap; + do { + last_heap = atomic_load_ptr(&_memory_orphan_heaps); + heap->next_orphan = last_heap; + } + while (!atomic_cas_ptr(&_memory_orphan_heaps, heap, last_heap)); + + _memory_thread_heap = 0; +} + +int +rpmalloc_is_thread_initialized(void) { + return (_memory_thread_heap != 0) ? 1 : 0; +} + +//! 
Map new pages to virtual memory +static void* +_memory_map(size_t page_count) { + size_t total_size = page_count * PAGE_SIZE; + void* pages_ptr = 0; + +#if ENABLE_STATISTICS + atomic_add32(&_mapped_pages, (int32_t)page_count); + atomic_add32(&_mapped_total, (int32_t)page_count); +#endif + +#ifdef PLATFORM_WINDOWS + pages_ptr = VirtualAlloc(0, total_size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); +#else + //mmap lacks a way to set 64KiB address granularity, implement it locally + intptr_t incr = (intptr_t)total_size / (intptr_t)SPAN_ADDRESS_GRANULARITY; + if (total_size % SPAN_ADDRESS_GRANULARITY) + ++incr; + do { + void* base_addr = (void*)(uintptr_t)atomic_exchange_and_add64(&_memory_addr, + (incr * (intptr_t)SPAN_ADDRESS_GRANULARITY)); + pages_ptr = mmap(base_addr, total_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0); + if (pages_ptr != MAP_FAILED) { + if (pages_ptr != base_addr) { + void* new_base = (void*)((uintptr_t)pages_ptr & SPAN_MASK); + atomic_store64(&_memory_addr, (int64_t)((uintptr_t)new_base) + + ((incr + 1) * (intptr_t)SPAN_ADDRESS_GRANULARITY)); + atomic_thread_fence_release(); + } + if (!((uintptr_t)pages_ptr & ~SPAN_MASK)) + break; + munmap(pages_ptr, total_size); + } + } + while (1); +#endif + + return pages_ptr; +} + +//! Unmap pages from virtual memory +static void +_memory_unmap(void* ptr, size_t page_count) { +#if ENABLE_STATISTICS + atomic_add32(&_mapped_pages, -(int32_t)page_count); + atomic_add32(&_unmapped_total, (int32_t)page_count); +#endif + +#ifdef PLATFORM_WINDOWS + VirtualFree(ptr, 0, MEM_RELEASE); +#else + munmap(ptr, PAGE_SIZE * page_count); +#endif +} + +static FORCEINLINE int +atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { +#ifdef _MSC_VER +# if ARCH_64BIT + return (_InterlockedCompareExchange64((volatile long long*)&dst->nonatomic, + (long long)val, (long long)ref) == (long long)ref) ? 1 : 0; +# else + return (_InterlockedCompareExchange((volatile long*)&dst->nonatomic, + (long)val, (long)ref) == (long)ref) ? 1 : 0; +# endif +#else + return __sync_bool_compare_and_swap(&dst->nonatomic, ref, val); +#endif +} + +//! 
Yield the thread remaining timeslice +static void +thread_yield(void) { +#ifdef PLATFORM_WINDOWS + YieldProcessor(); +#else + sched_yield(); +#endif +} + +// Extern interface + +void* +rpmalloc(size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return _memory_allocate(size); +} + +void +rpfree(void* ptr) { + _memory_deallocate(ptr); +} + +void* +rpcalloc(size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#ifdef PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + void* ptr = _memory_allocate(total); + memset(ptr, 0, total); + return ptr; +} + +void* +rprealloc(void* ptr, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + return _memory_reallocate(ptr, size, 0, 0); +} + +void* +rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if (size + alignment < size) { + errno = EINVAL; + return 0; + } +#endif + //TODO: If alignment > 16, we need to copy to new aligned position + (void)sizeof(alignment); + return _memory_reallocate(ptr, size, oldsize, flags); +} + +void* +rpaligned_alloc(size_t alignment, size_t size) { + if (alignment <= 16) + return rpmalloc(size); + +#if ENABLE_VALIDATE_ARGS + if (size + alignment < size) { + errno = EINVAL; + return 0; + } +#endif + + void* ptr = rpmalloc(size + alignment); + if ((uintptr_t)ptr & (alignment - 1)) + ptr = (void*)(((uintptr_t)ptr & ~((uintptr_t)alignment - 1)) + alignment); + return ptr; +} + +void* +rpmemalign(size_t alignment, size_t size) { + return rpaligned_alloc(alignment, size); +} + +int +rpposix_memalign(void **memptr, size_t alignment, size_t size) { + if (memptr) + *memptr = rpaligned_alloc(alignment, size); + else + return EINVAL; + return *memptr ? 0 : ENOMEM; +} + +size_t +rpmalloc_usable_size(void* ptr) { + return ptr ? 
_memory_usable_size(ptr) : 0; +} + +void +rpmalloc_thread_collect(void) { + _memory_deallocate_deferred(_memory_thread_heap, 0); +} + +void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { + memset(stats, 0, sizeof(rpmalloc_thread_statistics_t)); + heap_t* heap = _memory_thread_heap; +#if ENABLE_STATISTICS + stats->allocated = heap->allocated; + stats->requested = heap->requested; +#endif + void* p = atomic_load_ptr(&heap->defer_deallocate); + while (p) { + void* next = *(void**)p; + span_t* span = (void*)((uintptr_t)p & SPAN_MASK); + stats->deferred += _memory_size_class[span->size_class].size; + p = next; + } + + for (size_t isize = 0; isize < SIZE_CLASS_COUNT; ++isize) { + if (heap->active_block[isize].free_count) + stats->active += heap->active_block[isize].free_count * _memory_size_class[heap->active_span[isize]->size_class].size; + + span_t* cache = heap->size_cache[isize]; + while (cache) { + stats->sizecache = cache->data.block.free_count * _memory_size_class[cache->size_class].size; + cache = cache->next_span; + } + } + + for (size_t isize = 0; isize < SPAN_CLASS_COUNT; ++isize) { + if (heap->span_cache[isize]) + stats->spancache = (size_t)heap->span_cache[isize]->data.list_size * (isize + 1) * SPAN_CLASS_GRANULARITY * PAGE_SIZE; + } +} + +void +rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { + memset(stats, 0, sizeof(rpmalloc_global_statistics_t)); +#if ENABLE_STATISTICS + stats->mapped = (size_t)atomic_load32(&_mapped_pages) * PAGE_SIZE; + stats->mapped_total = (size_t)atomic_load32(&_mapped_total) * PAGE_SIZE; + stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * PAGE_SIZE; +#endif + for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { + void* global_span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); + while (global_span_ptr == SPAN_LIST_LOCK_TOKEN) { + thread_yield(); + global_span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); + } + uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~SPAN_MASK; + size_t list_bytes = global_span_count * (iclass + 1) * SPAN_CLASS_GRANULARITY * PAGE_SIZE; + stats->cached += list_bytes; + } + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + void* global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); + while (global_span_ptr == SPAN_LIST_LOCK_TOKEN) { + thread_yield(); + global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); + } + uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~SPAN_MASK; + size_t list_bytes = global_span_count * (iclass + 1) * SPAN_MAX_PAGE_COUNT * PAGE_SIZE; + stats->cached_large += list_bytes; + } +} diff --git a/client/tracy_rpmalloc.hpp b/client/tracy_rpmalloc.hpp new file mode 100644 index 00000000..948b8727 --- /dev/null +++ b/client/tracy_rpmalloc.hpp @@ -0,0 +1,119 @@ +/* rpmalloc.h - Memory allocator - Public Domain - 2016 Mattias Jansson / Rampant Pixels + * + * This library provides a cross-platform lock free thread caching malloc implementation in C11. + * The latest source code is always available at + * + * https://github.com/rampantpixels/rpmalloc + * + * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. 
+ *
+ */
+
+#pragma once
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+# define RPMALLOC_ATTRIBUTE __attribute__((__malloc__))
+# define RPMALLOC_CALL
+#elif defined(_MSC_VER)
+# define RPMALLOC_ATTRIBUTE
+# define RPMALLOC_CALL __declspec(restrict)
+#else
+# define RPMALLOC_ATTRIBUTE
+# define RPMALLOC_CALL
+#endif
+
+//! Flag to rpaligned_realloc to not preserve content in reallocation
+#define RPMALLOC_NO_PRESERVE 1
+
+typedef struct rpmalloc_global_statistics_t {
+    //! Current amount of virtual memory mapped (only if ENABLE_STATISTICS=1)
+    size_t mapped;
+    //! Current amount of memory in global caches for small and medium sizes (<64KiB)
+    size_t cached;
+    //! Current amount of memory in global caches for large sizes (>=64KiB)
+    size_t cached_large;
+    //! Total amount of memory mapped (only if ENABLE_STATISTICS=1)
+    size_t mapped_total;
+    //! Total amount of memory unmapped (only if ENABLE_STATISTICS=1)
+    size_t unmapped_total;
+} rpmalloc_global_statistics_t;
+
+typedef struct rpmalloc_thread_statistics_t {
+    //! Amount of memory currently requested in allocations (only if ENABLE_STATISTICS=1)
+    size_t requested;
+    //! Amount of memory actually allocated in memory blocks (only if ENABLE_STATISTICS=1)
+    size_t allocated;
+    //! Current number of bytes available for allocation from active spans
+    size_t active;
+    //! Current number of bytes available in thread size class caches
+    size_t sizecache;
+    //! Current number of bytes available in thread span caches
+    size_t spancache;
+    //! Current number of bytes in pending deferred deallocations
+    size_t deferred;
+    //! Total number of bytes transitioned from thread cache to global cache
+    size_t thread_to_global;
+    //! Total number of bytes transitioned from global cache to thread cache
+    size_t global_to_thread;
+} rpmalloc_thread_statistics_t;
+
+extern int
+rpmalloc_initialize(void);
+
+extern void
+rpmalloc_finalize(void);
+
+extern void
+rpmalloc_thread_initialize(void);
+
+extern void
+rpmalloc_thread_finalize(void);
+
+extern void
+rpmalloc_thread_collect(void);
+
+extern int
+rpmalloc_is_thread_initialized(void);
+
+extern void
+rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats);
+
+extern void
+rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats);
+
+extern RPMALLOC_CALL void*
+rpmalloc(size_t size) RPMALLOC_ATTRIBUTE;
+
+extern void
+rpfree(void* ptr);
+
+extern RPMALLOC_CALL void*
+rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIBUTE;
+
+extern void*
+rprealloc(void* ptr, size_t size);
+
+extern void*
+rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags);
+
+extern RPMALLOC_CALL void*
+rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIBUTE;
+
+extern RPMALLOC_CALL void*
+rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIBUTE;
+
+extern int
+rpposix_memalign(void **memptr, size_t alignment, size_t size);
+
+extern size_t
+rpmalloc_usable_size(void* ptr);
+
+#ifdef __cplusplus
+}
+#endif
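A minimal usage sketch for the API this patch adds. The explicit thread hooks are the one difference from plain malloc: every thread must call rpmalloc_thread_initialize before its first allocation, which rpmalloc_initialize already does for the calling thread:

#include <stdio.h>
#include "tracy_rpmalloc.hpp"

int main(void) {
    if (rpmalloc_initialize() != 0)
        return 1;                 /* also initializes this thread's heap */
    void* p = rpmalloc(100);
    /* usable size is rounded up to the block's size class, so >= 100 */
    printf("usable: %zu bytes\n", rpmalloc_usable_size(p));
    rpfree(p);
    rpmalloc_thread_finalize();
    rpmalloc_finalize();
    return 0;
}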