diff --git a/client/tracy_rpmalloc.cpp b/client/tracy_rpmalloc.cpp index 83ea6e68..cb9d75be 100644 --- a/client/tracy_rpmalloc.cpp +++ b/client/tracy_rpmalloc.cpp @@ -13,67 +13,80 @@ #include "tracy_rpmalloc.hpp" -// Build time configurable limits - -// Presets, if none is defined it will default to performance priority -//#define ENABLE_UNLIMITED_CACHE -//#define DISABLE_CACHE -//#define ENABLE_SPACE_PRIORITY_CACHE - -// Presets for cache limits -#if defined(ENABLE_UNLIMITED_CACHE) -// Unlimited caches -#define MIN_SPAN_CACHE_RELEASE 16 -#define MAX_SPAN_CACHE_DIVISOR 1 -#elif defined(DISABLE_CACHE) -//Disable cache -#define MIN_SPAN_CACHE_RELEASE 1 -#define MAX_SPAN_CACHE_DIVISOR 0 -#elif defined(ENABLE_SPACE_PRIORITY_CACHE) -// Space priority cache limits -#define MIN_SPAN_CACHE_SIZE 8 -#define MIN_SPAN_CACHE_RELEASE 8 -#define MAX_SPAN_CACHE_DIVISOR 16 -#define GLOBAL_SPAN_CACHE_MULTIPLIER 1 -#else -// Default - performance priority cache limits -//! Limit of thread cache in number of spans for each page count class (undefine for unlimited cache - i.e never release spans to global cache unless thread finishes) -//! Minimum cache size to remain after a release to global cache -#define MIN_SPAN_CACHE_SIZE 8 -//! Minimum number of spans to transfer between thread and global cache -#define MIN_SPAN_CACHE_RELEASE 16 -//! Maximum cache size divisor (max cache size will be max allocation count divided by this divisor) -#define MAX_SPAN_CACHE_DIVISOR 8 -//! Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this) -#define GLOBAL_SPAN_CACHE_MULTIPLIER 4 -#endif - +/// Build time configurable limits +#ifndef HEAP_ARRAY_SIZE //! Size of heap hashmap #define HEAP_ARRAY_SIZE 79 - +#endif +#ifndef ENABLE_THREAD_CACHE +//! Enable per-thread cache +#define ENABLE_THREAD_CACHE 1 +#endif +#ifndef ENABLE_GLOBAL_CACHE +//! Enable global cache shared between all threads, requires thread cache +#define ENABLE_GLOBAL_CACHE 1 +#endif #ifndef ENABLE_VALIDATE_ARGS //! Enable validation of args to public entry points #define ENABLE_VALIDATE_ARGS 0 #endif - #ifndef ENABLE_STATISTICS //! Enable statistics collection #define ENABLE_STATISTICS 0 #endif - #ifndef ENABLE_ASSERTS //! Enable asserts #define ENABLE_ASSERTS 0 #endif +#ifndef ENABLE_PRELOAD +//! Support preloading +#define ENABLE_PRELOAD 0 +#endif +#ifndef ENABLE_GUARDS +//! Enable overwrite/underwrite guards +#define ENABLE_GUARDS 0 +#endif +#ifndef ENABLE_UNLIMITED_CACHE +//! Unlimited cache disables any cache limitations +#define ENABLE_UNLIMITED_CACHE 0 +#endif +#ifndef DEFAULT_SPAN_MAP_COUNT +//! Default number of spans to map in call to map more virtual memory +#define DEFAULT_SPAN_MAP_COUNT 16 +#endif +//! Minimum cache size to remain after a release to global cache +#define MIN_SPAN_CACHE_SIZE 64 +//! Minimum number of spans to transfer between thread and global cache +#define MIN_SPAN_CACHE_RELEASE 16 +//! Maximum cache size divisor (max cache size will be max allocation count divided by this divisor) +#define MAX_SPAN_CACHE_DIVISOR 4 +//! Minimum cache size to remain after a release to global cache, large spans +#define MIN_LARGE_SPAN_CACHE_SIZE 8 +//! Minimum number of spans to transfer between thread and global cache, large spans +#define MIN_LARGE_SPAN_CACHE_RELEASE 4 +//! Maximum cache size divisor, large spans (max cache size will be max allocation count divided by this divisor) +#define MAX_LARGE_SPAN_CACHE_DIVISOR 16 +//! 
Multiplier for global span cache limit (max cache size will be calculated like thread cache and multiplied with this)
+#define MAX_GLOBAL_CACHE_MULTIPLIER 8
 
-// Platform and arch specifics
+#if !ENABLE_THREAD_CACHE
+#  undef ENABLE_GLOBAL_CACHE
+#  define ENABLE_GLOBAL_CACHE 0
+#  undef MIN_SPAN_CACHE_SIZE
+#  undef MIN_SPAN_CACHE_RELEASE
+#  undef MAX_SPAN_CACHE_DIVISOR
+#  undef MIN_LARGE_SPAN_CACHE_SIZE
+#  undef MIN_LARGE_SPAN_CACHE_RELEASE
+#  undef MAX_LARGE_SPAN_CACHE_DIVISOR
+#endif
+#if !ENABLE_GLOBAL_CACHE
+#  undef MAX_GLOBAL_CACHE_MULTIPLIER
+#endif
 
+/// Platform and arch specifics
 #ifdef _MSC_VER
 #  define ALIGNED_STRUCT(name, alignment) __declspec(align(alignment)) struct name
 #  define FORCEINLINE __forceinline
-#  define TLS_MODEL
-#  define _Static_assert static_assert
-#  define _Thread_local __declspec(thread)
 #  define atomic_thread_fence_acquire() //_ReadWriteBarrier()
 #  define atomic_thread_fence_release() //_ReadWriteBarrier()
 #  if ENABLE_VALIDATE_ARGS
@@ -81,15 +94,15 @@
 #  endif
 #  include <intrin.h>
 #else
+#  include <unistd.h>
+#  if defined(__APPLE__) && ENABLE_PRELOAD
+#    include <pthread.h>
+#  endif
 #  define ALIGNED_STRUCT(name, alignment) struct __attribute__((__aligned__(alignment))) name
 #  define FORCEINLINE inline __attribute__((__always_inline__))
-#  define TLS_MODEL __attribute__((tls_model("initial-exec")))
-#  if !defined(__clang__) && defined(__GNUC__)
-#    define _Thread_local __thread
-#  endif
 #  ifdef __arm__
-#    define atomic_thread_fence_acquire() __asm volatile("dmb sy" ::: "memory")
-#    define atomic_thread_fence_release() __asm volatile("dmb st" ::: "memory")
+#    define atomic_thread_fence_acquire() __asm volatile("dmb ish" ::: "memory")
+#    define atomic_thread_fence_release() __asm volatile("dmb ishst" ::: "memory")
 #  else
 #    define atomic_thread_fence_acquire() //__asm volatile("" ::: "memory")
 #    define atomic_thread_fence_release() //__asm volatile("" ::: "memory")
@@ -104,7 +117,9 @@
 #if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 )
 #  define PLATFORM_WINDOWS 1
+#  define PLATFORM_POSIX 0
 #else
+#  define PLATFORM_WINDOWS 0
 #  define PLATFORM_POSIX 1
 #endif
@@ -113,22 +128,26 @@
 #include <string.h>
 
+#if ENABLE_GUARDS
+#  define MAGIC_GUARD 0xDEADBAAD
+#endif
+
 namespace tracy {
 
-// Atomic access abstraction
+/// Atomic access abstraction
 ALIGNED_STRUCT(atomic32_t, 4) {
-	int32_t nonatomic;
+	volatile int32_t nonatomic;
 };
 typedef struct atomic32_t atomic32_t;
 
 ALIGNED_STRUCT(atomic64_t, 8) {
-	int64_t nonatomic;
+	volatile int64_t nonatomic;
 };
 typedef struct atomic64_t atomic64_t;
 
 ALIGNED_STRUCT(atomicptr_t, 8) {
-	void* nonatomic;
+	volatile void* nonatomic;
 };
 typedef struct atomicptr_t atomicptr_t;
 
@@ -142,20 +161,6 @@ atomic_store32(atomic32_t* dst, int32_t val) {
 	dst->nonatomic = val;
 }
 
-#if PLATFORM_POSIX
-
-static FORCEINLINE void
-atomic_store64(atomic64_t* dst, int64_t val) {
-	dst->nonatomic = val;
-}
-
-static FORCEINLINE int64_t
-atomic_exchange_and_add64(atomic64_t* dst, int64_t add) {
-	return __sync_fetch_and_add(&dst->nonatomic, add);
-}
-
-#endif
-
 static FORCEINLINE int32_t
 atomic_incr32(atomic32_t* val) {
 #ifdef _MSC_VER
@@ -178,7 +183,7 @@ atomic_add32(atomic32_t* val, int32_t add) {
 
 static FORCEINLINE void*
 atomic_load_ptr(atomicptr_t* src) {
-	return src->nonatomic;
+	return (void*)((uintptr_t)src->nonatomic);
 }
 
 static FORCEINLINE void
@@ -187,65 +192,49 @@ atomic_store_ptr(atomicptr_t* dst, void* val) {
 }
 
 static FORCEINLINE int
-atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref);
-
-static void
-thread_yield(void);
-
-// Preconfigured limits and sizes
-
-//! 
Memory page size -#define RP_PAGE_SIZE 4096 - -//! Granularity of all memory page spans for small & medium block allocations -#define SPAN_ADDRESS_GRANULARITY 65536 -//! Maximum size of a span of memory pages -#define SPAN_MAX_SIZE (SPAN_ADDRESS_GRANULARITY) -//! Mask for getting the start of a span of memory pages -#define SPAN_MASK (~((uintptr_t)SPAN_MAX_SIZE - 1)) -//! Maximum number of memory pages in a span -#define SPAN_MAX_PAGE_COUNT (SPAN_MAX_SIZE / RP_PAGE_SIZE) -//! Span size class granularity -#define SPAN_CLASS_GRANULARITY 4 -//! Number of size classes for spans -#define SPAN_CLASS_COUNT (SPAN_MAX_PAGE_COUNT / SPAN_CLASS_GRANULARITY) +atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { +#ifdef _MSC_VER +# if ARCH_64BIT + return (_InterlockedCompareExchange64((volatile long long*)&dst->nonatomic, + (long long)val, (long long)ref) == (long long)ref) ? 1 : 0; +# else + return (_InterlockedCompareExchange((volatile long*)&dst->nonatomic, + (long)val, (long)ref) == (long)ref) ? 1 : 0; +# endif +#else + return __sync_bool_compare_and_swap(&dst->nonatomic, ref, val); +#endif +} +/// Preconfigured limits and sizes //! Granularity of a small allocation block -#define SMALL_GRANULARITY 16 +#define SMALL_GRANULARITY 32 //! Small granularity shift count -#define SMALL_GRANULARITY_SHIFT 4 +#define SMALL_GRANULARITY_SHIFT 5 //! Number of small block size classes -#define SMALL_CLASS_COUNT (((RP_PAGE_SIZE - SPAN_HEADER_SIZE) >> 1) >> SMALL_GRANULARITY_SHIFT) +#define SMALL_CLASS_COUNT 63 //! Maximum size of a small block -#define SMALL_SIZE_LIMIT (SMALL_CLASS_COUNT * SMALL_GRANULARITY) - +#define SMALL_SIZE_LIMIT 2016 //! Granularity of a medium allocation block #define MEDIUM_GRANULARITY 512 -//! Medimum granularity shift count +//! Medium granularity shift count #define MEDIUM_GRANULARITY_SHIFT 9 //! Number of medium block size classes #define MEDIUM_CLASS_COUNT 60 -//! Maximum size of a medium block -#define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT) - SPAN_HEADER_SIZE) - //! Total number of small + medium size classes #define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) - //! Number of large block size classes #define LARGE_CLASS_COUNT 32 -//! Maximum number of memory pages in a large block -#define LARGE_MAX_PAGES (SPAN_MAX_PAGE_COUNT * LARGE_CLASS_COUNT) +//! Maximum size of a medium block +#define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT) - SPAN_HEADER_SIZE) //! Maximum size of a large block -#define LARGE_SIZE_LIMIT ((LARGE_MAX_PAGES * RP_PAGE_SIZE) - SPAN_HEADER_SIZE) - -#define SPAN_LIST_LOCK_TOKEN ((void*)1) +#define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) +//! Size of a span header +#define SPAN_HEADER_SIZE 32 #define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs)) #define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second)) -//! Size of a span header -#define SPAN_HEADER_SIZE 32 - #if ARCH_64BIT typedef int64_t offset_t; #else @@ -255,24 +244,35 @@ typedef uint32_t count_t; #if ENABLE_VALIDATE_ARGS //! Maximum allocation size to avoid integer overflow -#define MAX_ALLOC_SIZE (((size_t)-1) - RP_PAGE_SIZE) +#undef MAX_ALLOC_SIZE +#define MAX_ALLOC_SIZE (((size_t)-1) - _memory_span_size) #endif -// Data types - +/// Data types //! A memory heap, per thread typedef struct heap_t heap_t; //! Span of memory pages typedef struct span_t span_t; //! 
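// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] atomic_cas_ptr() above returns
// nonzero when it succeeds in swapping *dst from ref to val. Every lock-free
// list in this allocator is maintained with the same publish-then-CAS retry
// loop; a minimal sketch of that pattern (node_t and list_push are
// hypothetical names, not from the source):
struct node_t { node_t* next; };

static void
list_push(atomicptr_t* list, node_t* node) {
	void* head;
	do {
		head = atomic_load_ptr(list);   // re-read the head on every attempt
		node->next = (node_t*)head;     // link the node before publishing it
	} while (!atomic_cas_ptr(list, node, head)); // retry if the head moved
}
// heap_t::defer_deallocate and heap_t::defer_unmap later in this patch are
// maintained with exactly this loop shape.
// ---------------------------------------------------------------------------
 //! 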
 //! Size class definition
 typedef struct size_class_t size_class_t;
-//! Span block bookkeeping 
+//! Span block bookkeeping
 typedef struct span_block_t span_block_t;
+//! Span list bookkeeping
+typedef struct span_list_t span_list_t;
 //! Span data union, usage depending on span state
 typedef union span_data_t span_data_t;
 //! Cache data
 typedef struct span_counter_t span_counter_t;
+//! Global cache
+typedef struct global_cache_t global_cache_t;
+//! Flag indicating span is the first (master) span of a split superspan
+#define SPAN_FLAG_MASTER 1
+//! Flag indicating span is a secondary (sub) span of a split superspan
+#define SPAN_FLAG_SUBSPAN 2
+
+//Alignment offset must match in both structures to keep the data when
+//transitioning between being used for blocks and being part of a list
 struct span_block_t {
 	//! Free list
 	uint16_t free_list;
@@ -280,22 +280,44 @@ struct span_block_t {
 	//! First autolinked block
 	uint16_t first_autolink;
 	//! Free count
 	uint16_t free_count;
-	//! Padding
-	uint16_t padding;
+	//! Alignment offset
+	uint16_t align_offset;
+};
+
+struct span_list_t {
+	//! List size
+	uint32_t size;
+	//! Unused in lists
+	uint16_t unused;
+	//! Alignment offset
+	uint16_t align_offset;
 };
 
 union span_data_t {
-	//! Span data
+	//! Span data when used as blocks
 	span_block_t block;
-	//! List size (used when span is part of a list)
-	uint32_t list_size;
+	//! Span data when used in lists
+	span_list_t list;
 };
 
+//A span can either represent a single span of memory pages with size declared by span_map_count configuration variable,
+//or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single
+//span or a super span. A super span can further be divided into multiple spans (or, in turn, super spans), where the first
+//(super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans
+//are still alive and mapped in virtual memory, and once all subspans and the master have been unmapped, the entire
+//superspan region is released and unmapped (on Windows, for example, the entire superspan range has to be released
+//in the same call to release the virtual memory range, but individual subranges can be decommitted individually
+//to reduce physical memory use).
 struct span_t {
 	//! Heap ID
 	atomic32_t heap_id;
 	//! Size class
-	count_t size_class;
+	uint16_t size_class;
+	// TODO: If we could store the remainder part of flags as an atomic counter, the entire check of
+	// whether the master is owned by the calling heap could be simplified to an atomic decrement from
+	// any thread, since the remainder of a split super span only ever decreases, never increases
+	//! Flags and counters
+	uint16_t flags;
 	//! Span data
 	span_data_t data;
 	//! Next span
@@ -305,6 +327,7 @@ struct span_t {
 };
 static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch");
 
+//Adaptive cache counter of a single superspan span count
 struct span_counter_t {
 	//! Allocation high water mark
 	uint32_t max_allocations;
@@ -317,44 +340,45 @@ struct span_counter_t {
 struct heap_t {
 	//! Heap ID
 	int32_t id;
-	//! Deferred deallocation
-	atomicptr_t defer_deallocate;
 	//! Free count for each size class active span
 	span_block_t active_block[SIZE_CLASS_COUNT];
 	//! Active span for each size class
 	span_t* active_span[SIZE_CLASS_COUNT];
-	//! List of demi-used spans with free blocks for each size class (double linked list)
+	//! List of semi-used spans with free blocks for each size class (double linked list)
 	span_t* size_cache[SIZE_CLASS_COUNT];
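// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] The "alignment offset must match"
// comment on span_block_t/span_list_t earlier in this hunk is a checkable
// invariant: a span alternates between the two span_data_t views while
// align_offset must survive the transition. Assuming offsetof from <cstddef>:
static_assert(offsetof(span_block_t, align_offset) ==
              offsetof(span_list_t, align_offset),
              "align_offset must overlay between block and list views");
static_assert(sizeof(span_block_t) == sizeof(span_list_t),
              "both span_data_t views are expected to be 8 bytes");
// ---------------------------------------------------------------------------
-	//! 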
List of free spans for each page count (single linked list) - span_t* span_cache[SPAN_CLASS_COUNT]; +#if ENABLE_THREAD_CACHE + //! List of free spans (single linked list) + span_t* span_cache[LARGE_CLASS_COUNT]; //! Allocation counters - span_counter_t span_counter[SPAN_CLASS_COUNT]; - //! List of free spans for each large class count (single linked list) - span_t* large_cache[LARGE_CLASS_COUNT]; - //! Allocation counters for large blocks - span_counter_t large_counter[LARGE_CLASS_COUNT]; + span_counter_t span_counter[LARGE_CLASS_COUNT]; +#endif + //! Mapped but unused spans + span_t* span_reserve; + //! Master span for mapped but unused spans + span_t* span_reserve_master; + //! Number of mapped but unused spans + size_t spans_reserved; + //! Deferred deallocation + atomicptr_t defer_deallocate; + //! Deferred unmaps + atomicptr_t defer_unmap; //! Next heap in id list heap_t* next_heap; //! Next heap in orphan list heap_t* next_orphan; + //! Memory pages alignment offset + size_t align_offset; #if ENABLE_STATISTICS - //! Number of bytes currently reqeusted in allocations - size_t requested; - //! Number of bytes current allocated - size_t allocated; //! Number of bytes transitioned thread -> global size_t thread_to_global; //! Number of bytes transitioned global -> thread size_t global_to_thread; #endif }; -static_assert(sizeof(heap_t) <= RP_PAGE_SIZE*2, "heap size mismatch"); struct size_class_t { //! Size of blocks in this class - uint16_t size; - //! Number of pages to allocate for a chunk - uint16_t page_count; + uint32_t size; //! Number of blocks in each chunk uint16_t block_count; //! Class index this class is merged with @@ -362,56 +386,112 @@ struct size_class_t { }; static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); +struct global_cache_t { + //! Cache list pointer + atomicptr_t cache; + //! Cache size + atomic32_t size; + //! ABA counter + atomic32_t counter; +}; + +/// Global data +//! Configuration +static rpmalloc_config_t _memory_config; +//! Memory page size +static size_t _memory_page_size; +//! Shift to divide by page size +static size_t _memory_page_size_shift; +//! Mask to get to start of a memory page +static size_t _memory_page_mask; +//! Granularity at which memory pages are mapped by OS +static size_t _memory_map_granularity; +//! Size of a span of memory pages +static size_t _memory_span_size; +//! Shift to divide by span size +static size_t _memory_span_size_shift; +//! Mask to get to start of a memory span +static uintptr_t _memory_span_mask; //! Global size classes static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; - +//! Run-time size limit of medium blocks +static size_t _memory_medium_size_limit; //! Heap ID counter static atomic32_t _memory_heap_id; - -#ifdef PLATFORM_POSIX -//! Virtual memory address counter -static atomic64_t _memory_addr; +#if ENABLE_THREAD_CACHE +//! Adaptive cache max allocation count +static uint32_t _memory_max_allocation[LARGE_CLASS_COUNT]; #endif - +#if ENABLE_GLOBAL_CACHE //! Global span cache -static atomicptr_t _memory_span_cache[SPAN_CLASS_COUNT]; - -//! Global large cache -static atomicptr_t _memory_large_cache[LARGE_CLASS_COUNT]; - -//! Current thread heap -static _Thread_local heap_t* _memory_thread_heap TLS_MODEL; - +static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; +#endif //! All heaps static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; - //! Orphaned heaps static atomicptr_t _memory_orphan_heaps; - +//! 
Running orphan counter to avoid ABA issues in linked list +static atomic32_t _memory_orphan_counter; //! Active heap count static atomic32_t _memory_active_heaps; - -//! Adaptive cache max allocation count -static uint32_t _memory_max_allocation[SPAN_CLASS_COUNT]; - -//! Adaptive cache max allocation count -static uint32_t _memory_max_allocation_large[LARGE_CLASS_COUNT]; - #if ENABLE_STATISTICS -//! Total number of mapped memory pages +//! Total number of currently mapped memory pages static atomic32_t _mapped_pages; +//! Total number of currently lost spans +static atomic32_t _reserved_spans; //! Running counter of total number of mapped memory pages since start static atomic32_t _mapped_total; //! Running counter of total number of unmapped memory pages since start static atomic32_t _unmapped_total; #endif -static void* -_memory_map(size_t page_count); +#define MEMORY_UNUSED(x) (void)sizeof((x)) +//! Current thread heap +#if defined(__APPLE__) && ENABLE_PRELOAD +static pthread_key_t _memory_thread_heap; +#else +# ifdef _MSC_VER +# define _Thread_local __declspec(thread) +# define TLS_MODEL +# else +# define TLS_MODEL __attribute__((tls_model("initial-exec"))) +# if !defined(__clang__) && defined(__GNUC__) +# define _Thread_local __thread +# endif +# endif +static _Thread_local heap_t* _memory_thread_heap TLS_MODEL; +#endif + +//! Get the current thread heap +static FORCEINLINE heap_t* +get_thread_heap(void) { +#if defined(__APPLE__) && ENABLE_PRELOAD + return pthread_getspecific(_memory_thread_heap); +#else + return _memory_thread_heap; +#endif +} + +//! Set the current thread heap static void -_memory_unmap(void* ptr, size_t page_count); +set_thread_heap(heap_t* heap) { +#if defined(__APPLE__) && ENABLE_PRELOAD + pthread_setspecific(_memory_thread_heap, heap); +#else + _memory_thread_heap = heap; +#endif +} +//! Default implementation to map more virtual memory +static void* +_memory_map_os(size_t size, size_t* offset); + +//! Default implementation to unmap virtual memory +static void +_memory_unmap_os(void* address, size_t size, size_t offset, int release); + +//! Deallocate any deferred blocks and check for the given size class static int _memory_deallocate_deferred(heap_t* heap, size_t size_class); @@ -425,212 +505,541 @@ _memory_heap_lookup(int32_t id) { return heap; } -//! Get the span size class from page count -static size_t -_span_class_from_page_count(size_t page_count) { - assert((page_count > 0) && (page_count <= 16)); - return ((page_count + SPAN_CLASS_GRANULARITY - 1) / SPAN_CLASS_GRANULARITY) - 1; -} +#if ENABLE_THREAD_CACHE //! Increase an allocation counter static void -_memory_counter_increase(span_counter_t* counter, uint32_t* global_counter) { +_memory_counter_increase(span_counter_t* counter, uint32_t* global_counter, size_t span_count) { if (++counter->current_allocations > counter->max_allocations) { counter->max_allocations = counter->current_allocations; -#if MAX_SPAN_CACHE_DIVISOR > 0 - counter->cache_limit = counter->max_allocations / MAX_SPAN_CACHE_DIVISOR; + const uint32_t cache_limit_max = (uint32_t)_memory_span_size - 2; +#if !ENABLE_UNLIMITED_CACHE + counter->cache_limit = counter->max_allocations / ((span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : MAX_LARGE_SPAN_CACHE_DIVISOR); + const uint32_t cache_limit_min = (span_count == 1) ? 
(MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE) : (MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE); + if (counter->cache_limit < cache_limit_min) + counter->cache_limit = cache_limit_min; + if (counter->cache_limit > cache_limit_max) + counter->cache_limit = cache_limit_max; +#else + counter->cache_limit = cache_limit_max; #endif if (counter->max_allocations > *global_counter) *global_counter = counter->max_allocations; } } -//! Insert the given list of memory page spans in the global cache for small/medium blocks -static void -_memory_global_cache_insert(span_t* first_span, size_t list_size, size_t page_count) { - assert((list_size == 1) || (first_span->next_span != 0)); -#if MAX_SPAN_CACHE_DIVISOR > 0 - while (1) { - size_t span_class_idx = _span_class_from_page_count(page_count); - void* global_span_ptr = atomic_load_ptr(&_memory_span_cache[span_class_idx]); - if (global_span_ptr != SPAN_LIST_LOCK_TOKEN) { - uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~SPAN_MASK; - span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); - -#ifdef GLOBAL_SPAN_CACHE_MULTIPLIER - size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation[span_class_idx] / MAX_SPAN_CACHE_DIVISOR); - if ((global_list_size >= cache_limit) && (global_list_size > MIN_SPAN_CACHE_SIZE)) - break; +#else +# define _memory_counter_increase(counter, global_counter, span_count) do {} while (0) #endif - //We only have 16 bits for size of list, avoid overflow - if ((global_list_size + list_size) > 0xFFFF) - break; - //Use prev_span as skip pointer over this sublist range of spans - first_span->data.list_size = (uint32_t)list_size; - first_span->prev_span = global_span; - - //Insert sublist into global cache - global_list_size += list_size; - void* first_span_ptr = (void*)((uintptr_t)first_span | global_list_size); - if (atomic_cas_ptr(&_memory_span_cache[span_class_idx], first_span_ptr, global_span_ptr)) - return; - } - else { - //Atomic operation failed, yield timeslice and retry - thread_yield(); - atomic_thread_fence_acquire(); - } - } +#if ENABLE_STATISTICS +# define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value)) +# define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value)) +#else +# define _memory_statistics_add(atomic_counter, value) do {} while(0) +# define _memory_statistics_sub(atomic_counter, value) do {} while(0) #endif - //Global cache full, release pages - for (size_t ispan = 0; ispan < list_size; ++ispan) { - assert(first_span); - span_t* next_span = first_span->next_span; - _memory_unmap(first_span, page_count); - first_span = next_span; - } + +//! Map more virtual memory +static void* +_memory_map(size_t size, size_t* offset) { + assert(!(size % _memory_page_size)); + _memory_statistics_add(&_mapped_pages, (size >> _memory_page_size_shift)); + _memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift)); + return _memory_config.memory_map(size, offset); } -//! Extract a number of memory page spans from the global cache for small/medium blocks +//! 
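// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] Worked example of the adaptive
// limit computed in _memory_counter_increase() above for single spans
// (span_count == 1), with the defaults MAX_SPAN_CACHE_DIVISOR = 4 and a
// floor of MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE = 16 + 64 = 80:
//
//   high water mark   cache_limit = clamp(mark / 4, 80, _memory_span_size - 2)
//   100               80     (100 / 4 = 25, pulled up to the floor)
//   400               100
//   1000              250
//
// The limit only moves when a new allocation high water mark is reached, so
// a steady-state workload keeps a stable thread cache size.
// ---------------------------------------------------------------------------
+//! 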
Unmap virtual memory +static void +_memory_unmap(void* address, size_t size, size_t offset, int release) { + assert((size < _memory_span_size) || !((uintptr_t)address & ~_memory_span_mask)); + assert(!(size % _memory_page_size)); + _memory_statistics_sub(&_mapped_pages, (size >> _memory_page_size_shift)); + _memory_statistics_add(&_unmapped_total, (size >> _memory_page_size_shift)); + _memory_config.memory_unmap(address, size, offset, release); +} + +//! Make flags field in a span from flags, remainder/distance and count +#define SPAN_MAKE_FLAGS(flags, remdist, count) ((uint16_t)((flags) | ((uint16_t)((remdist) - 1) << 2) | ((uint16_t)((count) - 1) << 9))); assert((flags) < 4); assert((remdist) && (remdist) < 128); assert((count) && (count) < 128) +//! Check if span has any of the given flags +#define SPAN_HAS_FLAG(flags, flag) ((flags) & (flag)) +//! Get the distance from flags field +#define SPAN_DISTANCE(flags) (1 + (((flags) >> 2) & 0x7f)) +//! Get the remainder from flags field +#define SPAN_REMAINS(flags) (1 + (((flags) >> 2) & 0x7f)) +//! Get the count from flags field +#define SPAN_COUNT(flags) (1 + (((flags) >> 9) & 0x7f)) +//! Set the remainder in the flags field (MUST be done from the owner heap thread) +#define SPAN_SET_REMAINS(flags, remains) flags = ((uint16_t)(((flags) & 0xfe03) | ((uint16_t)((remains) - 1) << 2))); assert((remains) < 128) + +//! Resize the given super span to the given count of spans, store the remainder in the heap reserved spans fields +static void +_memory_set_span_remainder_as_reserved(heap_t* heap, span_t* span, size_t use_count) { + size_t current_count = SPAN_COUNT(span->flags); + + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + assert((current_count > 1) && (current_count < 127)); + assert(!heap->spans_reserved); + assert(SPAN_COUNT(span->flags) == current_count); + assert(current_count > use_count); + + heap->span_reserve = (span_t*)pointer_offset(span, use_count * _memory_span_size); + heap->spans_reserved = current_count - use_count; + if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { + //We must store the heap id before setting as master, to force unmaps to defer to this heap thread + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + heap->span_reserve_master = span; + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); + _memory_statistics_add(&_reserved_spans, current_count); + } + else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { + //Only owner heap thread can modify a master span + assert(atomic_load32(&span->heap_id) == heap->id); + uint16_t remains = SPAN_REMAINS(span->flags); + assert(remains >= current_count); + heap->span_reserve_master = span; + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, use_count); + } + else { //SPAN_FLAG_SUBSPAN + //Resizing a subspan is a safe operation in any thread + uint16_t distance = SPAN_DISTANCE(span->flags); + span_t* master = (span_t*)pointer_offset(span, -(int)distance * (int)_memory_span_size); + heap->span_reserve_master = master; + assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER)); + assert(SPAN_REMAINS(master->flags) >= current_count); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, use_count); + } + assert((SPAN_COUNT(span->flags) + heap->spans_reserved) == current_count); +} + +//! 
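// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] span_t::flags packs three fields
// into 16 bits: bits 0-1 carry SPAN_FLAG_MASTER/SPAN_FLAG_SUBSPAN, bits 2-8
// carry (remainder or distance) - 1, and bits 9-15 carry (span count) - 1.
// This mirrors the SPAN_MAKE_FLAGS/SPAN_REMAINS/SPAN_DISTANCE/SPAN_COUNT
// macros above, rewritten as plain functions for illustration only:
static inline uint16_t
span_flags_pack(uint16_t flags, uint16_t remdist, uint16_t count) {
	// valid ranges: flags < 4, 1 <= remdist < 128, 1 <= count < 128
	return (uint16_t)(flags | ((uint16_t)(remdist - 1) << 2) | ((uint16_t)(count - 1) << 9));
}
static inline uint16_t span_flags_remdist(uint16_t f) { return (uint16_t)(1 + ((f >> 2) & 0x7f)); }
static inline uint16_t span_flags_count(uint16_t f)   { return (uint16_t)(1 + ((f >> 9) & 0x7f)); }
// Example: the master span of a 16-span superspan currently using 4 spans:
//   span_flags_pack(SPAN_FLAG_MASTER, 16, 4) == 0x063d
//   span_flags_remdist(0x063d) == 16, span_flags_count(0x063d) == 4
// ---------------------------------------------------------------------------
+//! 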
Map in memory pages for the given number of spans (or use previously reserved pages) static span_t* -_memory_global_cache_extract(size_t page_count) { - span_t* span = 0; - size_t span_class_idx = _span_class_from_page_count(page_count); - atomicptr_t* cache = &_memory_span_cache[span_class_idx]; - atomic_thread_fence_acquire(); - void* global_span_ptr = atomic_load_ptr(cache); - while (global_span_ptr) { - if ((global_span_ptr != SPAN_LIST_LOCK_TOKEN) && - atomic_cas_ptr(cache, SPAN_LIST_LOCK_TOKEN, global_span_ptr)) { - //Grab a number of thread cache spans, using the skip span pointer - //stored in prev_span to quickly skip ahead in the list to get the new head - uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~SPAN_MASK; - span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); - assert((span->data.list_size == 1) || (span->next_span != 0)); - - span_t* new_global_span = span->prev_span; - global_span_count -= span->data.list_size; - - //Set new head of global cache list - void* new_cache_head = global_span_count ? - ((void*)((uintptr_t)new_global_span | global_span_count)) : - 0; - atomic_store_ptr(cache, new_cache_head); - atomic_thread_fence_release(); - break; - } - - //List busy, yield timeslice and retry - thread_yield(); - atomic_thread_fence_acquire(); - global_span_ptr = atomic_load_ptr(cache); +_memory_map_spans(heap_t* heap, size_t span_count) { + if (span_count <= heap->spans_reserved) { + span_t* span = heap->span_reserve; + heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size); + heap->spans_reserved -= span_count; + //Declare the span to be a subspan with given distance from master span + uint16_t distance = (uint16_t)((uintptr_t)pointer_diff(span, heap->span_reserve_master) >> _memory_span_size_shift); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, span_count); + span->data.block.align_offset = 0; + return span; } + //We cannot request extra spans if we already have some (but not enough) pending reserved spans + size_t request_spans = (heap->spans_reserved || (span_count > _memory_config.span_map_count)) ? span_count : _memory_config.span_map_count; + size_t align_offset = 0; + span_t* span = (span_t*)_memory_map(request_spans * _memory_span_size, &align_offset); + span->flags = SPAN_MAKE_FLAGS(0, request_spans, request_spans); + span->data.block.align_offset = (uint16_t)align_offset; + if (request_spans > span_count) { + //We have extra spans, store them as reserved spans in heap + _memory_set_span_remainder_as_reserved(heap, span, span_count); + } return span; } -/*! Insert the given list of memory page spans in the global cache for large blocks, - similar to _memory_global_cache_insert */ +//! Defer unmapping of the given span to the owner heap +static int +_memory_unmap_defer(int32_t heap_id, span_t* span) { + //Get the heap and link in pointer in list of deferred operations + heap_t* heap = _memory_heap_lookup(heap_id); + if (!heap) + return 0; + atomic_store32(&span->heap_id, heap_id); + void* last_ptr; + do { + last_ptr = atomic_load_ptr(&heap->defer_unmap); + span->next_span = (span_t*)last_ptr; + } while (!atomic_cas_ptr(&heap->defer_unmap, span, last_ptr)); + return 1; +} + +//! 
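// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] How _memory_map_spans() above
// amortizes OS calls with the default DEFAULT_SPAN_MAP_COUNT of 16, assuming
// a fresh heap receiving single-span requests:
//
//   request 1:      spans_reserved == 0, so 16 spans are mapped at once; the
//                   first is returned (and becomes the master), the other 15
//                   are kept in heap->span_reserve
//   requests 2..16: served from the reserve with no OS call, each tagged
//                   SPAN_FLAG_SUBSPAN with its distance from the master
//
// A request larger than span_map_count, or one arriving while a partial
// reserve exists, maps exactly span_count spans instead, so a reserve is
// never split across two superspans.
// ---------------------------------------------------------------------------
+//! 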
Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) static void -_memory_global_cache_large_insert(span_t* span_list, size_t list_size, size_t span_count) { - assert((list_size == 1) || (span_list->next_span != 0)); - assert(span_list->size_class == (SIZE_CLASS_COUNT + (span_count - 1))); -#if MAX_SPAN_CACHE_DIVISOR > 0 - atomicptr_t* cache = &_memory_large_cache[span_count - 1]; - while (1) { - void* global_span_ptr = atomic_load_ptr(cache); - if (global_span_ptr != SPAN_LIST_LOCK_TOKEN) { - uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~SPAN_MASK; - span_t* global_span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); - -#ifdef GLOBAL_SPAN_CACHE_MULTIPLIER - size_t cache_limit = GLOBAL_SPAN_CACHE_MULTIPLIER * (_memory_max_allocation_large[span_count-1] / MAX_SPAN_CACHE_DIVISOR); - if ((global_list_size >= cache_limit) && (global_list_size > MIN_SPAN_CACHE_SIZE)) - break; -#endif - if ((global_list_size + list_size) > 0xFFFF) - break; - - span_list->data.list_size = (uint32_t)list_size; - span_list->prev_span = global_span; - - global_list_size += list_size; - void* new_global_span_ptr = (void*)((uintptr_t)span_list | global_list_size); - if (atomic_cas_ptr(cache, new_global_span_ptr, global_span_ptr)) - return; - } - else { - thread_yield(); - atomic_thread_fence_acquire(); - } +_memory_unmap_span(heap_t* heap, span_t* span) { + size_t span_count = SPAN_COUNT(span->flags); + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + //A plain run of spans can be unmapped directly + if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { + _memory_unmap(span, span_count * _memory_span_size, span->data.list.align_offset, 1); + return; } -#endif - //Global cache full, release spans - for (size_t ispan = 0; ispan < list_size; ++ispan) { - assert(span_list); - span_t* next_span = span_list->next_span; - _memory_unmap(span_list, span_count * SPAN_MAX_PAGE_COUNT); - span_list = next_span; + + uint32_t is_master = SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER); + span_t* master = is_master ? span : (span_t*)(pointer_offset(span, -(int)SPAN_DISTANCE(span->flags) * (int)_memory_span_size)); + + assert(is_master || SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER)); + + //Check if we own the master span, otherwise defer (only owner of master span can modify remainder field) + int32_t master_heap_id = atomic_load32(&master->heap_id); + if (heap && (master_heap_id != heap->id)) { + if (_memory_unmap_defer(master_heap_id, span)) + return; + } + if (!is_master) { + //Directly unmap subspans + assert(span->data.list.align_offset == 0); + _memory_unmap(span, span_count * _memory_span_size, 0, 0); + _memory_statistics_sub(&_reserved_spans, span_count); + } + else { + //Special double flag to denote an unmapped master + //It must be kept in memory since span header must be used + span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; + } + //We are in owner thread of the master span + uint32_t remains = SPAN_REMAINS(master->flags); + assert(remains >= span_count); + remains = ((uint32_t)span_count >= remains) ? 
0 : (remains - (uint32_t)span_count); + if (!remains) { + //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span + assert(SPAN_HAS_FLAG(master->flags, SPAN_FLAG_MASTER) && SPAN_HAS_FLAG(master->flags, SPAN_FLAG_SUBSPAN)); + span_count = SPAN_COUNT(master->flags); + _memory_unmap(master, span_count * _memory_span_size, master->data.list.align_offset, 1); + _memory_statistics_sub(&_reserved_spans, span_count); + } + else { + //Set remaining spans + SPAN_SET_REMAINS(master->flags, remains); } } -/*! Extract a number of memory page spans from the global cache for large blocks, - similar to _memory_global_cache_extract */ +//! Process pending deferred cross-thread unmaps static span_t* -_memory_global_cache_large_extract(size_t span_count) { - span_t* span = 0; - atomicptr_t* cache = &_memory_large_cache[span_count - 1]; +_memory_unmap_deferred(heap_t* heap, size_t wanted_count) { + //Grab the current list of deferred unmaps atomic_thread_fence_acquire(); - void* global_span_ptr = atomic_load_ptr(cache); - while (global_span_ptr) { - if ((global_span_ptr != SPAN_LIST_LOCK_TOKEN) && - atomic_cas_ptr(cache, SPAN_LIST_LOCK_TOKEN, global_span_ptr)) { - uintptr_t global_list_size = (uintptr_t)global_span_ptr & ~SPAN_MASK; - span = (span_t*)((void*)((uintptr_t)global_span_ptr & SPAN_MASK)); - assert((span->data.list_size == 1) || (span->next_span != 0)); - assert(span->size_class == (SIZE_CLASS_COUNT + (span_count - 1))); + span_t* span = (span_t*)atomic_load_ptr(&heap->defer_unmap); + if (!span || !atomic_cas_ptr(&heap->defer_unmap, 0, span)) + return 0; + span_t* found_span = 0; + do { + //Verify that we own the master span, otherwise re-defer to owner + void* next = span->next_span; + if (!found_span && SPAN_COUNT(span->flags) == wanted_count) { + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + found_span = span; + } + else { + uint32_t is_master = SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER); + span_t* master = is_master ? span : (span_t*)(pointer_offset(span, -(int)SPAN_DISTANCE(span->flags) * (int)_memory_span_size)); + int32_t master_heap_id = atomic_load32(&master->heap_id); + if ((atomic_load32(&span->heap_id) == master_heap_id) || + !_memory_unmap_defer(master_heap_id, span)) { + //We own the master span (or heap merged and abandoned) + _memory_unmap_span(heap, span); + } + } + span = (span_t*)next; + } while (span); + return found_span; +} - span_t* new_global_span = span->prev_span; - global_list_size -= span->data.list_size; +//! Unmap a single linked list of spans +static void +_memory_unmap_span_list(heap_t* heap, span_t* span) { + size_t list_size = span->data.list.size; + for (size_t ispan = 0; ispan < list_size; ++ispan) { + span_t* next_span = span->next_span; + _memory_unmap_span(heap, span); + span = next_span; + } + assert(!span); +} - void* new_global_span_ptr = global_list_size ? - ((void*)((uintptr_t)new_global_span | global_list_size)) : - 0; - atomic_store_ptr(cache, new_global_span_ptr); - atomic_thread_fence_release(); +#if ENABLE_THREAD_CACHE + +//! 
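// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] Remainder accounting performed by
// _memory_unmap_span() above, for a 16-span superspan split into a 4-span
// master A and a 12-span subspan B (the master's remains starts at 16):
//
//   unmap B: a subspan, so its pages are returned to the OS immediately
//            (release == 0) and the master's remains drops 16 -> 4
//   unmap A: the master's header must stay addressable, so A is tagged with
//            the MASTER|SUBSPAN double flag and remains drops 4 -> 0; at
//            zero the whole range is unmapped in one call with release == 1
//
// Either step is deferred to the owning heap via _memory_unmap_defer() when
// the calling thread does not own the master span.
// ---------------------------------------------------------------------------
+//! 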
Split a super span in two +static span_t* +_memory_span_split(heap_t* heap, span_t* span, size_t use_count) { + uint16_t distance = 0; + size_t current_count = SPAN_COUNT(span->flags); + assert(current_count > use_count); + assert(!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER) || !SPAN_HAS_FLAG(span->flags, SPAN_FLAG_SUBSPAN)); + if (!SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN)) { + //Must store heap in master span before use, to avoid issues when unmapping subspans + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, current_count, use_count); + _memory_statistics_add(&_reserved_spans, current_count); + } + else if (SPAN_HAS_FLAG(span->flags, SPAN_FLAG_MASTER)) { + //Only valid to call on master span if we own it + assert(atomic_load32(&span->heap_id) == heap->id); + uint16_t remains = SPAN_REMAINS(span->flags); + assert(remains >= current_count); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_MASTER, remains, use_count); + } + else { //SPAN_FLAG_SUBSPAN + distance = SPAN_DISTANCE(span->flags); + span->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance, use_count); + } + //Setup remainder as a subspan + span_t* subspan = (span_t*)pointer_offset(span, use_count * _memory_span_size); + subspan->flags = SPAN_MAKE_FLAGS(SPAN_FLAG_SUBSPAN, distance + use_count, current_count - use_count); + subspan->data.list.align_offset = 0; + return subspan; +} + +//! Add span to head of single linked span list +static size_t +_memory_span_list_push(span_t** head, span_t* span) { + span->next_span = *head; + if (*head) + span->data.list.size = (*head)->data.list.size + 1; + else + span->data.list.size = 1; + *head = span; + return span->data.list.size; +} + +//! Remove span from head of single linked span list, returns the new list head +static span_t* +_memory_span_list_pop(span_t** head) { + span_t* span = *head; + span_t* next_span = 0; + if (span->data.list.size > 1) { + next_span = span->next_span; + assert(next_span); + next_span->data.list.size = span->data.list.size - 1; + } + *head = next_span; + return span; +} + +//! Split a single linked span list +static span_t* +_memory_span_list_split(span_t* span, size_t limit) { + span_t* next = 0; + if (limit < 2) + limit = 2; + if (span->data.list.size > limit) { + count_t list_size = 1; + span_t* last = span; + next = span->next_span; + while (list_size < limit) { + last = next; + next = next->next_span; + ++list_size; + } + last->next_span = 0; + assert(next); + next->data.list.size = span->data.list.size - list_size; + span->data.list.size = list_size; + span->prev_span = 0; + } + return next; +} + +#endif + +//! Add a span to a double linked list +static void +_memory_span_list_doublelink_add(span_t** head, span_t* span) { + if (*head) { + (*head)->prev_span = span; + span->next_span = *head; + } + else { + span->next_span = 0; + } + *head = span; +} + +//! Remove a span from a double linked list +static void +_memory_span_list_doublelink_remove(span_t** head, span_t* span) { + if (*head == span) { + *head = span->next_span; + } + else { + span_t* next_span = span->next_span; + span_t* prev_span = span->prev_span; + if (next_span) + next_span->prev_span = prev_span; + prev_span->next_span = next_span; + } +} + +#if ENABLE_GLOBAL_CACHE + +//! 
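// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] The single linked span lists above
// store the list length in the head span only: _memory_span_list_push()
// writes old_size + 1 into the new head and _memory_span_list_pop() copies
// size - 1 down to the next span, so a three element list looks like
//
//   head (list.size == 3) -> span (stale size) -> span (stale size) -> 0
//
// and data.list.size is only meaningful when read through the head pointer.
// _memory_span_list_split() preserves the same invariant on both halves when
// it carves off the block that overflows to the global cache.
// ---------------------------------------------------------------------------
+//! 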
Insert the given list of memory page spans in the global cache +static void +_memory_cache_insert(heap_t* heap, global_cache_t* cache, span_t* span, size_t cache_limit) { + assert((span->data.list.size == 1) || (span->next_span != 0)); + int32_t list_size = (int32_t)span->data.list.size; + //Unmap if cache has reached the limit + if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { + _memory_unmap_span_list(heap, span); + atomic_add32(&cache->size, -list_size); + return; + } + void* current_cache, *new_cache; + do { + current_cache = atomic_load_ptr(&cache->cache); + span->prev_span = (span_t*)(void*)((uintptr_t)current_cache & _memory_span_mask); + new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); +} + +//! Extract a number of memory page spans from the global cache +static span_t* +_memory_cache_extract(global_cache_t* cache) { + uintptr_t span_ptr; + do { + void* global_span = atomic_load_ptr(&cache->cache); + span_ptr = (uintptr_t)global_span & _memory_span_mask; + if (span_ptr) { + span_t* span = (span_t*)(void*)span_ptr; + //By accessing the span ptr before it is swapped out of list we assume that a contending thread + //does not manage to traverse the span to being unmapped before we access it + void* new_cache = (void*)((uintptr_t)span->prev_span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) { + atomic_add32(&cache->size, -(int32_t)span->data.list.size); + return span; + } + } + } while (span_ptr); + return 0; +} + +//! Finalize a global cache, only valid from allocator finalization (not thread safe) +static void +_memory_cache_finalize(global_cache_t* cache) { + void* current_cache = atomic_load_ptr(&cache->cache); + span_t* span = (span_t*)(void*)((uintptr_t)current_cache & _memory_span_mask); + while (span) { + span_t* skip_span = (span_t*)(void*)((uintptr_t)span->prev_span & _memory_span_mask); + atomic_add32(&cache->size, -(int32_t)span->data.list.size); + _memory_unmap_span_list(0, span); + span = skip_span; + } + assert(!atomic_load32(&cache->size)); + atomic_store_ptr(&cache->cache, 0); + atomic_store32(&cache->size, 0); +} + +//! Insert the given list of memory page spans in the global cache +static void +_memory_global_cache_insert(heap_t* heap, span_t* span) { + //Calculate adaptive limits + size_t span_count = SPAN_COUNT(span->flags); + const size_t cache_divisor = (span_count == 1) ? MAX_SPAN_CACHE_DIVISOR : (MAX_LARGE_SPAN_CACHE_DIVISOR * span_count * 2); + const size_t cache_limit = (MAX_GLOBAL_CACHE_MULTIPLIER * _memory_max_allocation[span_count - 1]) / cache_divisor; + const size_t cache_limit_min = MAX_GLOBAL_CACHE_MULTIPLIER * (span_count == 1 ? MIN_SPAN_CACHE_SIZE : MIN_LARGE_SPAN_CACHE_SIZE); + _memory_cache_insert(heap, &_memory_span_cache[span_count - 1], span, cache_limit > cache_limit_min ? cache_limit : cache_limit_min); +} + +//! Extract a number of memory page spans from the global cache for large blocks +static span_t* +_memory_global_cache_extract(size_t span_count) { + span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]); + assert(!span || (SPAN_COUNT(span->flags) == span_count)); + return span; +} + +#endif + +//! 
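// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] _memory_cache_insert() and
// _memory_cache_extract() above pack the list head and an ABA tag into one
// word: span addresses are aligned to the span size, so the bits below
// _memory_span_mask are free to hold the running cache->counter value.
// Illustration only (tag_span/untag_span are hypothetical helpers):
static inline void*
tag_span(span_t* span, int32_t counter) {
	return (void*)((uintptr_t)span | ((uintptr_t)counter & ~_memory_span_mask));
}
static inline span_t*
untag_span(void* tagged) {
	return (span_t*)((uintptr_t)tagged & _memory_span_mask);
}
// A delayed thread's CAS on cache->cache then fails even if the same span
// address has come back to the head, because the tag bits have advanced.
// ---------------------------------------------------------------------------
+//! 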
Insert a single span into thread heap cache, releasing to global cache if overflow +static void +_memory_heap_cache_insert(heap_t* heap, span_t* span) { +#if ENABLE_THREAD_CACHE + size_t span_count = SPAN_COUNT(span->flags); + size_t idx = span_count - 1; + if (_memory_span_list_push(&heap->span_cache[idx], span) <= heap->span_counter[idx].cache_limit) + return; + heap->span_cache[idx] = _memory_span_list_split(span, heap->span_counter[idx].cache_limit); + assert(span->data.list.size == heap->span_counter[idx].cache_limit); +#if ENABLE_STATISTICS + heap->thread_to_global += (size_t)span->data.list.size * span_count * _memory_span_size; +#endif +#if ENABLE_GLOBAL_CACHE + _memory_global_cache_insert(heap, span); +#else + _memory_unmap_span_list(heap, span); +#endif +#else + _memory_unmap_span(heap, span); +#endif +} + +//! Extract the given number of spans from the different cache levels +static span_t* +_memory_heap_cache_extract(heap_t* heap, size_t span_count) { +#if ENABLE_THREAD_CACHE + size_t idx = span_count - 1; + //Step 1: check thread cache + if (heap->span_cache[idx]) + return _memory_span_list_pop(&heap->span_cache[idx]); +#endif + //Step 2: Check reserved spans + if (heap->spans_reserved >= span_count) + return _memory_map_spans(heap, span_count); + //Step 3: Try processing deferred unmappings + span_t* span = _memory_unmap_deferred(heap, span_count); + if (span) + return span; +#if ENABLE_THREAD_CACHE + //Step 4: Check larger super spans and split if we find one + for (++idx; idx < LARGE_CLASS_COUNT; ++idx) { + if (heap->span_cache[idx]) { + span = _memory_span_list_pop(&heap->span_cache[idx]); break; } - - thread_yield(); - atomic_thread_fence_acquire(); - global_span_ptr = atomic_load_ptr(cache); } - return span; + if (span) { + //Mark the span as owned by this heap before splitting + size_t got_count = SPAN_COUNT(span->flags); + assert(got_count > span_count); + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + + //Split the span and store as reserved if no previously reserved spans, or in thread cache otherwise + span_t* subspan = _memory_span_split(heap, span, span_count); + assert((SPAN_COUNT(span->flags) + SPAN_COUNT(subspan->flags)) == got_count); + assert(SPAN_COUNT(span->flags) == span_count); + if (!heap->spans_reserved) { + heap->spans_reserved = got_count - span_count; + heap->span_reserve = subspan; + heap->span_reserve_master = (span_t*)pointer_offset(subspan, -(int32_t)SPAN_DISTANCE(subspan->flags) * (int32_t)_memory_span_size); + } + else { + _memory_heap_cache_insert(heap, subspan); + } + return span; + } +#if ENABLE_GLOBAL_CACHE + //Step 5: Extract from global cache + idx = span_count - 1; + heap->span_cache[idx] = _memory_global_cache_extract(span_count); + if (heap->span_cache[idx]) { +#if ENABLE_STATISTICS + heap->global_to_thread += (size_t)heap->span_cache[idx]->data.list.size * span_count * _memory_span_size; +#endif + return _memory_span_list_pop(&heap->span_cache[idx]); + } +#endif +#endif + return 0; } //! Allocate a small/medium sized memory block from the given heap static void* _memory_allocate_from_heap(heap_t* heap, size_t size) { -#if ENABLE_STATISTICS - //For statistics we need to store the requested size in the memory block - size += sizeof(size_t); -#endif - //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) - const size_t class_idx = _memory_size_class[(size <= SMALL_SIZE_LIMIT) ? 
- ((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT) - 1 : - SMALL_CLASS_COUNT + ((size - SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY - 1)) >> MEDIUM_GRANULARITY_SHIFT) - 1].class_idx; + const size_t base_idx = (size <= SMALL_SIZE_LIMIT) ? + ((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT) : + SMALL_CLASS_COUNT + ((size - SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY - 1)) >> MEDIUM_GRANULARITY_SHIFT); + assert(!base_idx || ((base_idx - 1) < SIZE_CLASS_COUNT)); + const size_t class_idx = _memory_size_class[base_idx ? (base_idx - 1) : 0].class_idx; span_block_t* active_block = heap->active_block + class_idx; size_class_t* size_class = _memory_size_class + class_idx; const count_t class_size = size_class->size; -#if ENABLE_STATISTICS - heap->allocated += class_size; - heap->requested += size; -#endif - //Step 1: Try to get a block from the currently active span. The span block bookkeeping // data for the active span is stored in the heap for faster access use_active: @@ -661,16 +1070,11 @@ use_active: assert(active_block->free_list < size_class->block_count); } -#if ENABLE_STATISTICS - //Store the requested size for statistics - *(size_t*)pointer_offset(block, class_size - sizeof(size_t)) = size; -#endif - return block; } //Step 2: No active span, try executing deferred deallocations and try again if there - // was at least one of the reqeusted size class + // was at least one of the requested size class if (_memory_deallocate_deferred(heap, class_idx)) { if (active_block->free_count) goto use_active; @@ -683,46 +1087,29 @@ use_active: span_t* span = heap->size_cache[class_idx]; *active_block = span->data.block; assert(active_block->free_count > 0); - span_t* next_span = span->next_span; - heap->size_cache[class_idx] = next_span; + heap->size_cache[class_idx] = span->next_span; heap->active_span[class_idx] = span; + + //Mark span as owned by this heap + atomic_store32(&span->heap_id, heap->id); + atomic_thread_fence_release(); + goto use_active; } - //Step 4: No semi-used span available, try grab a span from the thread cache - size_t span_class_idx = _span_class_from_page_count(size_class->page_count); - span_t* span = heap->span_cache[span_class_idx]; + //Step 4: Find a span in one of the cache levels + span_t* span = _memory_heap_cache_extract(heap, 1); if (!span) { - //Step 5: No span available in the thread cache, try grab a list of spans from the global cache - span = _memory_global_cache_extract(size_class->page_count); -#if ENABLE_STATISTICS - if (span) - heap->global_to_thread += (size_t)span->data.list_size * size_class->page_count * RP_PAGE_SIZE; -#endif - } - if (span) { - if (span->data.list_size > 1) { - //We got a list of spans, we will use first as active and store remainder in thread cache - span_t* next_span = span->next_span; - assert(next_span); - next_span->data.list_size = span->data.list_size - 1; - heap->span_cache[span_class_idx] = next_span; - } - else { - heap->span_cache[span_class_idx] = 0; - } - } - else { - //Step 6: All caches empty, map in new memory pages - span = (span_t*)_memory_map(size_class->page_count); + //Step 5: Map in more virtual memory + span = _memory_map_spans(heap, 1); } //Mark span as owned by this heap and set base data + assert(SPAN_COUNT(span->flags) == 1); + span->size_class = (uint16_t)class_idx; atomic_store32(&span->heap_id, heap->id); atomic_thread_fence_release(); - span->size_class = (count_t)class_idx; - //If we only have one block we will grab it, otherwise //set span as new span to use for next allocation if 
(size_class->block_count > 1) { @@ -738,12 +1125,7 @@ use_active: } //Track counters - _memory_counter_increase(&heap->span_counter[span_class_idx], &_memory_max_allocation[span_class_idx]); - -#if ENABLE_STATISTICS - //Store the requested size for statistics - *(size_t*)pointer_offset(span, SPAN_HEADER_SIZE + class_size - sizeof(size_t)) = size; -#endif + _memory_counter_increase(&heap->span_counter[0], &_memory_max_allocation[0], 1); //Return first block if memory page span return pointer_offset(span, SPAN_HEADER_SIZE); @@ -753,116 +1135,35 @@ use_active: static void* _memory_allocate_large_from_heap(heap_t* heap, size_t size) { //Calculate number of needed max sized spans (including header) + //Since this function is never called if size > LARGE_SIZE_LIMIT + //the span_count is guaranteed to be <= LARGE_CLASS_COUNT size += SPAN_HEADER_SIZE; - size_t num_spans = size / SPAN_MAX_SIZE; - if (size % SPAN_MAX_SIZE) - ++num_spans; - size_t idx = num_spans - 1; + size_t span_count = size >> _memory_span_size_shift; + if (size & (_memory_span_size - 1)) + ++span_count; + size_t idx = span_count - 1; - if (!idx) { - size_t span_class_idx = _span_class_from_page_count(SPAN_MAX_PAGE_COUNT); - span_t* span = heap->span_cache[span_class_idx]; - if (!span) { - _memory_deallocate_deferred(heap, 0); - span = heap->span_cache[span_class_idx]; - } - if (!span) { - //Step 5: No span available in the thread cache, try grab a list of spans from the global cache - span = _memory_global_cache_extract(SPAN_MAX_PAGE_COUNT); -#if ENABLE_STATISTICS - if (span) - heap->global_to_thread += (size_t)span->data.list_size * SPAN_MAX_PAGE_COUNT * RP_PAGE_SIZE; +#if ENABLE_THREAD_CACHE + if (!heap->span_cache[idx]) + _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); +#else + _memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx); #endif - } - if (span) { - if (span->data.list_size > 1) { - //We got a list of spans, we will use first as active and store remainder in thread cache - span_t* next_span = span->next_span; - assert(next_span); - next_span->data.list_size = span->data.list_size - 1; - heap->span_cache[span_class_idx] = next_span; - } - else { - heap->span_cache[span_class_idx] = 0; - } - } - else { - //Step 6: All caches empty, map in new memory pages - span = (span_t*)_memory_map(SPAN_MAX_PAGE_COUNT); - } - - //Mark span as owned by this heap and set base data - atomic_store32(&span->heap_id, heap->id); - atomic_thread_fence_release(); - - span->size_class = SIZE_CLASS_COUNT; - - //Track counters - _memory_counter_increase(&heap->span_counter[span_class_idx], &_memory_max_allocation[span_class_idx]); - - return pointer_offset(span, SPAN_HEADER_SIZE); + //Step 1: Find span in one of the cache levels + span_t* span = _memory_heap_cache_extract(heap, span_count); + if (!span) { + //Step 2: Map in more virtual memory + span = _memory_map_spans(heap, span_count); } -use_cache: - //Step 1: Check if cache for this large size class (or the following, unless first class) has a span - while (!heap->large_cache[idx] && (idx < LARGE_CLASS_COUNT) && (idx < num_spans + 1)) - ++idx; - span_t* span = heap->large_cache[idx]; - if (span) { - //Happy path, use from cache - if (span->data.list_size > 1) { - span_t* new_head = span->next_span; - assert(new_head); - new_head->data.list_size = span->data.list_size - 1; - heap->large_cache[idx] = new_head; - } - else { - heap->large_cache[idx] = 0; - } - - span->size_class = SIZE_CLASS_COUNT + (count_t)idx; - - //Increase counter - 
_memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx]); - - return pointer_offset(span, SPAN_HEADER_SIZE); - } - - //Restore index, we're back to smallest fitting span count - idx = num_spans - 1; - - //Step 2: Process deferred deallocation - if (_memory_deallocate_deferred(heap, SIZE_CLASS_COUNT + idx)) - goto use_cache; - assert(!heap->large_cache[idx]); - - //Step 3: Extract a list of spans from global cache - span = _memory_global_cache_large_extract(num_spans); - if (span) { -#if ENABLE_STATISTICS - heap->global_to_thread += (size_t)span->data.list_size * num_spans * SPAN_MAX_SIZE; -#endif - //We got a list from global cache, store remainder in thread cache - if (span->data.list_size > 1) { - span_t* new_head = span->next_span; - assert(new_head); - new_head->prev_span = 0; - new_head->data.list_size = span->data.list_size - 1; - heap->large_cache[idx] = new_head; - } - } - else { - //Step 4: Map in more memory pages - span = (span_t*)_memory_map(num_spans * SPAN_MAX_PAGE_COUNT); - } - //Mark span as owned by this heap + //Mark span as owned by this heap and set base data + assert(SPAN_COUNT(span->flags) == span_count); + span->size_class = (uint16_t)(SIZE_CLASS_COUNT + idx); atomic_store32(&span->heap_id, heap->id); atomic_thread_fence_release(); - span->size_class = SIZE_CLASS_COUNT + (count_t)idx; - //Increase counter - _memory_counter_increase(&heap->large_counter[idx], &_memory_max_allocation_large[idx]); + _memory_counter_increase(&heap->span_counter[idx], &_memory_max_allocation[idx], span_count); return pointer_offset(span, SPAN_HEADER_SIZE); } @@ -870,152 +1171,97 @@ use_cache: //! Allocate a new heap static heap_t* _memory_allocate_heap(void) { + void* raw_heap; + void* next_raw_heap; + uintptr_t orphan_counter; heap_t* heap; heap_t* next_heap; //Try getting an orphaned heap atomic_thread_fence_acquire(); do { - heap = (heap_t*)atomic_load_ptr(&_memory_orphan_heaps); + raw_heap = atomic_load_ptr(&_memory_orphan_heaps); + heap = (heap_t*)(void*)((uintptr_t)raw_heap & _memory_page_mask); if (!heap) break; next_heap = heap->next_orphan; + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & ~_memory_page_mask)); } - while (!atomic_cas_ptr(&_memory_orphan_heaps, next_heap, heap)); + while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); - if (heap) { - heap->next_orphan = 0; - return heap; + if (!heap) { + //Map in pages for a new heap + size_t align_offset = 0; + heap = (heap_t*)_memory_map((1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, &align_offset); + memset(heap, 0, sizeof(heap_t)); + heap->align_offset = align_offset; + + //Get a new heap ID + do { + heap->id = atomic_incr32(&_memory_heap_id); + if (_memory_heap_lookup(heap->id)) + heap->id = 0; + } while (!heap->id); + + //Link in heap in heap ID map + size_t list_idx = heap->id % HEAP_ARRAY_SIZE; + do { + next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + heap->next_heap = next_heap; + } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); } - //Map in pages for a new heap - heap = (heap_t*)_memory_map(2); - memset(heap, 0, sizeof(heap_t)); +#if ENABLE_THREAD_CACHE + heap->span_counter[0].cache_limit = MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE; + for (size_t idx = 1; idx < LARGE_CLASS_COUNT; ++idx) + heap->span_counter[idx].cache_limit = MIN_LARGE_SPAN_CACHE_RELEASE + MIN_LARGE_SPAN_CACHE_SIZE; +#endif - //Get a new heap ID - do { - 
heap->id = atomic_incr32(&_memory_heap_id); - if (_memory_heap_lookup(heap->id)) - heap->id = 0; - } - while (!heap->id); - - //Link in heap in heap ID map - size_t list_idx = heap->id % HEAP_ARRAY_SIZE; - do { - next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); - heap->next_heap = next_heap; - } - while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); + //Clean up any deferred operations + _memory_unmap_deferred(heap, 0); + _memory_deallocate_deferred(heap, 0); return heap; } -//! Add a span to a double linked list -static void -_memory_list_add(span_t** head, span_t* span) { - if (*head) { - (*head)->prev_span = span; - span->next_span = *head; - } - else { - span->next_span = 0; - } - *head = span; -} - -//! Remove a span from a double linked list -static void -_memory_list_remove(span_t** head, span_t* span) { - if (*head == span) { - *head = span->next_span; - } - else { - if (span->next_span) - span->next_span->prev_span = span->prev_span; - span->prev_span->next_span = span->next_span; - } -} - -//! Insert span into thread cache, releasing to global cache if overflow -static void -_memory_heap_cache_insert(heap_t* heap, span_t* span, size_t page_count) { -#if MAX_SPAN_CACHE_DIVISOR == 0 - (void)sizeof(heap); - _memory_global_cache_insert(span, 1, page_count); -#else - size_t span_class_idx = _span_class_from_page_count(page_count); - span_t** cache = &heap->span_cache[span_class_idx]; - span->next_span = *cache; - if (*cache) - span->data.list_size = (*cache)->data.list_size + 1; - else - span->data.list_size = 1; - *cache = span; -#if MAX_SPAN_CACHE_DIVISOR > 1 - //Check if cache exceeds limit - if ((span->data.list_size >= (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE)) && - (span->data.list_size > heap->span_counter[span_class_idx].cache_limit)) { - //Release to global cache - count_t list_size = 1; - span_t* next = span->next_span; - span_t* last = span; - while (list_size < MIN_SPAN_CACHE_RELEASE) { - last = next; - next = next->next_span; - ++list_size; - } - next->data.list_size = span->data.list_size - list_size; - last->next_span = 0; //Terminate list - *cache = next; - _memory_global_cache_insert(span, list_size, page_count); -#if ENABLE_STATISTICS - heap->thread_to_global += list_size * page_count * RP_PAGE_SIZE; -#endif - } -#endif -#endif -} - //! Deallocate the given small/medium memory block from the given heap static void _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { //Check if span is the currently active span in order to operate //on the correct bookkeeping data + assert(SPAN_COUNT(span->flags) == 1); const count_t class_idx = span->size_class; size_class_t* size_class = _memory_size_class + class_idx; int is_active = (heap->active_span[class_idx] == span); span_block_t* block_data = is_active ? 
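Publishing the fresh heap into `_memory_heaps[id % HEAP_ARRAY_SIZE]` is a standard lock-free list push: link the node's next pointer first, then swing the bucket head with a CAS. A minimal sketch with C11 atomics and illustrative names:

```c
#include <stdatomic.h>

#define EX_HEAP_ARRAY_SIZE 79 /* mirrors HEAP_ARRAY_SIZE */

typedef struct ex_heap { struct ex_heap* next_heap; int id; } ex_heap;

static _Atomic(ex_heap*) ex_heaps[EX_HEAP_ARRAY_SIZE];

static void ex_publish_heap(ex_heap* heap) {
    _Atomic(ex_heap*)* bucket = &ex_heaps[heap->id % EX_HEAP_ARRAY_SIZE];
    ex_heap* next;
    do {
        next = atomic_load(bucket);
        heap->next_heap = next; /* link before publishing */
    } while (!atomic_compare_exchange_weak(bucket, &next, heap));
}
```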
- heap->active_block + class_idx : - &span->data.block; - -#if ENABLE_STATISTICS - heap->allocated -= size_class->size; - heap->requested -= *(size_t*)pointer_offset(p, size_class->size - sizeof(size_t)); -#endif + heap->active_block + class_idx : + &span->data.block; //Check if the span will become completely free if (block_data->free_count == ((count_t)size_class->block_count - 1)) { +#if ENABLE_THREAD_CACHE //Track counters - size_t span_class_idx = _span_class_from_page_count(size_class->page_count); - assert(heap->span_counter[span_class_idx].current_allocations > 0); - --heap->span_counter[span_class_idx].current_allocations; + assert(heap->span_counter[0].current_allocations > 0); + if (heap->span_counter[0].current_allocations) + --heap->span_counter[0].current_allocations; +#endif //If it was active, reset counter. Otherwise, if not active, remove from //partial free list if we had a previous free block (guard for classes with only 1 block) if (is_active) block_data->free_count = 0; else if (block_data->free_count > 0) - _memory_list_remove(&heap->size_cache[class_idx], span); + _memory_span_list_doublelink_remove(&heap->size_cache[class_idx], span); - //Add to span cache - _memory_heap_cache_insert(heap, span, size_class->page_count); + //Add to heap span cache + _memory_heap_cache_insert(heap, span); return; } //Check if first free block for this span (previously fully allocated) if (block_data->free_count == 0) { //add to free list and disable autolink - _memory_list_add(&heap->size_cache[class_idx], span); + _memory_span_list_doublelink_add(&heap->size_cache[class_idx], span); block_data->first_autolink = (uint16_t)size_class->block_count; } ++block_data->free_count; @@ -1031,57 +1277,26 @@ _memory_deallocate_to_heap(heap_t* heap, span_t* span, void* p) { //! 
Deallocate the given large memory block from the given heap static void _memory_deallocate_large_to_heap(heap_t* heap, span_t* span) { - //Check if aliased with 64KiB small/medium spans - if (span->size_class == SIZE_CLASS_COUNT) { - //Track counters - size_t span_class_idx = _span_class_from_page_count(SPAN_MAX_PAGE_COUNT); - --heap->span_counter[span_class_idx].current_allocations; - //Add to span cache - _memory_heap_cache_insert(heap, span, SPAN_MAX_PAGE_COUNT); - return; - } - //Decrease counter - size_t idx = span->size_class - SIZE_CLASS_COUNT; - span_counter_t* counter = heap->large_counter + idx; - assert(counter->current_allocations > 0); - --counter->current_allocations; - -#if MAX_SPAN_CACHE_DIVISOR == 0 - _memory_global_cache_large_insert(span, 1, idx + 1); -#else - //Insert into cache list - span_t** cache = heap->large_cache + idx; - span->next_span = *cache; - if (*cache) - span->data.list_size = (*cache)->data.list_size + 1; - else - span->data.list_size = 1; - *cache = span; -#if MAX_SPAN_CACHE_DIVISOR > 1 - //Check if cache exceeds limit - if ((span->data.list_size >= (MIN_SPAN_CACHE_RELEASE + MIN_SPAN_CACHE_SIZE)) && - (span->data.list_size > counter->cache_limit)) { - //Release to global cache - count_t list_size = 1; - span_t* next = span->next_span; - span_t* last = span; - while (list_size < MIN_SPAN_CACHE_RELEASE) { - last = next; - next = next->next_span; - ++list_size; - } - assert(next->next_span); - next->data.list_size = span->data.list_size - list_size; - last->next_span = 0; //Terminate list - *cache = next; - _memory_global_cache_large_insert(span, list_size, idx + 1); -#if ENABLE_STATISTICS - heap->thread_to_global += list_size * (idx + 1) * SPAN_MAX_SIZE; + size_t idx = (size_t)span->size_class - SIZE_CLASS_COUNT; + size_t span_count = idx + 1; + assert(SPAN_COUNT(span->flags) == span_count); + assert(span->size_class >= SIZE_CLASS_COUNT); + assert(idx < LARGE_CLASS_COUNT); +#if ENABLE_THREAD_CACHE + assert(heap->span_counter[idx].current_allocations > 0); + if (heap->span_counter[idx].current_allocations) + --heap->span_counter[idx].current_allocations; #endif + if (!heap->spans_reserved && (span_count > 1)) { + //Split the span and store remainder as reserved spans + //Must split to a dummy 1-span master since we cannot have master spans as reserved + _memory_set_span_remainder_as_reserved(heap, span, 1); + span_count = 1; } -#endif -#endif + + //Insert into cache list + _memory_heap_cache_insert(heap, span); } //! Process pending deferred cross-thread deallocations @@ -1090,16 +1305,14 @@ _memory_deallocate_deferred(heap_t* heap, size_t size_class) { //Grab the current list of deferred deallocations atomic_thread_fence_acquire(); void* p = atomic_load_ptr(&heap->defer_deallocate); - if (!p) - return 0; - if (!atomic_cas_ptr(&heap->defer_deallocate, 0, p)) + if (!p || !atomic_cas_ptr(&heap->defer_deallocate, 0, p)) return 0; //Keep track if we deallocate in the given size class int got_class = 0; do { void* next = *(void**)p; //Get span and check which type of block - span_t* span = (span_t*)(void*)((uintptr_t)p & SPAN_MASK); + span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); if (span->size_class < SIZE_CLASS_COUNT) { //Small/medium block got_class |= (span->size_class == size_class); @@ -1119,8 +1332,10 @@ _memory_deallocate_deferred(heap_t* heap, size_t size_class) { //! 
Defer deallocation of the given block to the given heap
 static void
 _memory_deallocate_defer(int32_t heap_id, void* p) {
-	//Get the heap and link in pointer in list of deferred opeations
+	//Get the heap and link in pointer in list of deferred operations
 	heap_t* heap = _memory_heap_lookup(heap_id);
+	if (!heap)
+		return;
 	void* last_ptr;
 	do {
 		last_ptr = atomic_load_ptr(&heap->defer_deallocate);
@@ -1131,20 +1346,22 @@ _memory_deallocate_defer(int32_t heap_id, void* p) {
 //! Allocate a block of the given size
 static void*
 _memory_allocate(size_t size) {
-	if (size <= MEDIUM_SIZE_LIMIT)
-		return _memory_allocate_from_heap(_memory_thread_heap, size);
+	if (size <= _memory_medium_size_limit)
+		return _memory_allocate_from_heap(get_thread_heap(), size);
 	else if (size <= LARGE_SIZE_LIMIT)
-		return _memory_allocate_large_from_heap(_memory_thread_heap, size);
+		return _memory_allocate_large_from_heap(get_thread_heap(), size);
 
 	//Oversized, allocate pages directly
 	size += SPAN_HEADER_SIZE;
-	size_t num_pages = size / RP_PAGE_SIZE;
-	if (size % RP_PAGE_SIZE)
+	size_t num_pages = size >> _memory_page_size_shift;
+	if (size & (_memory_page_size - 1))
 		++num_pages;
-	span_t* span = (span_t*)_memory_map(num_pages);
+	size_t align_offset = 0;
+	span_t* span = (span_t*)_memory_map(num_pages * _memory_page_size, &align_offset);
 	atomic_store32(&span->heap_id, 0);
 	//Store page count in next_span
 	span->next_span = (span_t*)((uintptr_t)num_pages);
+	span->data.list.align_offset = (uint16_t)align_offset;
 
 	return pointer_offset(span, SPAN_HEADER_SIZE);
 }
@@ -1156,9 +1373,9 @@ _memory_deallocate(void* p) {
 		return;
 
 	//Grab the span (always at start of span, using 64KiB alignment)
-	span_t* span = (span_t*)(void*)((uintptr_t)p & SPAN_MASK);
+	span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask);
 	int32_t heap_id = atomic_load32(&span->heap_id);
-	heap_t* heap = _memory_thread_heap;
+	heap_t* heap = get_thread_heap();
 	//Check if block belongs to this heap or if deallocation should be deferred
 	if (heap_id == heap->id) {
 		if (span->size_class < SIZE_CLASS_COUNT)
@@ -1172,7 +1389,7 @@ _memory_deallocate(void* p) {
 	else {
 		//Oversized allocation, page count is stored in next_span
 		size_t num_pages = (size_t)span->next_span;
-		_memory_unmap(span, num_pages);
+		_memory_unmap(span, num_pages * _memory_page_size, span->data.list.align_offset, 1);
 	}
 }
 
@@ -1180,8 +1397,8 @@
 static void*
 _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) {
 	if (p) {
-		//Grab the span (always at start of span, using 64KiB alignment)
-		span_t* span = (span_t*)(void*)((uintptr_t)p & SPAN_MASK);
+		//Grab the span using guaranteed span alignment
+		span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask);
 		int32_t heap_id = atomic_load32(&span->heap_id);
 		if (heap_id) {
 			if (span->size_class < SIZE_CLASS_COUNT) {
@@ -1195,35 +1412,35 @@ _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) {
 			else {
 				//Large block
 				size_t total_size = size + SPAN_HEADER_SIZE;
-				size_t num_spans = total_size / SPAN_MAX_SIZE;
-				if (total_size % SPAN_MAX_SIZE)
+				size_t num_spans = total_size >> _memory_span_size_shift;
+				if (total_size & (_memory_span_size - 1))
					++num_spans;
 				size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1;
 				if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2)))
 					return p; //Still fits and less than half of memory would be freed
 				if (!oldsize)
+					oldsize =
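Cross-thread frees land on the owning heap's `defer_deallocate` list, with each block's first word reused as the link; the owner later detaches the entire list in one atomic step and walks it locally. A sketch of both sides in C11 atomics; the real code steals with a load followed by a CAS against zero, for which a plain exchange is the one-shot equivalent:

```c
#include <stdatomic.h>
#include <stddef.h>

static _Atomic(void*) ex_defer_list;

/* Producer side: any thread can push a block it does not own. */
static void ex_defer_push(void* block) {
    void* last;
    do {
        last = atomic_load(&ex_defer_list);
        *(void**)block = last; /* next pointer lives in the block itself */
    } while (!atomic_compare_exchange_weak(&ex_defer_list, &last, block));
}

/* Consumer side: the owning thread detaches the whole list at once. */
static size_t ex_defer_drain(void (*release)(void*)) {
    void* p = atomic_exchange(&ex_defer_list, NULL);
    size_t count = 0;
    while (p) {
        void* next = *(void**)p; /* read link before releasing the block */
        release(p);
        p = next;
        ++count;
    }
    return count;
}
```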
(current_spans * _memory_span_size) - SPAN_HEADER_SIZE; } } else { //Oversized block size_t total_size = size + SPAN_HEADER_SIZE; - size_t num_pages = total_size / RP_PAGE_SIZE; - if (total_size % RP_PAGE_SIZE) + size_t num_pages = total_size >> _memory_page_size_shift; + if (total_size & (_memory_page_size - 1)) ++num_pages; //Page count is stored in next_span size_t current_pages = (size_t)span->next_span; if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) return p; //Still fits and less than half of memory would be freed if (!oldsize) - oldsize = (current_pages * (size_t)RP_PAGE_SIZE) - SPAN_HEADER_SIZE; + oldsize = (current_pages * _memory_page_size) - SPAN_HEADER_SIZE; } } //Size is greater than block size, need to allocate a new block and deallocate the old //Avoid hysteresis by overallocating if increase is small (below 37%) size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); - void* block = _memory_allocate(size > lower_bound ? size : lower_bound); + void* block = _memory_allocate((size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size)); if (p) { if (!(flags & RPMALLOC_NO_PRESERVE)) memcpy(block, p, oldsize < size ? oldsize : size); @@ -1236,51 +1453,39 @@ _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { //! Get the usable size of the given block static size_t _memory_usable_size(void* p) { - //Grab the span (always at start of span, using 64KiB alignment) - span_t* span = (span_t*)(void*)((uintptr_t)p & SPAN_MASK); + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask); int32_t heap_id = atomic_load32(&span->heap_id); if (heap_id) { - if (span->size_class < SIZE_CLASS_COUNT) { - //Small/medium block - size_class_t* size_class = _memory_size_class + span->size_class; - return size_class->size; - } + //Small/medium block + if (span->size_class < SIZE_CLASS_COUNT) + return _memory_size_class[span->size_class].size; //Large block size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; - return (current_spans * (size_t)SPAN_MAX_SIZE) - SPAN_HEADER_SIZE; + return (current_spans * _memory_span_size) - SPAN_HEADER_SIZE; } //Oversized block, page count is stored in next_span size_t current_pages = (size_t)span->next_span; - return (current_pages * (size_t)RP_PAGE_SIZE) - SPAN_HEADER_SIZE; + return (current_pages * _memory_page_size) - SPAN_HEADER_SIZE; } //! Adjust and optimize the size class properties for the given class static void _memory_adjust_size_class(size_t iclass) { - //Calculate how many pages are needed for 255 blocks size_t block_size = _memory_size_class[iclass].size; - size_t page_count = (block_size * 255) / RP_PAGE_SIZE; - //Cap to 16 pages (64KiB span granularity) - page_count = (page_count == 0) ? 1 : ((page_count > 16) ? 
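The `lower_bound` above is what the "below 37%" comment refers to: `oldsize + oldsize/4 + oldsize/8` is a 1.375x growth target, so a reallocation that grows only slightly is rounded up to that target instead of triggering a fresh copy on every small increase. A worked standalone version of the chosen-size logic:

```c
#include <assert.h>
#include <stddef.h>

static size_t ex_grow_target(size_t oldsize, size_t size) {
    size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3);
    if (size > lower_bound)
        return size;                 /* big jump: take the request as-is */
    return (size > oldsize) ? lower_bound : size; /* small growth: overallocate */
}

int main(void) {
    assert(ex_grow_target(1000, 1010) == 1375); /* +1% request -> 1.375x */
    assert(ex_grow_target(1000, 4000) == 4000); /* large request wins */
    assert(ex_grow_target(1000, 500) == 500);   /* shrink path untouched */
    return 0;
}
```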
16 : page_count); - //Merge page counts to span size class granularity - page_count = ((page_count + (SPAN_CLASS_GRANULARITY - 1)) / SPAN_CLASS_GRANULARITY) * SPAN_CLASS_GRANULARITY; - if (page_count > 16) - page_count = 16; - size_t block_count = ((page_count * RP_PAGE_SIZE) - SPAN_HEADER_SIZE) / block_size; - //Store the final configuration - _memory_size_class[iclass].page_count = (uint16_t)page_count; + size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; + _memory_size_class[iclass].block_count = (uint16_t)block_count; _memory_size_class[iclass].class_idx = (uint16_t)iclass; - + //Check if previous size classes can be merged size_t prevclass = iclass; while (prevclass > 0) { --prevclass; //A class can be merged if number of pages and number of blocks are equal - if ((_memory_size_class[prevclass].page_count == _memory_size_class[iclass].page_count) && - (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count)) { + if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count) { memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); } else { @@ -1296,11 +1501,11 @@ _memory_adjust_size_class(size_t iclass) { #else # include # include -# include # ifndef MAP_UNINITIALIZED # define MAP_UNINITIALIZED 0 # endif #endif +#include namespace tracy { @@ -1308,21 +1513,81 @@ namespace tracy //! Initialize the allocator and setup global data int rpmalloc_initialize(void) { -#ifdef PLATFORM_WINDOWS - SYSTEM_INFO system_info; - memset(&system_info, 0, sizeof(system_info)); - GetSystemInfo(&system_info); - if (system_info.dwAllocationGranularity < SPAN_ADDRESS_GRANULARITY) - return -1; + memset(&_memory_config, 0, sizeof(rpmalloc_config_t)); + return rpmalloc_initialize_config(0); +} + +int +rpmalloc_initialize_config(const rpmalloc_config_t* config) { + if (config) + memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); + + int default_mapper = 0; + if (!_memory_config.memory_map || !_memory_config.memory_unmap) { + default_mapper = 1; + _memory_config.memory_map = _memory_map_os; + _memory_config.memory_unmap = _memory_unmap_os; + } + + _memory_page_size = _memory_config.page_size; + if (!_memory_page_size) { +#if PLATFORM_WINDOWS + SYSTEM_INFO system_info; + memset(&system_info, 0, sizeof(system_info)); + GetSystemInfo(&system_info); + _memory_page_size = system_info.dwPageSize; + _memory_map_granularity = system_info.dwAllocationGranularity; #else -#if ARCH_64BIT - atomic_store64(&_memory_addr, 0x1000000000ULL); -#else - atomic_store64(&_memory_addr, 0x1000000ULL); + _memory_page_size = (size_t)sysconf(_SC_PAGESIZE); + _memory_map_granularity = _memory_page_size; #endif + } + + if (_memory_page_size < 512) + _memory_page_size = 512; + if (_memory_page_size > (16 * 1024)) + _memory_page_size = (16 * 1024); + + _memory_page_size_shift = 0; + size_t page_size_bit = _memory_page_size; + while (page_size_bit != 1) { + ++_memory_page_size_shift; + page_size_bit >>= 1; + } + _memory_page_size = ((size_t)1 << _memory_page_size_shift); + _memory_page_mask = ~(uintptr_t)(_memory_page_size - 1); + + size_t span_size = _memory_config.span_size; + if (!span_size) + span_size = (64 * 1024); + if (span_size > (256 * 1024)) + span_size = (256 * 1024); + _memory_span_size = 4096; + _memory_span_size_shift = 12; + while ((_memory_span_size < span_size) || (_memory_span_size < _memory_page_size)) { + _memory_span_size <<= 1; + ++_memory_span_size_shift; + } + _memory_span_mask = 
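With the span size fixed at initialization, a size class is characterized entirely by how many blocks fit in one span, and the merge loop above folds a class into its larger neighbor whenever those counts match. Illustrative arithmetic, again assuming a 64KiB span and a 32-byte header:

```c
#include <assert.h>
#include <stddef.h>

#define EX_SPAN_SIZE ((size_t)64 * 1024)
#define EX_HEADER    ((size_t)32)

static size_t ex_blocks_per_span(size_t block_size) {
    return (EX_SPAN_SIZE - EX_HEADER) / block_size;
}

int main(void) {
    /* Both class sizes fit exactly 3 blocks per span, so the smaller class
       can adopt the larger one's configuration and serve both ranges. */
    assert(ex_blocks_per_span(16896) == 3);
    assert(ex_blocks_per_span(17408) == 3);
    return 0;
}
```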
~(uintptr_t)(_memory_span_size - 1); + + _memory_config.page_size = _memory_page_size; + _memory_config.span_size = _memory_span_size; + + if (!_memory_config.span_map_count) + _memory_config.span_map_count = DEFAULT_SPAN_MAP_COUNT; + if (_memory_config.span_size * _memory_config.span_map_count < _memory_config.page_size) + _memory_config.span_map_count = (_memory_config.page_size / _memory_config.span_size); + if (_memory_config.span_map_count > 128) + _memory_config.span_map_count = 128; + +#if defined(__APPLE__) && ENABLE_PRELOAD + if (pthread_key_create(&_memory_thread_heap, 0)) + return -1; #endif atomic_store32(&_memory_heap_id, 0); + atomic_store32(&_memory_orphan_counter, 0); + atomic_store32(&_memory_active_heaps, 0); //Setup all small and medium size classes size_t iclass; @@ -1331,14 +1596,18 @@ rpmalloc_initialize(void) { _memory_size_class[iclass].size = (uint16_t)size; _memory_adjust_size_class(iclass); } + + _memory_medium_size_limit = _memory_span_size - SPAN_HEADER_SIZE; + if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) + _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); - if (size > MEDIUM_SIZE_LIMIT) - size = MEDIUM_SIZE_LIMIT; + if (size > _memory_medium_size_limit) + size = _memory_medium_size_limit; _memory_size_class[SMALL_CLASS_COUNT + iclass].size = (uint16_t)size; _memory_adjust_size_class(SMALL_CLASS_COUNT + iclass); } - + //Initialize this thread rpmalloc_thread_initialize(); return 0; @@ -1349,271 +1618,290 @@ void rpmalloc_finalize(void) { atomic_thread_fence_acquire(); + rpmalloc_thread_finalize(); + //If you hit this assert, you still have active threads or forgot to finalize some thread(s) + assert(atomic_load32(&_memory_active_heaps) == 0); + //Free all thread caches for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); while (heap) { _memory_deallocate_deferred(heap, 0); - for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { - const size_t page_count = (iclass + 1) * SPAN_CLASS_GRANULARITY; - span_t* span = heap->span_cache[iclass]; - unsigned int span_count = span ? 
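Initialization normalizes the page size by counting shifts: the loop computes `floor(log2(page_size))`, then rebuilds the size as an exact power of two so that the derived mask is well defined. A condensed, runnable sketch of that normalization:

```c
#include <assert.h>
#include <stdint.h>
#include <stddef.h>

int main(void) {
    size_t page_size = 4096; /* e.g. from sysconf(_SC_PAGESIZE) or GetSystemInfo */
    if (page_size < 512)
        page_size = 512;
    if (page_size > (16 * 1024))
        page_size = (16 * 1024);

    size_t shift = 0;
    size_t bit = page_size;
    while (bit != 1) { /* floor(log2(page_size)) */
        ++shift;
        bit >>= 1;
    }
    page_size = (size_t)1 << shift; /* round down to a power of two */
    uintptr_t page_mask = ~(uintptr_t)(page_size - 1);

    assert(shift == 12 && page_size == 4096);
    assert((0x12345678u & ~page_mask) == 0x678); /* low bits address within page */
    return 0;
}
```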
span->data.list_size : 0; - for (unsigned int ispan = 0; ispan < span_count; ++ispan) { - span_t* next_span = span->next_span; - _memory_unmap(span, page_count); - span = next_span; + //Free span caches (other thread might have deferred after the thread using this heap finalized) +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) + _memory_unmap_span_list(0, heap->span_cache[iclass]); + } +#endif + heap = heap->next_heap; + } + } + +#if ENABLE_GLOBAL_CACHE + //Free global caches + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + _memory_cache_finalize(&_memory_span_cache[iclass]); +#endif + + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + atomic_store_ptr(&_memory_heaps[list_idx], 0); + while (heap) { + if (heap->spans_reserved) { + span_t* span = heap->span_reserve; + span_t* master = heap->span_reserve_master; + uint32_t remains = SPAN_REMAINS(master->flags); + + assert(master != span); + assert(remains >= heap->spans_reserved); + _memory_unmap(span, heap->spans_reserved * _memory_span_size, 0, 0); + _memory_statistics_sub(&_reserved_spans, heap->spans_reserved); + remains = ((uint32_t)heap->spans_reserved >= remains) ? 0 : (remains - (uint32_t)heap->spans_reserved); + if (!remains) { + uint32_t master_span_count = SPAN_COUNT(master->flags); + _memory_statistics_sub(&_reserved_spans, master_span_count); + _memory_unmap(master, master_span_count * _memory_span_size, master->data.list.align_offset, 1); + } + else { + SPAN_SET_REMAINS(master->flags, remains); } } - //Free large spans - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - const size_t span_count = iclass + 1; - span_t* span = heap->large_cache[iclass]; - while (span) { - span_t* next_span = span->next_span; - _memory_unmap(span, span_count * SPAN_MAX_PAGE_COUNT); - span = next_span; - } - } + _memory_unmap_deferred(heap, 0); heap_t* next_heap = heap->next_heap; - _memory_unmap(heap, 2); + _memory_unmap(heap, (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, heap->align_offset, 1); heap = next_heap; } - - atomic_store_ptr(&_memory_heaps[list_idx], 0); } atomic_store_ptr(&_memory_orphan_heaps, 0); - - //Free global caches - for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { - void* span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]); - size_t cache_count = (uintptr_t)span_ptr & ~SPAN_MASK; - span_t* span = (span_t*)((void*)((uintptr_t)span_ptr & SPAN_MASK)); - while (cache_count) { - span_t* skip_span = span->prev_span; - unsigned int span_count = span->data.list_size; - for (unsigned int ispan = 0; ispan < span_count; ++ispan) { - span_t* next_span = span->next_span; - _memory_unmap(span, (iclass + 1) * SPAN_CLASS_GRANULARITY); - span = next_span; - } - span = skip_span; - cache_count -= span_count; - } - atomic_store_ptr(&_memory_span_cache[iclass], 0); - } - - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - void* span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]); - size_t cache_count = (uintptr_t)span_ptr & ~SPAN_MASK; - span_t* span = (span_t*)((void*)((uintptr_t)span_ptr & SPAN_MASK)); - while (cache_count) { - span_t* skip_span = span->prev_span; - unsigned int span_count = span->data.list_size; - for (unsigned int ispan = 0; ispan < span_count; ++ispan) { - span_t* next_span = span->next_span; - _memory_unmap(span, (iclass + 1) * SPAN_MAX_PAGE_COUNT); - span = next_span; 
- } - span = skip_span; - cache_count -= span_count; - } - atomic_store_ptr(&_memory_large_cache[iclass], 0); - } - atomic_thread_fence_release(); + +#if ENABLE_STATISTICS + //If you hit these asserts you probably have memory leaks or double frees in your code + assert(!atomic_load32(&_mapped_pages)); + assert(!atomic_load32(&_reserved_spans)); +#endif + +#if defined(__APPLE__) && ENABLE_PRELOAD + pthread_key_delete(_memory_thread_heap); +#endif } //! Initialize thread, assign heap void rpmalloc_thread_initialize(void) { - if (!_memory_thread_heap) { - heap_t* heap = _memory_allocate_heap(); + if (!get_thread_heap()) { + atomic_incr32(&_memory_active_heaps); + heap_t* heap = _memory_allocate_heap(); #if ENABLE_STATISTICS heap->thread_to_global = 0; heap->global_to_thread = 0; #endif - _memory_thread_heap = heap; - atomic_incr32(&_memory_active_heaps); + set_thread_heap(heap); } } //! Finalize thread, orphan heap void rpmalloc_thread_finalize(void) { - heap_t* heap = _memory_thread_heap; + heap_t* heap = get_thread_heap(); if (!heap) return; - atomic_add32(&_memory_active_heaps, -1); - _memory_deallocate_deferred(heap, 0); + _memory_unmap_deferred(heap, 0); //Release thread cache spans back to global cache - for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) { - const size_t page_count = (iclass + 1) * SPAN_CLASS_GRANULARITY; +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { span_t* span = heap->span_cache[iclass]; +#if ENABLE_GLOBAL_CACHE + const size_t span_count = iclass + 1; while (span) { - if (span->data.list_size > MIN_SPAN_CACHE_RELEASE) { - count_t list_size = 1; - span_t* next = span->next_span; - span_t* last = span; - while (list_size < MIN_SPAN_CACHE_RELEASE) { - last = next; - next = next->next_span; - ++list_size; - } - last->next_span = 0; //Terminate list - next->data.list_size = span->data.list_size - list_size; - _memory_global_cache_insert(span, list_size, page_count); - span = next; - } - else { - _memory_global_cache_insert(span, span->data.list_size, page_count); - span = 0; - } + assert(SPAN_COUNT(span->flags) == span_count); + span_t* next = _memory_span_list_split(span, !iclass ? 
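On thread exit the remaining cached spans are handed to the global cache in bounded batches rather than as one long list. Below is a sketch of a split with the semantics the `_memory_span_list_split` call appears to rely on; the helper and its field layout are assumptions inferred from the call sites, not taken from the source. It keeps the first `count` spans in the passed list and returns the detached remainder:

```c
#include <stddef.h>

typedef struct ex_span { struct ex_span* next; size_t list_size; } ex_span;

static ex_span* ex_span_list_split(ex_span* list, size_t count) {
    ex_span* next = NULL;
    if (list->list_size > count) {
        size_t walked = 1;
        ex_span* last = list;
        while (walked < count) {
            last = last->next;
            ++walked;
        }
        next = last->next;
        last->next = NULL; /* terminate the sublist being released */
        next->list_size = list->list_size - count;
        list->list_size = count;
    }
    return next; /* NULL when the whole list fits in one batch */
}
```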
MIN_SPAN_CACHE_RELEASE : (MIN_LARGE_SPAN_CACHE_RELEASE / span_count)); + _memory_global_cache_insert(0, span); + span = next; } +#else + if (span) + _memory_unmap_span_list(heap, span); +#endif heap->span_cache[iclass] = 0; } - - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - const size_t span_count = iclass + 1; - span_t* span = heap->large_cache[iclass]; - while (span) { - if (span->data.list_size > MIN_SPAN_CACHE_RELEASE) { - count_t list_size = 1; - span_t* next = span->next_span; - span_t* last = span; - while (list_size < MIN_SPAN_CACHE_RELEASE) { - last = next; - next = next->next_span; - ++list_size; - } - last->next_span = 0; //Terminate list - next->data.list_size = span->data.list_size - list_size; - _memory_global_cache_large_insert(span, list_size, span_count); - span = next; - } - else { - _memory_global_cache_large_insert(span, span->data.list_size, span_count); - span = 0; - } - } - heap->large_cache[iclass] = 0; - } - - //Reset allocation counters - memset(heap->span_counter, 0, sizeof(heap->span_counter)); - memset(heap->large_counter, 0, sizeof(heap->large_counter)); -#if ENABLE_STATISTICS - heap->requested = 0; - heap->allocated = 0; - heap->thread_to_global = 0; - heap->global_to_thread = 0; #endif //Orphan the heap + void* raw_heap; + uintptr_t orphan_counter; heap_t* last_heap; do { last_heap = (heap_t*)atomic_load_ptr(&_memory_orphan_heaps); - heap->next_orphan = last_heap; + heap->next_orphan = (heap_t*)(void*)((uintptr_t)last_heap & _memory_page_mask); + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + raw_heap = (void*)((uintptr_t)heap | (orphan_counter & ~_memory_page_mask)); } - while (!atomic_cas_ptr(&_memory_orphan_heaps, heap, last_heap)); - - _memory_thread_heap = 0; + while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); + + set_thread_heap(0); + atomic_add32(&_memory_active_heaps, -1); } int rpmalloc_is_thread_initialized(void) { - return (_memory_thread_heap != 0) ? 1 : 0; + return (get_thread_heap() != 0) ? 1 : 0; +} + +const rpmalloc_config_t* +rpmalloc_config(void) { + return &_memory_config; } //! Map new pages to virtual memory static void* -_memory_map(size_t page_count) { - size_t total_size = page_count * RP_PAGE_SIZE; - void* pages_ptr = 0; +_memory_map_os(size_t size, size_t* offset) { + //Either size is a heap (a single page) or a (multiple) span - we only need to align spans + size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? 
_memory_span_size : 0;
 
-#if ENABLE_STATISTICS
-	atomic_add32(&_mapped_pages, (int32_t)page_count);
-	atomic_add32(&_mapped_total, (int32_t)page_count);
-#endif
-
-#ifdef PLATFORM_WINDOWS
-	pages_ptr = VirtualAlloc(0, total_size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
-#else
-	//mmap lacks a way to set 64KiB address granularity, implement it locally
-	intptr_t incr = (intptr_t)total_size / (intptr_t)SPAN_ADDRESS_GRANULARITY;
-	if (total_size % SPAN_ADDRESS_GRANULARITY)
-		++incr;
-	do {
-		void* base_addr = (void*)(uintptr_t)atomic_exchange_and_add64(&_memory_addr,
-			(incr * (intptr_t)SPAN_ADDRESS_GRANULARITY));
-		pages_ptr = mmap(base_addr, total_size, PROT_READ | PROT_WRITE,
-			MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0);
-		if (pages_ptr != MAP_FAILED) {
-			if (pages_ptr != base_addr) {
-				void* new_base = (void*)((uintptr_t)pages_ptr & SPAN_MASK);
-				atomic_store64(&_memory_addr, (int64_t)((uintptr_t)new_base) +
-					((incr + 1) * (intptr_t)SPAN_ADDRESS_GRANULARITY));
-				atomic_thread_fence_release();
-			}
-			if (!((uintptr_t)pages_ptr & ~SPAN_MASK))
-				break;
-			munmap(pages_ptr, total_size);
-		}
+#if PLATFORM_WINDOWS
+	//Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed"
+	void* ptr = VirtualAlloc(0, size + padding, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+	if (!ptr) {
+		assert("Failed to map virtual memory block" == 0);
+		return 0;
+	}
+#else
+	void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0);
+	if ((ptr == MAP_FAILED) || !ptr) {
+		assert("Failed to map virtual memory block" == 0);
+		return 0;
 	}
-	while (1);
 #endif
 
-	return pages_ptr;
+	if (padding) {
+		size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask);
+#if PLATFORM_POSIX
+		//Unmap the last unused pages, for Windows this is done with the final VirtualFree with MEM_RELEASE call
+		size_t remains = padding - final_padding;
+		if (remains)
+			munmap(pointer_offset(ptr, final_padding + size), remains);
+#endif
+		ptr = pointer_offset(ptr, final_padding);
+		assert(final_padding <= _memory_span_size);
+		assert(!(final_padding & 7));
+		assert(!((uintptr_t)ptr & ~_memory_span_mask));
+		*offset = final_padding >> 3;
+		assert(*offset < 65536);
+	}
+
+	return ptr;
 }
 
 //! Unmap pages from virtual memory
 static void
-_memory_unmap(void* ptr, size_t page_count) {
-#if ENABLE_STATISTICS
-	atomic_add32(&_mapped_pages, -(int32_t)page_count);
-	atomic_add32(&_unmapped_total, (int32_t)page_count);
+_memory_unmap_os(void* address, size_t size, size_t offset, int release) {
+	assert(release || (offset == 0));
+	if (release && offset) {
+		offset <<= 3;
+#if PLATFORM_POSIX
+		size += offset;
 #endif
-
-#ifdef PLATFORM_WINDOWS
-	VirtualFree(ptr, 0, MEM_RELEASE);
+		address = pointer_offset(address, -(int32_t)offset);
+	}
+#if PLATFORM_WINDOWS
+	if (!VirtualFree(address, release ? 0 : size, release ? MEM_RELEASE : MEM_DECOMMIT)) {
+		DWORD err = GetLastError();
+		assert("Failed to unmap virtual memory block" == 0);
+	}
 #else
-	munmap(ptr, RP_PAGE_SIZE * page_count);
+	MEMORY_UNUSED(release);
+	if (munmap(address, size)) {
+		assert("Failed to unmap virtual memory block" == 0);
+	}
 #endif
 }
 
-static FORCEINLINE int
-atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) {
-#ifdef _MSC_VER
-# if ARCH_64BIT
-	return (_InterlockedCompareExchange64((volatile long long*)&dst->nonatomic,
-		(long long)val, (long long)ref) == (long long)ref) ? 1 : 0;
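The alignment bookkeeping in `_memory_map_os`/`_memory_unmap_os` depends on the padding clearing its low three bits: it is stored shifted right by 3 so any padding below one span fits a 16-bit field, then shifted back on unmap. A round-trip sketch of that encoding (the padding value is an arbitrary example):

```c
#include <assert.h>
#include <stdint.h>
#include <stddef.h>

int main(void) {
    size_t final_padding = 61440; /* some multiple of the page size */
    assert(!(final_padding & 7)); /* must survive the >>3 encoding */

    uint16_t stored = (uint16_t)(final_padding >> 3); /* < 65536 after the shift */
    assert((size_t)stored == (final_padding >> 3));

    size_t decoded = (size_t)stored << 3; /* what the unmap path reconstructs */
    assert(decoded == final_padding);
    return 0;
}
```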
-# else
-	return (_InterlockedCompareExchange((volatile long*)&dst->nonatomic,
-		(long)val, (long)ref) == (long)ref) ? 1 : 0;
-# endif
-#else
-	return __sync_bool_compare_and_swap(&dst->nonatomic, ref, val);
-#endif
-}
-
-//! Yield the thread remaining timeslice
+#if ENABLE_GUARDS
 static void
-thread_yield(void) {
-#ifdef PLATFORM_WINDOWS
-	YieldProcessor();
-#else
-	sched_yield();
-#endif
+_memory_guard_validate(void* p) {
+	if (!p)
+		return;
+	void* block_start;
+	size_t block_size = _memory_usable_size(p);
+	span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask);
+	int32_t heap_id = atomic_load32(&span->heap_id);
+	if (heap_id) {
+		if (span->size_class < SIZE_CLASS_COUNT) {
+			void* span_blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
+			size_class_t* size_class = _memory_size_class + span->size_class;
+			count_t block_offset = (count_t)pointer_diff(p, span_blocks_start);
+			count_t block_idx = block_offset / (count_t)size_class->size;
+			block_start = pointer_offset(span_blocks_start, block_idx * size_class->size);
+		}
+		else {
+			block_start = pointer_offset(span, SPAN_HEADER_SIZE);
+		}
+	}
+	else {
+		block_start = pointer_offset(span, SPAN_HEADER_SIZE);
+	}
+	uint32_t* deadzone = (uint32_t*)block_start;
+	//If these asserts fire, you have written to memory before the block start
+	for (int i = 0; i < 8; ++i) {
+		if (deadzone[i] != MAGIC_GUARD) {
+			if (_memory_config.memory_overwrite)
+				_memory_config.memory_overwrite(p);
+			else
+				assert("Memory overwrite before block start" == 0);
+			return;
+		}
+		deadzone[i] = 0;
+	}
+	deadzone = (uint32_t*)pointer_offset(block_start, block_size - 32);
+	//If these asserts fire, you have written to memory after the block end
+	for (int i = 0; i < 8; ++i) {
+		if (deadzone[i] != MAGIC_GUARD) {
+			if (_memory_config.memory_overwrite)
+				_memory_config.memory_overwrite(p);
+			else
+				assert("Memory overwrite after block end" == 0);
+			return;
+		}
+		deadzone[i] = 0;
+	}
 }
+#else
+#define _memory_guard_validate(block)
+#endif
+
+#if ENABLE_GUARDS
+static void
+_memory_guard_block(void* block) {
+	if (block) {
+		size_t block_size = _memory_usable_size(block);
+		uint32_t* deadzone = (uint32_t*)block;
+		deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] =
+			deadzone[4] = deadzone[5] = deadzone[6] = deadzone[7] = MAGIC_GUARD;
+		deadzone = (uint32_t*)pointer_offset(block, block_size - 32);
+		deadzone[0] = deadzone[1] = deadzone[2] = deadzone[3] =
+			deadzone[4] = deadzone[5] = deadzone[6] = deadzone[7] = MAGIC_GUARD;
+	}
+}
+#define _memory_guard_pre_alloc(size) size += 64
+#define _memory_guard_pre_realloc(block, size) block = pointer_offset(block, -32); size += 64
+#define _memory_guard_post_alloc(block, size) _memory_guard_block(block); block = pointer_offset(block, 32); size -= 64
+#else
+#define _memory_guard_pre_alloc(size)
+#define _memory_guard_pre_realloc(block, size)
+#define _memory_guard_post_alloc(block, size)
+#endif
 
 // Extern interface
 
-void*
+RPMALLOC_RESTRICT void*
 rpmalloc(size_t size) {
 #if ENABLE_VALIDATE_ARGS
 	if (size >= MAX_ALLOC_SIZE) {
@@ -1621,19 +1909,23 @@ rpmalloc(size_t size) {
 		return 0;
 	}
 #endif
-	return _memory_allocate(size);
+	_memory_guard_pre_alloc(size);
+	void* block = _memory_allocate(size);
+	_memory_guard_post_alloc(block, size);
+	return block;
 }
 
 void
 rpfree(void* ptr) {
+	_memory_guard_validate(ptr);
 	_memory_deallocate(ptr);
 }
 
-void*
+RPMALLOC_RESTRICT void*
 rpcalloc(size_t num, size_t size) {
 	size_t total;
 #if ENABLE_VALIDATE_ARGS
-#ifdef PLATFORM_WINDOWS
+#if PLATFORM_WINDOWS
 	int err = SizeTMult(num, size, &total);
 	if ((err != S_OK)
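With `ENABLE_GUARDS`, each block is bracketed by 32-byte dead zones filled with `MAGIC_GUARD`: allocations grow by 64 bytes, the user pointer is advanced past the leading zone, and both zones are checked on free. A self-contained sketch of the same layout on top of plain `malloc`; all constants and helper names are illustrative:

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

#define EX_MAGIC_GUARD 0xDEADBAADu

/* Wrap a raw allocation with leading/trailing guard words, mirroring the
   pre_alloc/post_alloc macros: +64 bytes, user pointer offset by +32. */
static void* ex_guarded_alloc(size_t size) {
    uint32_t* raw = (uint32_t*)malloc(size + 64);
    if (!raw)
        return NULL;
    for (int i = 0; i < 8; ++i) {
        raw[i] = EX_MAGIC_GUARD;                   /* zone before the block */
        raw[(size + 32) / 4 + i] = EX_MAGIC_GUARD; /* zone after the block */
    }
    return raw + 8; /* user sees only the inner `size` bytes */
}

static int ex_guards_intact(void* block, size_t size) {
    uint32_t* lead = (uint32_t*)block - 8;
    uint32_t* tail = (uint32_t*)((char*)block + size);
    for (int i = 0; i < 8; ++i)
        if (lead[i] != EX_MAGIC_GUARD || tail[i] != EX_MAGIC_GUARD)
            return 0;
    return 1;
}

int main(void) {
    size_t size = 64; /* 4-byte aligned so the uint32_t view is valid */
    void* p = ex_guarded_alloc(size);
    memset(p, 0xAB, size); /* in-bounds writes leave the zones alone */
    assert(ex_guards_intact(p, size));
    free((uint32_t*)p - 8);
    return 0;
}
```

A one-byte overrun (say `memset(p, 0, size + 1)`) would clobber the first trailing guard word and fail the check, which is the condition the allocator routes into the `memory_overwrite` callback.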
|| (total >= MAX_ALLOC_SIZE)) { errno = EINVAL; @@ -1649,9 +1941,11 @@ rpcalloc(size_t num, size_t size) { #else total = num * size; #endif - void* ptr = _memory_allocate(total); - memset(ptr, 0, total); - return ptr; + _memory_guard_pre_alloc(total); + void* block = _memory_allocate(total); + _memory_guard_post_alloc(block, total); + memset(block, 0, total); + return block; } void* @@ -1662,30 +1956,45 @@ rprealloc(void* ptr, size_t size) { return ptr; } #endif - return _memory_reallocate(ptr, size, 0, 0); + _memory_guard_validate(ptr); + _memory_guard_pre_realloc(ptr, size); + void* block = _memory_reallocate(ptr, size, 0, 0); + _memory_guard_post_alloc(block, size); + return block; } void* rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags) { #if ENABLE_VALIDATE_ARGS - if (size + alignment < size) { + if ((size + alignment < size) || (alignment > _memory_page_size)) { errno = EINVAL; return 0; } #endif - //TODO: If alignment > 16, we need to copy to new aligned position - (void)sizeof(alignment); - return _memory_reallocate(ptr, size, oldsize, flags); + void* block; + if (alignment > 32) { + block = rpaligned_alloc(alignment, size); + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, ptr, oldsize < size ? oldsize : size); + rpfree(ptr); + } + else { + _memory_guard_validate(ptr); + _memory_guard_pre_realloc(ptr, size); + block = _memory_reallocate(ptr, size, oldsize, flags); + _memory_guard_post_alloc(block, size); + } + return block; } -void* +RPMALLOC_RESTRICT void* rpaligned_alloc(size_t alignment, size_t size) { - if (alignment <= 16) + if (alignment <= 32) return rpmalloc(size); #if ENABLE_VALIDATE_ARGS - if (size + alignment < size) { + if ((size + alignment < size) || (alignment > _memory_page_size)) { errno = EINVAL; return 0; } @@ -1697,7 +2006,7 @@ rpaligned_alloc(size_t alignment, size_t size) { return ptr; } -void* +RPMALLOC_RESTRICT void* rpmemalign(size_t alignment, size_t size) { return rpaligned_alloc(alignment, size); } @@ -1713,26 +2022,31 @@ rpposix_memalign(void **memptr, size_t alignment, size_t size) { size_t rpmalloc_usable_size(void* ptr) { - return ptr ? 
_memory_usable_size(ptr) : 0;
+	size_t size = 0;
+	if (ptr) {
+		size = _memory_usable_size(ptr);
+#if ENABLE_GUARDS
+		size -= 64;
+#endif
+	}
+	return size;
 }
 
 void
 rpmalloc_thread_collect(void) {
-	_memory_deallocate_deferred(_memory_thread_heap, 0);
+	heap_t* heap = get_thread_heap();
+	_memory_unmap_deferred(heap, 0);
+	_memory_deallocate_deferred(heap, 0);
 }
 
 void
 rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) {
 	memset(stats, 0, sizeof(rpmalloc_thread_statistics_t));
-	heap_t* heap = _memory_thread_heap;
-#if ENABLE_STATISTICS
-	stats->allocated = heap->allocated;
-	stats->requested = heap->requested;
-#endif
+	heap_t* heap = get_thread_heap();
 	void* p = atomic_load_ptr(&heap->defer_deallocate);
 	while (p) {
 		void* next = *(void**)p;
-		span_t* span = (span_t*)(void*)((uintptr_t)p & SPAN_MASK);
+		span_t* span = (span_t*)(void*)((uintptr_t)p & _memory_span_mask);
 		stats->deferred += _memory_size_class[span->size_class].size;
 		p = next;
 	}
@@ -1748,40 +2062,27 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) {
 		}
 	}
 
-	for (size_t isize = 0; isize < SPAN_CLASS_COUNT; ++isize) {
-		if (heap->span_cache[isize])
-			stats->spancache = (size_t)heap->span_cache[isize]->data.list_size * (isize + 1) * SPAN_CLASS_GRANULARITY * RP_PAGE_SIZE;
+#if ENABLE_THREAD_CACHE
+	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
+		if (heap->span_cache[iclass])
+			stats->spancache = (size_t)heap->span_cache[iclass]->data.list.size * (iclass + 1) * _memory_span_size;
 	}
+#endif
 }
 
 void
 rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) {
 	memset(stats, 0, sizeof(rpmalloc_global_statistics_t));
 #if ENABLE_STATISTICS
-	stats->mapped = (size_t)atomic_load32(&_mapped_pages) * RP_PAGE_SIZE;
-	stats->mapped_total = (size_t)atomic_load32(&_mapped_total) * RP_PAGE_SIZE;
-	stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * RP_PAGE_SIZE;
+	stats->mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size;
+	stats->mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size;
+	stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size;
 #endif
-	for (size_t iclass = 0; iclass < SPAN_CLASS_COUNT; ++iclass) {
-		void* global_span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]);
-		while (global_span_ptr == SPAN_LIST_LOCK_TOKEN) {
-			thread_yield();
-			global_span_ptr = atomic_load_ptr(&_memory_span_cache[iclass]);
-		}
-		uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~SPAN_MASK;
-		size_t list_bytes = global_span_count * (iclass + 1) * SPAN_CLASS_GRANULARITY * RP_PAGE_SIZE;
-		stats->cached += list_bytes;
-	}
+#if ENABLE_GLOBAL_CACHE
 	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-		void* global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]);
-		while (global_span_ptr == SPAN_LIST_LOCK_TOKEN) {
-			thread_yield();
-			global_span_ptr = atomic_load_ptr(&_memory_large_cache[iclass]);
-		}
-		uintptr_t global_span_count = (uintptr_t)global_span_ptr & ~SPAN_MASK;
-		size_t list_bytes = global_span_count * (iclass + 1) * SPAN_MAX_PAGE_COUNT * RP_PAGE_SIZE;
-		stats->cached_large += list_bytes;
+		stats->cached += (size_t)atomic_load32(&_memory_span_cache[iclass].size) * (iclass + 1) * _memory_span_size;
 	}
+#endif
 }
 
 }
 
diff --git a/client/tracy_rpmalloc.hpp b/client/tracy_rpmalloc.hpp
index daf7e01b..b6687e49 100644
--- a/client/tracy_rpmalloc.hpp
+++ b/client/tracy_rpmalloc.hpp
@@ -18,13 +18,16 @@ namespace tracy
 #if defined(__clang__) || defined(__GNUC__)
 # define RPMALLOC_ATTRIBUTE __attribute__((__malloc__))
-# define RPMALLOC_CALL
+# define RPMALLOC_RESTRICT
+# define RPMALLOC_CDECL
 #elif defined(_MSC_VER)
 # define RPMALLOC_ATTRIBUTE
-# define RPMALLOC_CALL __declspec(restrict)
+# define RPMALLOC_RESTRICT __declspec(restrict)
+# define RPMALLOC_CDECL __cdecl
 #else
 # define RPMALLOC_ATTRIBUTE
-# define RPMALLOC_CALL
+# define RPMALLOC_RESTRICT
+# define RPMALLOC_CDECL
 #endif
 
 //! Flag to rpaligned_realloc to not preserve content in reallocation
@@ -35,8 +38,6 @@ typedef struct rpmalloc_global_statistics_t {
 	size_t mapped;
 	//! Current amount of memory in global caches for small and medium sizes (<64KiB)
 	size_t cached;
-	//! Curren amount of memory in global caches for large sizes (>=64KiB)
-	size_t cached_large;
 	//! Total amount of memory mapped (only if ENABLE_STATISTICS=1)
 	size_t mapped_total;
 	//! Total amount of memory unmapped (only if ENABLE_STATISTICS=1)
@@ -44,10 +45,6 @@
 } rpmalloc_global_statistics_t;
 
 typedef struct rpmalloc_thread_statistics_t {
-	//! Amount of memory currently requested in allocations (only if ENABLE_STATISTICS=1)
-	size_t requested;
-	//! Amount of memory actually allocated in memory blocks (only if ENABLE_STATISTICS=1)
-	size_t allocated;
 	//! Current number of bytes available for allocation from active spans
 	size_t active;
 	//! Current number of bytes available in thread size class caches
@@ -62,9 +59,47 @@ typedef struct rpmalloc_thread_statistics_t {
 	size_t global_to_thread;
 } rpmalloc_thread_statistics_t;
 
+typedef struct rpmalloc_config_t {
+	//! Map memory pages for the given number of bytes. The returned address MUST be
+	//  aligned to the rpmalloc span size, which will always be a power of two.
+	//  Optionally the function can store an alignment offset in the offset variable
+	//  in case it performs alignment and the returned pointer is offset from the
+	//  actual start of the memory region due to this alignment. The alignment offset
+	//  will be passed to the memory unmap function. The alignment offset MUST NOT be
+	//  larger than 65535 (storable in a uint16_t); if it is, you must use natural
+	//  alignment to shift it into 16 bits.
+	void* (*memory_map)(size_t size, size_t* offset);
+	//! Unmap the memory pages starting at address and spanning the given number of bytes.
+	//  If release is set to 1, the unmap is for an entire span range as returned by
+	//  a previous call to memory_map, and the entire range should be released. If
+	//  release is set to 0, the unmap is a partial decommit of a subset of the mapped
+	//  memory range.
+	void (*memory_unmap)(void* address, size_t size, size_t offset, int release);
+	//! Size of memory pages. The page size MUST be a power of two in [512,16384] range
+	//  (2^9 to 2^14) unless 0 - set to 0 to use system page size. All memory mapping
+	//  requests to memory_map will be made with size set to a multiple of the page size.
+	size_t page_size;
+	//! Size of a span of memory pages. MUST be a multiple of page size, and in [4096,262144]
+	//  range (unless 0 - set to 0 to use the default span size).
+	size_t span_size;
+	//! Number of spans to map at each request to map new virtual memory blocks. This can
+	//  be used to minimize the system call overhead at the cost of virtual memory address
+	//  space. The extra mapped pages will not be written until actually used, so physical
+	//  committed memory should not be affected in the default implementation.
+	size_t span_map_count;
+	//! Debug callback if memory guards are enabled.
Called if a memory overwrite is detected + void (*memory_overwrite)(void* address); +} rpmalloc_config_t; + extern int rpmalloc_initialize(void); +extern int +rpmalloc_initialize_config(const rpmalloc_config_t* config); + +extern const rpmalloc_config_t* +rpmalloc_config(void); + extern void rpmalloc_finalize(void); @@ -86,13 +121,13 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats); extern void rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats); -extern RPMALLOC_CALL void* +extern RPMALLOC_RESTRICT void* rpmalloc(size_t size) RPMALLOC_ATTRIBUTE; extern void rpfree(void* ptr); -extern RPMALLOC_CALL void* +extern RPMALLOC_RESTRICT void* rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIBUTE; extern void* @@ -101,10 +136,10 @@ rprealloc(void* ptr, size_t size); extern void* rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags); -extern RPMALLOC_CALL void* +extern RPMALLOC_RESTRICT void* rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIBUTE; -extern RPMALLOC_CALL void* +extern RPMALLOC_RESTRICT void* rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIBUTE; extern int
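The new configuration interface lets an embedder swap in its own pager, provided it honors the span-alignment and offset contract documented above. A hypothetical POSIX-flavored pair written against those rules; the over-map-and-trim strategy and all `ex_` names are assumptions for illustration, not the allocator's default implementation:

```c
#include <assert.h>
#include <stdint.h>
#include <stddef.h>
#include <sys/mman.h>

#define EX_SPAN_SIZE ((size_t)64 * 1024) /* must match the configured span_size */

/* Candidate memory_map: over-map by one span, advance to span alignment,
   and report the trimmed padding divided by 8, per the documented contract. */
static void* ex_map(size_t size, size_t* offset) {
    char* ptr = (char*)mmap(0, size + EX_SPAN_SIZE, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (ptr == MAP_FAILED)
        return NULL;
    size_t padding = (EX_SPAN_SIZE - ((uintptr_t)ptr & (EX_SPAN_SIZE - 1))) & (EX_SPAN_SIZE - 1);
    *offset = padding >> 3; /* always < 65536, since padding < span size */
    return ptr + padding;
}

/* Candidate memory_unmap: undo the alignment shift on full release,
   decommit in place for partial unmaps. */
static void ex_unmap(void* address, size_t size, size_t offset, int release) {
    if (release)
        munmap((char*)address - (offset << 3), size + EX_SPAN_SIZE);
    else
        madvise(address, size, MADV_DONTNEED);
}

int main(void) {
    size_t offset = 0;
    void* span = ex_map(4 * EX_SPAN_SIZE, &offset);
    assert(span && !((uintptr_t)span & (EX_SPAN_SIZE - 1))); /* span aligned */
    ex_unmap(span, 4 * EX_SPAN_SIZE, offset, 1);
    return 0;
}
```

Such a pair would be installed with `config.memory_map = ex_map; config.memory_unmap = ex_unmap; config.span_size = EX_SPAN_SIZE;` before calling `rpmalloc_initialize_config(&config)`; in tracy the entry points live in the `tracy` namespace.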