diff --git a/TracyC.h b/TracyC.h index 3e0c3f69..ea247399 100644 --- a/TracyC.h +++ b/TracyC.h @@ -101,7 +101,6 @@ struct ___tracy_c_zone_context // This struct, as visible to user, is immutable, so treat it as if const was declared here. typedef /*const*/ struct ___tracy_c_zone_context TracyCZoneCtx; -TRACY_API void ___tracy_init_thread(void); TRACY_API uint64_t ___tracy_alloc_srcloc( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz ); TRACY_API uint64_t ___tracy_alloc_srcloc_name( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz ); diff --git a/TracyD3D11.hpp b/TracyD3D11.hpp index da2c5004..8552f36f 100644 --- a/TracyD3D11.hpp +++ b/TracyD3D11.hpp @@ -389,7 +389,6 @@ private: static inline D3D11Ctx* CreateD3D11Context( ID3D11Device* device, ID3D11DeviceContext* devicectx ) { - InitRPMallocThread(); auto ctx = (D3D11Ctx*)tracy_malloc( sizeof( D3D11Ctx ) ); new(ctx) D3D11Ctx( device, devicectx ); return ctx; diff --git a/TracyD3D12.hpp b/TracyD3D12.hpp index 2de349f3..1c1d97dc 100644 --- a/TracyD3D12.hpp +++ b/TracyD3D12.hpp @@ -451,8 +451,6 @@ namespace tracy static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue) { - InitRPMallocThread(); - auto* ctx = static_cast(tracy_malloc(sizeof(D3D12QueueCtx))); new (ctx) D3D12QueueCtx{ device, queue }; diff --git a/TracyOpenCL.hpp b/TracyOpenCL.hpp index 393ada9a..1fd3e741 100644 --- a/TracyOpenCL.hpp +++ b/TracyOpenCL.hpp @@ -286,7 +286,6 @@ namespace tracy { static inline OpenCLCtx* CreateCLContext(cl_context context, cl_device_id device) { - InitRPMallocThread(); auto ctx = (OpenCLCtx*)tracy_malloc(sizeof(OpenCLCtx)); new (ctx) OpenCLCtx(context, device); return ctx; diff --git a/TracyOpenGL.hpp b/TracyOpenGL.hpp index 8a487a33..2dba3aa8 100644 --- a/TracyOpenGL.hpp +++ b/TracyOpenGL.hpp @@ -53,7 +53,7 @@ public: # define glQueryCounter glQueryCounterEXT #endif -#define TracyGpuContext tracy::InitRPMallocThread(); tracy::GetGpuCtx().ptr = (tracy::GpuCtx*)tracy::tracy_malloc( sizeof( tracy::GpuCtx ) ); new(tracy::GetGpuCtx().ptr) tracy::GpuCtx; +#define TracyGpuContext tracy::GetGpuCtx().ptr = (tracy::GpuCtx*)tracy::tracy_malloc( sizeof( tracy::GpuCtx ) ); new(tracy::GetGpuCtx().ptr) tracy::GpuCtx; #define TracyGpuContextName( name, size ) tracy::GetGpuCtx().ptr->Name( name, size ); #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK # define TracyGpuNamedZone( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active ); diff --git a/TracyVulkan.hpp b/TracyVulkan.hpp index 6885fdba..7c1dabb3 100644 --- a/TracyVulkan.hpp +++ b/TracyVulkan.hpp @@ -456,7 +456,6 @@ private: static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct ) { - InitRPMallocThread(); auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) ); new(ctx) VkCtx( physdev, device, queue, cmdbuf, gpdctd, gct ); return ctx; diff --git a/client/TracyProfiler.cpp b/client/TracyProfiler.cpp index 3de17c07..128d42d6 100644 --- a/client/TracyProfiler.cpp +++ b/client/TracyProfiler.cpp @@ -119,45 +119,6 @@ extern "C" typedef BOOL (WINAPI *t_GetLogicalProcessorInformationEx)( LOGICAL_PR namespace tracy { -namespace -{ -# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA - BOOL CALLBACK InitOnceCallback( PINIT_ONCE /*initOnce*/, PVOID /*Parameter*/, PVOID* /*Context*/) - { - rpmalloc_initialize(); - return TRUE; - } - INIT_ONCE InitOnce = INIT_ONCE_STATIC_INIT; -# elif defined __linux__ - void InitOnceCallback() - { - rpmalloc_initialize(); - } - pthread_once_t once_control = PTHREAD_ONCE_INIT; -# else - void InitOnceCallback() - { - rpmalloc_initialize(); - } - std::once_flag once_flag; -# endif -} - -struct RPMallocInit -{ - RPMallocInit() - { -# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA - InitOnceExecuteOnce( &InitOnce, InitOnceCallback, nullptr, nullptr ); -# elif defined __linux__ - pthread_once( &once_control, InitOnceCallback ); -# else - std::call_once( once_flag, InitOnceCallback ); -# endif - rpmalloc_thread_initialize(); - } -}; - #ifndef TRACY_DELAYED_INIT struct InitTimeWrapper @@ -965,12 +926,6 @@ TRACY_API int64_t GetFrequencyQpc() #ifdef TRACY_DELAYED_INIT struct ThreadNameData; TRACY_API moodycamel::ConcurrentQueue& GetQueue(); -TRACY_API void InitRPMallocThread(); - -void InitRPMallocThread() -{ - RPMallocInit rpinit; -} struct ProfilerData { @@ -992,7 +947,6 @@ struct ProducerWrapper struct ProfilerThreadData { ProfilerThreadData( ProfilerData& data ) : token( data ), gpuCtx( { nullptr } ) {} - RPMallocInit rpmalloc_init; ProducerWrapper token; GpuCtxWrapper gpuCtx; # ifdef TRACY_ON_DEMAND @@ -1004,7 +958,6 @@ struct ProfilerThreadData ProfilerData* s_profilerData = nullptr; TRACY_API void StartupProfiler() { - RPMallocInit init; s_profilerData = (ProfilerData*)tracy_malloc( sizeof( ProfilerData ) ); new (s_profilerData) ProfilerData(); s_profilerData->profiler.SpawnWorkerThreads(); @@ -1022,6 +975,8 @@ TRACY_API void ShutdownProfiler() rpmalloc_finalize(); } # else +std::atomic RpInitDone { 0 }; +std::atomic RpInitLock { 0 }; static std::atomic profilerDataLock { 0 }; static std::atomic profilerData { nullptr }; @@ -1035,7 +990,6 @@ static ProfilerData& GetProfilerData() ptr = profilerData.load( std::memory_order_acquire ); if( !ptr ) { - RPMallocInit init; ptr = (ProfilerData*)tracy_malloc( sizeof( ProfilerData ) ); new (ptr) ProfilerData(); profilerData.store( ptr, std::memory_order_release ); @@ -1072,7 +1026,6 @@ public: void* p = pthread_getspecific(m_key); if (!p) { - RPMallocInit init; p = (ProfilerThreadData*)tracy_malloc( sizeof( ProfilerThreadData ) ); new (p) ProfilerThreadData(GetProfilerData()); pthread_setspecific(m_key, p); @@ -1124,18 +1077,12 @@ namespace # endif #else -TRACY_API void InitRPMallocThread() -{ - rpmalloc_thread_initialize(); -} // MSVC static initialization order solution. gcc/clang uses init_order() to avoid all this. // 1a. But s_queue is needed for initialization of variables in point 2. extern moodycamel::ConcurrentQueue s_queue; -thread_local RPMallocInit init_order(106) s_rpmalloc_thread_init; - // 2. If these variables would be in the .CRT$XCB section, they would be initialized only in main thread. thread_local moodycamel::ProducerToken init_order(107) s_token_detail( s_queue ); thread_local ProducerWrapper init_order(108) s_token { s_queue.get_explicit_producer( s_token_detail ) }; @@ -1148,7 +1095,8 @@ thread_local ThreadHandleWrapper init_order(104) s_threadHandle { detail::GetThr # endif static InitTimeWrapper init_order(101) s_initTime { SetupHwTimer() }; -static RPMallocInit init_order(102) s_rpmalloc_init; +std::atomic init_order(102) RpInitDone( 0 ); +std::atomic init_order(102) RpInitLock( 0 ); moodycamel::ConcurrentQueue init_order(103) s_queue( QueuePrealloc ); std::atomic init_order(104) s_lockCounter( 0 ); std::atomic init_order(104) s_gpuCtxCounter( 0 ); @@ -3614,19 +3562,6 @@ TRACY_API uint64_t ___tracy_alloc_srcloc_name( uint32_t line, const char* source return tracy::Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); } -// thread_locals are not initialized on thread creation. At least on GNU/Linux. Instead they are -// initialized on their first ODR-use. This means that the allocator is not automagically -// initialized every time a thread is created. As thus, expose to the C API users a simple API to -// call every time they create a thread. Here we can then put all sorts of per-thread -// initialization. -TRACY_API void ___tracy_init_thread(void) { -#ifdef TRACY_DELAYED_INIT - (void)tracy::GetProfilerThreadData(); -#else - (void)tracy::s_rpmalloc_thread_init; -#endif -} - #ifdef __cplusplus } #endif diff --git a/client/TracyProfiler.hpp b/client/TracyProfiler.hpp index 230835ca..99c62fd7 100644 --- a/client/TracyProfiler.hpp +++ b/client/TracyProfiler.hpp @@ -63,7 +63,6 @@ TRACY_API std::atomic& GetLockCounter(); TRACY_API std::atomic& GetGpuCtxCounter(); TRACY_API GpuCtxWrapper& GetGpuCtx(); TRACY_API uint64_t GetThreadHandle(); -TRACY_API void InitRPMallocThread(); TRACY_API bool ProfilerAvailable(); TRACY_API int64_t GetFrequencyQpc(); @@ -295,7 +294,6 @@ public: #endif if( callstack != 0 ) { - InitRPMallocThread(); tracy::GetProfiler().SendCallstack( callstack ); } @@ -315,7 +313,6 @@ public: #endif if( callstack != 0 ) { - InitRPMallocThread(); tracy::GetProfiler().SendCallstack( callstack ); } @@ -333,7 +330,6 @@ public: #endif if( callstack != 0 ) { - InitRPMallocThread(); tracy::GetProfiler().SendCallstack( callstack ); } @@ -356,7 +352,6 @@ public: #endif if( callstack != 0 ) { - InitRPMallocThread(); tracy::GetProfiler().SendCallstack( callstack ); } @@ -372,7 +367,6 @@ public: static tracy_force_inline void MessageAppInfo( const char* txt, size_t size ) { assert( size < std::numeric_limits::max() ); - InitRPMallocThread(); auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); TracyLfqPrepare( QueueType::MessageAppInfo ); @@ -423,7 +417,6 @@ public: # endif const auto thread = GetThreadHandle(); - InitRPMallocThread(); auto callstack = Callstack( depth ); profiler.m_serialLock.lock(); @@ -445,7 +438,6 @@ public: # endif const auto thread = GetThreadHandle(); - InitRPMallocThread(); auto callstack = Callstack( depth ); profiler.m_serialLock.lock(); @@ -495,7 +487,6 @@ public: # endif const auto thread = GetThreadHandle(); - InitRPMallocThread(); auto callstack = Callstack( depth ); profiler.m_serialLock.lock(); @@ -518,7 +509,6 @@ public: # endif const auto thread = GetThreadHandle(); - InitRPMallocThread(); auto callstack = Callstack( depth ); profiler.m_serialLock.lock(); diff --git a/common/TracyAlloc.hpp b/common/TracyAlloc.hpp index a3cbec05..1981c09a 100644 --- a/common/TracyAlloc.hpp +++ b/common/TracyAlloc.hpp @@ -4,15 +4,47 @@ #include #ifdef TRACY_ENABLE +# include +# include "TracyForceInline.hpp" +# include "TracyYield.hpp" # include "../client/tracy_rpmalloc.hpp" #endif namespace tracy { +#ifdef TRACY_ENABLE +extern std::atomic RpInitDone; +extern std::atomic RpInitLock; + +namespace +{ +static inline void InitRpmallocPlumbing() +{ + int expected = 0; + while( !RpInitLock.compare_exchange_weak( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; YieldThread(); } + const auto done = RpInitDone.load( std::memory_order_acquire ); + if( !done ) + { + rpmalloc_initialize(); + RpInitDone.store( 1, std::memory_order_release ); + } + RpInitLock.store( 0, std::memory_order_release ); +} + +static tracy_force_inline void InitRpmalloc() +{ + const auto done = RpInitDone.load( std::memory_order_acquire ); + if( !done ) InitRpmallocPlumbing(); + rpmalloc_thread_initialize(); +} +} +#endif + static inline void* tracy_malloc( size_t size ) { #ifdef TRACY_ENABLE + InitRpmalloc(); return rpmalloc( size ); #else return malloc( size ); @@ -22,6 +54,7 @@ static inline void* tracy_malloc( size_t size ) static inline void tracy_free( void* ptr ) { #ifdef TRACY_ENABLE + InitRpmalloc(); rpfree( ptr ); #else free( ptr ); @@ -31,6 +64,7 @@ static inline void tracy_free( void* ptr ) static inline void* tracy_realloc( void* ptr, size_t size ) { #ifdef TRACY_ENABLE + InitRpmalloc(); return rprealloc( ptr, size ); #else return realloc( ptr, size ); diff --git a/common/TracySystem.cpp b/common/TracySystem.cpp index 3cda186c..03078fca 100644 --- a/common/TracySystem.cpp +++ b/common/TracySystem.cpp @@ -96,7 +96,6 @@ struct ThreadNameData ThreadNameData* next; }; std::atomic& GetThreadNameData(); -TRACY_API void InitRPMallocThread(); #endif #ifdef _MSC_VER @@ -161,7 +160,6 @@ TRACY_API void SetThreadName( const char* name ) #endif #ifdef TRACY_ENABLE { - InitRPMallocThread(); const auto sz = strlen( name ); char* buf = (char*)tracy_malloc( sz+1 ); memcpy( buf, name, sz );