From 4c94b3eff7c212e431a117c6c10ee60bb2a8e875 Mon Sep 17 00:00:00 2001 From: Tiago Rodrigues Date: Fri, 10 Nov 2023 17:00:39 -0500 Subject: [PATCH 1/7] Add support to use libunwind for backtrace capturing on linux platforms (which is ~ 4x faster than execinfo) --- public/client/TracyCallstack.hpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/public/client/TracyCallstack.hpp b/public/client/TracyCallstack.hpp index 0b522b73..36e295a1 100644 --- a/public/client/TracyCallstack.hpp +++ b/public/client/TracyCallstack.hpp @@ -8,9 +8,15 @@ #if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 5 # include #elif TRACY_HAS_CALLSTACK >= 3 -# include -#endif + #ifdef USE_LIB_UNWIND_BACKTRACE + // libunwind is in general significantly faster than execinfo based backtraces + #define UNW_LOCAL_ONLY + # include + #else + # include + #endif +#endif #ifndef TRACY_HAS_CALLSTACK @@ -127,7 +133,13 @@ static tracy_force_inline void* Callstack( int depth ) assert( depth >= 1 ); auto trace = (uintptr_t*)tracy_malloc( ( 1 + (size_t)depth ) * sizeof( uintptr_t ) ); + +#ifdef USE_LIB_UNWIND_BACKTRACE + size_t num = unw_backtrace( (void**)(trace+1), depth ); +#else const auto num = (size_t)backtrace( (void**)(trace+1), depth ); +#endif + *trace = num; return trace; From 790d28911d872c2fd5583e0c29ae355d4a6cc052 Mon Sep 17 00:00:00 2001 From: Tiago Rodrigues Date: Fri, 10 Nov 2023 17:02:03 -0500 Subject: [PATCH 2/7] Add env var "TRACY_NO_DBHELP_INIT_LOAD" to allow disabling dbghelp loading of DeviceDriver and ProcessModules at startup --- public/client/TracyCallstack.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/public/client/TracyCallstack.cpp b/public/client/TracyCallstack.cpp index 0de7c9d2..2202724b 100644 --- a/public/client/TracyCallstack.cpp +++ b/public/client/TracyCallstack.cpp @@ -157,9 +157,20 @@ void InitCallstack() SymInitialize( GetCurrentProcess(), nullptr, true ); SymSetOptions( SYMOPT_LOAD_LINES ); 
+ // use TRACY_NO_DBHELP_INIT_LOAD=1 to disable preloading of driver + // and process module symbol loading at startup time - they will be loaded on demand later + // Sometimes this process can take a very long time and prevent resolving callstack frames + // symbols during that time. + const char* noInitLoadEnv = GetEnvVar("TRACY_NO_DBHELP_INIT_LOAD"); + const bool initTimeLoadModules = !(noInitLoadEnv && noInitLoadEnv[0] == '1'); + if (!initTimeLoadModules) + { + printf("TRACY: skipping init dbhelper module load\n"); + } + DWORD needed; LPVOID dev[4096]; - if( EnumDeviceDrivers( dev, sizeof(dev), &needed ) != 0 ) + if( initTimeLoadModules && (EnumDeviceDrivers( dev, sizeof(dev), &needed ) != 0) ) { char windir[MAX_PATH]; if( !GetWindowsDirectoryA( windir, sizeof( windir ) ) ) memcpy( windir, "c:\\windows", 11 ); @@ -214,7 +225,7 @@ void InitCallstack() HANDLE proc = GetCurrentProcess(); HMODULE mod[1024]; - if( EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0 ) + if( initTimeLoadModules && (EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0) ) { const auto sz = needed / sizeof( HMODULE ); for( size_t i=0; i Date: Fri, 10 Nov 2023 17:02:47 -0500 Subject: [PATCH 3/7] Add "TRACY_NO_SYS_TRACE" env var to allow force disabling system trace even if the underlying system supports it --- public/client/TracyProfiler.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/public/client/TracyProfiler.cpp b/public/client/TracyProfiler.cpp index a13e575a..37d0d500 100644 --- a/public/client/TracyProfiler.cpp +++ b/public/client/TracyProfiler.cpp @@ -1439,7 +1439,16 @@ Profiler::Profiler() void Profiler::SpawnWorkerThreads() { #ifdef TRACY_HAS_SYSTEM_TRACING - if( SysTraceStart( m_samplingPeriod ) ) + // use TRACY_NO_SYS_TRACE=1 to force disabling sys tracing + // (even if available in the underlying system) + // as it can have significant impact on the size of the traces + const char* noSysTrace = GetEnvVar( "TRACY_NO_SYS_TRACE" 
); + const bool disableSystrace = (noSysTrace && noSysTrace[0] == '1'); + if(disableSystrace) + { + printf("TRACY: systrace was disabled by 'TRACY_NO_SYS_TRACE=1'\n"); + } + else if( SysTraceStart( m_samplingPeriod ) ) { s_sysTraceThread = (Thread*)tracy_malloc( sizeof( Thread ) ); new(s_sysTraceThread) Thread( SysTraceWorker, nullptr ); From 2988d0a136f3e051dcd5381e5c2b8a1f6908db3a Mon Sep 17 00:00:00 2001 From: trodrigues Date: Fri, 10 Nov 2023 16:17:39 -0600 Subject: [PATCH 4/7] rename libunwind option and add it to cmake --- CMakeLists.txt | 1 + public/client/TracyCallstack.hpp | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4cb71bdd..7bf17c31 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,6 +84,7 @@ set_option(TRACY_MANUAL_LIFETIME "Enable the manual lifetime management of the p set_option(TRACY_FIBERS "Enable fibers support" OFF) set_option(TRACY_NO_CRASH_HANDLER "Disable crash handling" OFF) set_option(TRACY_TIMER_FALLBACK "Use lower resolution timers" OFF) +set_option(TRACE_CLIENT_LIBUNWIND_BACKTRACE "Use libunwind backtracing where supported" OFF) if(NOT TRACY_STATIC) target_compile_definitions(TracyClient PRIVATE TRACY_EXPORTS) diff --git a/public/client/TracyCallstack.hpp b/public/client/TracyCallstack.hpp index 36e295a1..b24d7adf 100644 --- a/public/client/TracyCallstack.hpp +++ b/public/client/TracyCallstack.hpp @@ -8,7 +8,7 @@ #if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 5 # include #elif TRACY_HAS_CALLSTACK >= 3 - #ifdef USE_LIB_UNWIND_BACKTRACE + #ifdef TRACE_CLIENT_LIBUNWIND_BACKTRACE // libunwind is in general significantly faster than execinfo based backtraces #define UNW_LOCAL_ONLY # include @@ -134,7 +134,7 @@ static tracy_force_inline void* Callstack( int depth ) auto trace = (uintptr_t*)tracy_malloc( ( 1 + (size_t)depth ) * sizeof( uintptr_t ) ); -#ifdef USE_LIB_UNWIND_BACKTRACE +#ifdef TRACE_CLIENT_LIBUNWIND_BACKTRACE size_t num = unw_backtrace( 
(void**)(trace+1), depth ); #else const auto num = (size_t)backtrace( (void**)(trace+1), depth ); From e4b5395ae8862739e56b166b002e15558ec94c6d Mon Sep 17 00:00:00 2001 From: Tiago Rodrigues Date: Mon, 13 Nov 2023 12:57:34 -0500 Subject: [PATCH 5/7] Update documentation with new compile time and env variables added. --- manual/techdoc.tex | 4 ++++ manual/tracy.tex | 10 +++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/manual/techdoc.tex b/manual/techdoc.tex index d3da411e..02189d4d 100644 --- a/manual/techdoc.tex +++ b/manual/techdoc.tex @@ -258,12 +258,16 @@ This is a very OS-specific task. It is split into two parts: getting the call st On some platforms a bit of setup work is required. This is done in the \texttt{InitCallstack()} function. +On windows, tracy will attempt to preload symbols at \texttt{InitCallstack()} time. It does this for device drivers and process modules. As this process can be slow when a lot of pdbs are involved, you can set the \texttt{TRACY\_NO\_DBHELP\_INIT\_LOAD} environment variable to "1" to disable this behavior and rely on-demand symbol loading. + \subsubsection{Getting the frames} Call stack collection is initiated by calling the \texttt{Callstack()} procedure, with maximum stack depth to be collected passed as a parameter. Stack unwinding must be performed in the place in which call stack was queried, as further execution of the application will change the stack contents. The unfortunate part is that the stack unwinding on platforms other than x86 is not a fast operation. To perform unwinding various OS functions are used: \texttt{RtlWalkFrameChain()}, \texttt{\_Unwind\_Backtrace()}, \texttt{backtrace()}. A list of returned frame pointers is saved in a buffer, which will be later sent to the server. The maximum unwinding depth limit (63 entries) is due to the specifics of the underlying OS functionality. 
+On some platforms you can define \texttt{TRACE\_CLIENT\_LIBUNWIND\_BACKTRACE} to use libunwind to perform callstack captures, as it might be a faster alternative to the default implementation. If you do, you must compile/link your client against libunwind. See \url{https://github.com/libunwind/libunwind} for more details. + \subsubsection{Decoding stack frames} Unlike the always changing call stack, stack frames themselves are immutable pointers to a specific place in the executable code. As such, the decoding process can be performed at any time (even outside of the program execution, as exemplified by debuggers). Frame decoding is only performed when the server asks for the details of a frame (section~\ref{communicationsprotocol}). diff --git a/manual/tracy.tex b/manual/tracy.tex index 553ad6d4..2563ec38 100644 --- a/manual/tracy.tex +++ b/manual/tracy.tex @@ -1698,6 +1698,14 @@ logo=\bclampe Tracy will prepare for call stack collection regardless of whether you use the functionality or not. In some cases, this may be unwanted or otherwise troublesome for the user. To disable support for collecting call stacks, define the \texttt{TRACY\_NO\_CALLSTACK} macro. \end{bclogo} +\begin{bclogo}[ +noborder=true, +couleur=black!5, +logo=\bclampe +]{libunwind} +On some platforms you can define \texttt{TRACE\_CLIENT\_LIBUNWIND\_BACKTRACE} to use libunwind to perform callstack captures as it might be a faster alternative to the default implementation. If you do, you must compile/link your client against libunwind. See \url{https://github.com/libunwind/libunwind} for more details. +\end{bclogo} + \subsubsection{Debugging symbols} You must compile the profiled application with debugging symbols enabled to have correct call stack information. 
 You can achieve that in the following way: @@ -2049,7 +2057,7 @@ Tracy will perform an automatic collection of system data without user intervent Some profiling data can only be retrieved using the kernel facilities, which are not available to users with normal privilege level. To collect such data, you will need to elevate your rights to the administrator level. You can do so either by running the profiled program from the \texttt{root} account on Unix or through the \emph{Run as administrator} option on Windows\footnote{To make this easier, you can run MSVC with admin privileges, which will be inherited by your program when you start it from within the IDE.}. On Android, you will need to have a rooted device (see section~\ref{androidlunacy} for additional information). -As this system-level tracing functionality is part of the automated collection process, no user intervention is necessary to enable it (assuming that the program was granted the rights needed). However, if, for some reason, you would want to prevent your application from trying to access kernel data, you may recompile your program with the \texttt{TRACY\_NO\_SYSTEM\_TRACING} define. +As this system-level tracing functionality is part of the automated collection process, no user intervention is necessary to enable it (assuming that the program was granted the rights needed). However, if, for some reason, you would want to prevent your application from trying to access kernel data, you may recompile your program with the \texttt{TRACY\_NO\_SYSTEM\_TRACING} define. If you want to disable this functionality dynamically at runtime instead, you can set the \texttt{TRACY\_NO\_SYS\_TRACE} environment variable to "1". 
 \begin{bclogo}[ noborder=true, From 5f60ac7ad204943dc2a527b9243a6618f51734e5 Mon Sep 17 00:00:00 2001 From: Tiago Rodrigues Date: Mon, 13 Nov 2023 13:42:30 -0500 Subject: [PATCH 6/7] update docs --- manual/techdoc.tex | 2 +- manual/tracy.tex | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/manual/techdoc.tex b/manual/techdoc.tex index 02189d4d..6492e81a 100644 --- a/manual/techdoc.tex +++ b/manual/techdoc.tex @@ -258,7 +258,7 @@ This is a very OS-specific task. It is split into two parts: getting the call st On some platforms a bit of setup work is required. This is done in the \texttt{InitCallstack()} function. -On windows, tracy will attempt to preload symbols at \texttt{InitCallstack()} time. It does this for device drivers and process modules. As this process can be slow when a lot of pdbs are involved, you can set the \texttt{TRACY\_NO\_DBHELP\_INIT\_LOAD} environment variable to "1" to disable this behavior and rely on-demand symbol loading. +On Windows, Tracy will attempt to preload symbols at \texttt{InitCallstack()} time. It does this for device drivers and process modules. As this process can be slow when a lot of pdbs are involved, you can set the \texttt{TRACY\_NO\_DBHELP\_INIT\_LOAD} environment variable to "1" to disable this behavior and rely on on-demand symbol loading. \subsubsection{Getting the frames} diff --git a/manual/tracy.tex b/manual/tracy.tex index 2563ec38..742a8368 100644 --- a/manual/tracy.tex +++ b/manual/tracy.tex @@ -1776,6 +1776,8 @@ void DbgHelpUnlock() { ReleaseMutex(dbgHelpLock); } } \end{lstlisting} +At initialization time, Tracy will attempt to preload symbols for device drivers and process modules. As this process can be slow when a lot of pdbs are involved, you can set the \texttt{TRACY\_NO\_DBHELP\_INIT\_LOAD} environment variable to "1" to disable this behavior and rely on on-demand symbol loading. 
+ \paragraph{Disabling resolution of inline frames} Inline frames retrieval on Windows can be multiple orders of magnitude slower than just performing essential symbol resolution. This manifests as profiler seemingly being stuck for a long time, having hundreds of thousands of query backlog entries queued, which are slowly trickling down. If your use case requires speed of operation rather than having call stacks with inline frames included, you may define the \texttt{TRACY\_NO\_CALLSTACK\_INLINES} macro, which will make the profiler stick to the basic but fast frame resolution mode. From c373647dae5c967813e8c58722db47d834498fdf Mon Sep 17 00:00:00 2001 From: Tiago Rodrigues Date: Mon, 13 Nov 2023 13:43:03 -0500 Subject: [PATCH 7/7] fix coding style --- public/client/TracyCallstack.cpp | 12 ++++++------ public/client/TracyCallstack.hpp | 15 +++++++-------- public/client/TracyProfiler.cpp | 7 +++---- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/public/client/TracyCallstack.cpp b/public/client/TracyCallstack.cpp index 2202724b..bef85431 100644 --- a/public/client/TracyCallstack.cpp +++ b/public/client/TracyCallstack.cpp @@ -161,16 +161,16 @@ void InitCallstack() // and process module symbol loading at startup time - they will be loaded on demand later // Sometimes this process can take a very long time and prevent resolving callstack frames // symbols during that time. 
- const char* noInitLoadEnv = GetEnvVar("TRACY_NO_DBHELP_INIT_LOAD"); - const bool initTimeLoadModules = !(noInitLoadEnv && noInitLoadEnv[0] == '1'); - if (!initTimeLoadModules) + const char* noInitLoadEnv = GetEnvVar( "TRACY_NO_DBHELP_INIT_LOAD" ); + const bool initTimeModuleLoad = !( noInitLoadEnv && noInitLoadEnv[0] == '1' ); + if ( !initTimeModuleLoad ) { - printf("TRACY: skipping init dbhelper module load\n"); + TracyDebug("TRACY: skipping init time dbghelper module load\n"); } DWORD needed; LPVOID dev[4096]; - if( initTimeLoadModules && (EnumDeviceDrivers( dev, sizeof(dev), &needed ) != 0) ) + if( initTimeModuleLoad && EnumDeviceDrivers( dev, sizeof(dev), &needed ) != 0 ) { char windir[MAX_PATH]; if( !GetWindowsDirectoryA( windir, sizeof( windir ) ) ) memcpy( windir, "c:\\windows", 11 ); @@ -225,7 +225,7 @@ void InitCallstack() HANDLE proc = GetCurrentProcess(); HMODULE mod[1024]; - if( initTimeLoadModules && (EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0) ) + if( initTimeModuleLoad && EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0 ) { const auto sz = needed / sizeof( HMODULE ); for( size_t i=0; i #elif TRACY_HAS_CALLSTACK >= 3 - #ifdef TRACE_CLIENT_LIBUNWIND_BACKTRACE - // libunwind is in general significantly faster than execinfo based backtraces - #define UNW_LOCAL_ONLY - # include - #else - # include - #endif - +# ifdef TRACE_CLIENT_LIBUNWIND_BACKTRACE + // libunwind is, in general, significantly faster than execinfo based backtraces +# define UNW_LOCAL_ONLY +# include +# else +# include +# endif #endif #ifndef TRACY_HAS_CALLSTACK diff --git a/public/client/TracyProfiler.cpp b/public/client/TracyProfiler.cpp index 37d0d500..d5ad959f 100644 --- a/public/client/TracyProfiler.cpp +++ b/public/client/TracyProfiler.cpp @@ -1439,14 +1439,13 @@ Profiler::Profiler() void Profiler::SpawnWorkerThreads() { #ifdef TRACY_HAS_SYSTEM_TRACING - // use TRACY_NO_SYS_TRACE=1 to force disabling sys tracing - // (even if available in the 
underlying system) + // use TRACY_NO_SYS_TRACE=1 to force disabling sys tracing (even if available in the underlying system) // as it can have significant impact on the size of the traces const char* noSysTrace = GetEnvVar( "TRACY_NO_SYS_TRACE" ); const bool disableSystrace = (noSysTrace && noSysTrace[0] == '1'); - if(disableSystrace) + if( disableSystrace ) { - printf("TRACY: systrace was disabled by 'TRACY_NO_SYS_TRACE=1'\n"); + TracyDebug("TRACY: Sys Trace was disabled by 'TRACY_NO_SYS_TRACE=1'\n"); } else if( SysTraceStart( m_samplingPeriod ) ) {