diff --git a/client/TracySysTrace.cpp b/client/TracySysTrace.cpp index 2f1612c7..25d56e42 100644 --- a/client/TracySysTrace.cpp +++ b/client/TracySysTrace.cpp @@ -678,7 +678,9 @@ static int perf_event_open( struct perf_event_attr* hw_event, pid_t pid, int cpu enum TraceEventId { - EventCallstack + EventCallstack, + EventCpuCycles, + EventInstructionsRetired }; static void SetupSampling( int64_t& samplingPeriod ) @@ -690,22 +692,20 @@ static void SetupSampling( int64_t& samplingPeriod ) samplingPeriod = GetSamplingPeriod(); s_numCpus = (int)std::thread::hardware_concurrency(); - s_ring = (RingBuffer*)tracy_malloc( sizeof( RingBuffer ) * s_numCpus ); + s_ring = (RingBuffer*)tracy_malloc( sizeof( RingBuffer ) * s_numCpus * 3 ); s_numBuffers = 0; + // Stack traces perf_event_attr pe = {}; - pe.type = PERF_TYPE_SOFTWARE; pe.size = sizeof( perf_event_attr ); pe.config = PERF_COUNT_SW_CPU_CLOCK; - pe.sample_freq = GetSamplingFrequency(); pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN; #if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 ) pe.sample_max_stack = 127; #endif pe.exclude_callchain_kernel = 1; - pe.disabled = 1; pe.freq = 1; #if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) @@ -726,6 +726,46 @@ static void SetupSampling( int64_t& samplingPeriod ) s_numBuffers++; } + // CPU cycles + pe = {}; + pe.type = PERF_TYPE_HARDWARE; + pe.size = sizeof( perf_event_attr ); + pe.config = PERF_COUNT_HW_CPU_CYCLES; + pe.sample_freq = 25*1000*1000; + pe.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME; + pe.disabled = 1; + pe.exclude_kernel = 1; + pe.exclude_idle = 1; + pe.precise_ip = 2; +#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + pe.use_clockid = 1; + pe.clockid = CLOCK_MONOTONIC_RAW; +#endif + + for( int i=0; i( fd, EventCpuCycles ); + s_numBuffers++; + } + } + + // Instructions retired + pe.config = PERF_COUNT_HW_INSTRUCTIONS; + + for( int i=0; i( fd, EventInstructionsRetired ); + s_numBuffers++; + } + } + + s_threadSampling = (Thread*)tracy_malloc( sizeof( Thread ) ); new(s_threadSampling) Thread( [] (void*) { ThreadExitHandler threadExitHandler; @@ -760,77 +800,133 @@ static void SetupSampling( int64_t& samplingPeriod ) s_ring[i].Read( &hdr, 0, sizeof( perf_event_header ) ); if( hdr.type == PERF_RECORD_SAMPLE ) { - uint32_t pid, tid; - uint64_t t0; - uint64_t cnt; - auto offset = sizeof( perf_event_header ); - s_ring[i].Read( &pid, offset, sizeof( uint32_t ) ); - if( pid == currentPid ) + const auto id = s_ring[i].GetId(); + if( id == EventCallstack ) { - offset += sizeof( uint32_t ); - s_ring[i].Read( &tid, offset, sizeof( uint32_t ) ); - offset += sizeof( uint32_t ); - s_ring[i].Read( &t0, offset, sizeof( uint64_t ) ); - offset += sizeof( uint64_t ); - s_ring[i].Read( &cnt, offset, sizeof( uint64_t ) ); - offset += sizeof( uint64_t ); + // Layout: + // u32 pid, tid + // u64 time + // u64 cnt + // u64 ip[cnt] - if( cnt > 0 ) + uint32_t pid; + s_ring[i].Read( &pid, offset, sizeof( uint32_t ) ); + if( pid == currentPid ) { - auto trace = (uint64_t*)tracy_malloc( ( 1 + cnt ) * sizeof( uint64_t ) ); - s_ring[i].Read( trace+1, offset, sizeof( uint64_t ) * cnt ); + uint32_t tid; + uint64_t t0; + uint64_t cnt; + + offset += sizeof( uint32_t ); + s_ring[i].Read( &tid, offset, sizeof( uint32_t ) ); + offset += sizeof( uint32_t ); + s_ring[i].Read( &t0, offset, sizeof( uint64_t ) ); + offset += sizeof( uint64_t ); + s_ring[i].Read( &cnt, offset, sizeof( uint64_t ) ); + offset += sizeof( uint64_t ); + + if( cnt > 0 ) + { + auto trace = (uint64_t*)tracy_malloc( ( 1 + cnt ) * sizeof( uint64_t ) ); + s_ring[i].Read( trace+1, offset, sizeof( uint64_t ) * cnt ); #if defined __x86_64__ || defined _M_X64 - // remove non-canonical pointers - do - { - const auto test = (int64_t)trace[cnt]; - const auto m1 = test >> 63; - const auto m2 = test >> 47; - if( m1 == m2 ) break; - } - while( --cnt > 0 ); - for( uint64_t j=1; j> 63; - const auto m2 = test >> 47; - if( m1 != m2 ) trace[j] = 0; - } + // remove non-canonical pointers + do + { + const auto test = (int64_t)trace[cnt]; + const auto m1 = test >> 63; + const auto m2 = test >> 47; + if( m1 == m2 ) break; + } + while( --cnt > 0 ); + for( uint64_t j=1; j> 63; + const auto m2 = test >> 47; + if( m1 != m2 ) trace[j] = 0; + } #endif - // skip kernel frames - uint64_t j; - for( j=0; j= 0 ) break; - } - if( j == cnt ) - { - tracy_free( trace ); - } - else - { - if( j > 0 ) + // skip kernel frames + uint64_t j; + for( j=0; j= 0 ) break; } - memcpy( trace, &cnt, sizeof( uint64_t ) ); + if( j == cnt ) + { + tracy_free( trace ); + } + else + { + if( j > 0 ) + { + cnt -= j; + memmove( trace+1, trace+1+j, sizeof( uint64_t ) * cnt ); + } + memcpy( trace, &cnt, sizeof( uint64_t ) ); #if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) - t0 = s_ring[i].ConvertTimeToTsc( t0 ); + t0 = s_ring[i].ConvertTimeToTsc( t0 ); #endif - TracyLfqPrepare( QueueType::CallstackSample ); - MemWrite( &item->callstackSampleFat.time, t0 ); - MemWrite( &item->callstackSampleFat.thread, (uint64_t)tid ); - MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); - TracyLfqCommit; + TracyLfqPrepare( QueueType::CallstackSample ); + MemWrite( &item->callstackSampleFat.time, t0 ); + MemWrite( &item->callstackSampleFat.thread, (uint64_t)tid ); + MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); + TracyLfqCommit; + } } } } + else + { + // Layout: + // u64 ip + // u32 pid, tid + // u64 time + + uint32_t pid; + s_ring[i].Read( &pid, offset + sizeof( uint64_t ), sizeof( uint32_t ) ); + if( pid == currentPid ) + { + uint64_t ip, t0; + uint32_t tid; + + s_ring[i].Read( &ip, offset, sizeof( uint64_t ) ); + offset += sizeof( uint64_t ) + sizeof( uint32_t ); + s_ring[i].Read( &tid, offset, sizeof( uint32_t ) ); + offset += sizeof( uint32_t ); + s_ring[i].Read( &t0, offset, sizeof( uint64_t ) ); + +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + t0 = s_ring[i].ConvertTimeToTsc( t0 ); +#endif + + QueueType type; + switch( id ) + { + case EventCpuCycles: + type = QueueType::HwSampleCpuCycle; + break; + case EventInstructionsRetired: + type = QueueType::HwSampleInstructionRetired; + break; + default: + assert( false ); + break; + } + + TracyLfqPrepare( type ); + MemWrite( &item->hwSample.ip, ip ); + MemWrite( &item->hwSample.thread, (uint64_t)tid ); + MemWrite( &item->hwSample.time, t0 ); + TracyLfqCommit; + } + } } s_ring[i].Advance( hdr.size ); } diff --git a/common/TracyProtocol.hpp b/common/TracyProtocol.hpp index 2326a7f3..3cc90ccf 100644 --- a/common/TracyProtocol.hpp +++ b/common/TracyProtocol.hpp @@ -9,7 +9,7 @@ namespace tracy constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; } -enum : uint32_t { ProtocolVersion = 46 }; +enum : uint32_t { ProtocolVersion = 47 }; enum : uint16_t { BroadcastVersion = 2 }; using lz4sz_t = uint32_t; diff --git a/common/TracyQueue.hpp b/common/TracyQueue.hpp index a7690ddf..397057ba 100644 --- a/common/TracyQueue.hpp +++ b/common/TracyQueue.hpp @@ -82,6 +82,8 @@ enum class QueueType : uint8_t CodeInformation, SysTimeReport, TidToPid, + HwSampleCpuCycle, + HwSampleInstructionRetired, PlotConfig, ParamSetup, AckServerQueryNoop, @@ -473,6 +475,13 @@ struct QueueTidToPid uint64_t pid; }; +struct QueueHwSample +{ + int64_t time; + uint64_t thread; + uint64_t ip; +}; + enum class PlotFormatType : uint8_t { Number, @@ -567,6 +576,7 @@ struct QueueItem QueueContextSwitch contextSwitch; QueueThreadWakeup threadWakeup; QueueTidToPid tidToPid; + QueueHwSample hwSample; QueuePlotConfig plotConfig; QueueParamSetup paramSetup; QueueCpuTopology cpuTopology; @@ -653,6 +663,8 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueCodeInformation ), sizeof( QueueHeader ) + sizeof( QueueSysTime ), sizeof( QueueHeader ) + sizeof( QueueTidToPid ), + sizeof( QueueHeader ) + sizeof( QueueHwSample ), // cpu cycle + sizeof( QueueHeader ) + sizeof( QueueHwSample ), // instruction retired sizeof( QueueHeader ) + sizeof( QueuePlotConfig ), sizeof( QueueHeader ) + sizeof( QueueParamSetup ), sizeof( QueueHeader ), // server query acknowledgement