1
0
mirror of https://github.com/wolfpld/tracy.git synced 2025-03-20 07:40:02 +08:00
tracy/public/tracy/TracyMetal.hmm
Marcos Slomp 04cf494d9c bugfixes
2024-08-26 08:15:47 -07:00

519 lines
22 KiB
Plaintext

#ifndef __TRACYMETAL_HMM__
#define __TRACYMETAL_HMM__
#ifndef TRACY_ENABLE
#define TracyMetalContext(device) nullptr
#define TracyMetalDestroy(ctx)
#define TracyMetalContextName(ctx, name, size)
#define TracyMetalZone(ctx, name)
#define TracyMetalZoneC(ctx, name, color)
#define TracyMetalNamedZone(ctx, varname, name, active)
#define TracyMetalNamedZoneC(ctx, varname, name, color, active)
#define TracyMetalZoneTransient(ctx, varname, name, active)
#define TracyMetalZoneS(ctx, name, depth)
#define TracyMetalZoneCS(ctx, name, color, depth)
#define TracyMetalNamedZoneS(ctx, varname, name, depth, active)
#define TracyMetalNamedZoneCS(ctx, varname, name, color, depth, active)
#define TracyMetalZoneTransientS(ctx, varname, name, depth, active)
#define TracyMetalCollect(ctx)
namespace tracy
{
class MetalZoneScope {};
}
using TracyMetalCtx = void*;
#else
#include <atomic>
#include <assert.h>
#include <stdlib.h>
#include "Tracy.hpp"
#include "../client/TracyProfiler.hpp"
#include "../client/TracyCallstack.hpp"
#include "../common/TracyAlign.hpp"
#include "../common/TracyAlloc.hpp"
// ok to import if in obj-c code
#import <Metal/Metal.h>
#define TracyMetalPanic(msg, ...) do { assert(false && "TracyMetal: " msg); TracyMessageLC("TracyMetal: " msg, tracy::Color::Red4); fprintf(stderr, "TracyMetal: %s\n", msg); __VA_ARGS__; } while(false);
namespace tracy
{
class MetalCtx
{
friend class MetalZoneScope;
enum { MaxQueries = 4 * 1024 }; // Metal: between 8 and 32768 _BYTES_...
public:
MetalCtx(id<MTLDevice> device)
: m_device(device)
{
if (m_device == nil)
{
TracyMetalPanic("device is nil.", return);
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary])
{
TracyMetalPanic("timestamp sampling at pipeline stage boundary is not supported.", return);
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary])
{
TracyMetalPanic("timestamp sampling at draw call boundary is not supported.", /* return */);
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary])
{
TracyMetalPanic("timestamp sampling at blit boundary is not supported.", /* return */);
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary])
{
TracyMetalPanic("timestamp sampling at compute dispatch boundary is not supported.", /* return */);
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtTileDispatchBoundary])
{
TracyMetalPanic("timestamp sampling at tile dispatch boundary is not supported.", /* return */);
}
id<MTLCounterSet> timestampCounterSet = nil;
for (id<MTLCounterSet> counterSet in m_device.counterSets)
{
if ([counterSet.name isEqualToString:MTLCommonCounterSetTimestamp])
{
timestampCounterSet = counterSet;
break;
}
}
if (timestampCounterSet == nil)
{
TracyMetalPanic("timestamp counters are not supported on the platform.", return);
}
MTLCounterSampleBufferDescriptor* sampleDescriptor = [[MTLCounterSampleBufferDescriptor alloc] init];
sampleDescriptor.counterSet = timestampCounterSet;
sampleDescriptor.sampleCount = MaxQueries;
sampleDescriptor.storageMode = MTLStorageModeShared;
sampleDescriptor.label = @"TracyMetalTimestampPool";
NSError* error = nil;
id<MTLCounterSampleBuffer> counterSampleBuffer = [m_device newCounterSampleBufferWithDescriptor:sampleDescriptor error:&error];
if (error != nil)
{
NSLog(@"%@", error.localizedDescription);
NSLog(@"%@", error.localizedFailureReason);
TracyMetalPanic("unable to create sample buffer for timestamp counters.", return);
}
m_counterSampleBuffer = counterSampleBuffer;
MTLTimestamp cpuTimestamp = 0;
MTLTimestamp gpuTimestamp = 0;
[m_device sampleTimestamps:&cpuTimestamp gpuTimestamp:&gpuTimestamp];
fprintf(stdout, "TracyMetal: Calibration: CPU timestamp: %llu\n", cpuTimestamp);
fprintf(stdout, "TracyMetal: Calibration: GPU timestamp: %llu\n", gpuTimestamp);
cpuTimestamp = Profiler::GetTime();
fprintf(stdout, "TracyMetal: Calibration: CPU timestamp (profiler): %llu\n", cpuTimestamp);
float period = 1.0f;
m_contextId = GetGpuCtxCounter().fetch_add(1);
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuNewContext);
MemWrite(&item->gpuNewContext.cpuTime, int64_t(cpuTimestamp));
MemWrite(&item->gpuNewContext.gpuTime, int64_t(gpuTimestamp));
MemWrite(&item->gpuNewContext.thread, uint32_t(0)); // #TODO: why not GetThreadHandle()?
MemWrite(&item->gpuNewContext.period, period);
MemWrite(&item->gpuNewContext.context, m_contextId);
//MemWrite(&item->gpuNewContext.flags, GpuContextCalibration);
MemWrite(&item->gpuNewContext.flags, GpuContextFlags(0));
MemWrite(&item->gpuNewContext.type, GpuContextType::Metal);
Profiler::QueueSerialFinish(); // TODO: DeferItem() for TRACY_ON_DEMAND
}
~MetalCtx()
{
}
static MetalCtx* Create(id<MTLDevice> device)
{
auto ctx = static_cast<MetalCtx*>(tracy_malloc(sizeof(MetalCtx)));
new (ctx) MetalCtx(device);
if (ctx->m_contextId == 255)
{
TracyMetalPanic("error during context creation.", Destroy(ctx); return nullptr);
}
return ctx;
}
static void Destroy(MetalCtx* ctx)
{
ctx->~MetalCtx();
tracy_free(ctx);
}
void Name( const char* name, uint16_t len )
{
auto ptr = (char*)tracy_malloc( len );
memcpy( ptr, name, len );
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuContextName );
MemWrite( &item->gpuContextNameFat.context, m_contextId );
MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr );
MemWrite( &item->gpuContextNameFat.size, len );
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item );
#endif
Profiler::QueueSerialFinish();
}
bool Collect()
{
ZoneScopedNC("TracyMetal::Collect", Color::Red4);
#ifdef TRACY_ON_DEMAND
if (!GetProfiler().IsConnected())
{
return true;
}
#endif
// Only one thread is allowed to collect timestamps at any given time
// but there's no need to block contending threads
if (!m_collectionMutex.try_lock())
{
return true;
}
std::unique_lock lock (m_collectionMutex, std::adopt_lock);
uintptr_t begin = m_previousCheckpoint.load();
uintptr_t latestCheckpoint = m_queryCounter.load(); // TODO: MTLEvent? MTLFence?;
uint32_t count = RingCount(begin, latestCheckpoint);
ZoneValue(begin);
ZoneValue(latestCheckpoint);
if (count == 0) // no pending timestamp queries
{
//uintptr_t nextCheckpoint = m_queryCounter.load();
//if (nextCheckpoint != latestCheckpoint)
//{
// // TODO: signal event / fence now?
//}
return true;
}
if (RingIndex(begin) + count > RingSize())
{
count = RingSize() - RingIndex(begin);
}
ZoneValue(count);
if (count >= MaxQueries)
{
fprintf(stdout, "TracyMetal: Collect: FULL [%llu, %llu] (%u)\n", begin, latestCheckpoint, count);
TracyMetalPanic("Collect: too many pending timestamp queries.", return false;);
}
NSRange range = NSMakeRange(RingIndex(begin), count);
NSData* data = [m_counterSampleBuffer resolveCounterRange:range];
NSUInteger numResolvedTimestamps = data.length / sizeof(MTLCounterResultTimestamp);
MTLCounterResultTimestamp* timestamps = (MTLCounterResultTimestamp *)(data.bytes);
if (timestamps == nil)
{
TracyMetalPanic("Collect: unable to resolve timestamps.", return false;);
}
if (numResolvedTimestamps != count)
{
fprintf(stdout, "TracyMetal: Collect: numResolvedTimestamps != count : %u != %u\n", numResolvedTimestamps, count);
}
for (auto i = 0; i < numResolvedTimestamps; i += 2)
{
static MTLTimestamp lastValidTimestamp = 0;
MTLTimestamp& t_start = timestamps[i+0].timestamp;
MTLTimestamp& t_end = timestamps[i+1].timestamp;
uint32_t k = RingIndex(begin + i);
fprintf(stdout, "TracyMetal: Collect: timestamp[%u] = %llu | timestamp[%u] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start));
if (t_start == MTLCounterErrorValue)
{
TracyMetalPanic("Collect: invalid timestamp: MTLCounterErrorValue (0xFF..FF).");
break;
}
if (t_start == 0 || t_end == 0) // zero is apparently also considered "invalid"...
{
static int HACK_retries = 0;
if (++HACK_retries > 8) {
fprintf(stdout, "TracyMetal: Collect: giving up...\n");
t_start = t_end = lastValidTimestamp + 100;
HACK_retries = 0;
} else {
TracyMetalPanic("Collect: invalid timestamp: zero.");
break;
}
}
m_previousCheckpoint += 2;
{
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuTime);
MemWrite(&item->gpuTime.gpuTime, static_cast<int64_t>(t_start));
MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(k));
MemWrite(&item->gpuTime.context, m_contextId);
Profiler::QueueSerialFinish();
}
{
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuTime);
MemWrite(&item->gpuTime.gpuTime, static_cast<int64_t>(t_end));
MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(k+1));
MemWrite(&item->gpuTime.context, m_contextId);
Profiler::QueueSerialFinish();
}
lastValidTimestamp = t_end;
t_start = t_end = MTLCounterErrorValue; // "reset" timestamps
}
ZoneValue(RingCount(begin, m_previousCheckpoint.load()));
//RecalibrateClocks(); // to account for drift
return true;
}
private:
tracy_force_inline uint32_t RingIndex(uintptr_t index)
{
index %= MaxQueries;
return static_cast<uint32_t>(index);
}
tracy_force_inline uint32_t RingCount(uintptr_t begin, uintptr_t end)
{
// wrap-around safe: all unsigned
uintptr_t count = end - begin;
return static_cast<uint32_t>(count);
}
tracy_force_inline uint32_t RingSize() const
{
return MaxQueries;
}
tracy_force_inline unsigned int NextQueryId(int n=1)
{
ZoneScopedNC("TracyMetal::NextQueryId", tracy::Color::LightCoral);
auto id = m_queryCounter.fetch_add(n);
ZoneValue(id);
auto count = RingCount(m_previousCheckpoint, id);
if (count >= MaxQueries)
{
fprintf(stdout, "TracyMetal: NextQueryId: FULL [%llu, %llu] (%u)\n", m_previousCheckpoint.load(), id, count);
TracyMetalPanic("NextQueryId: too many pending timestamp queries.");
// #TODO: return some sentinel value; ideally a "hidden" query index
}
return RingIndex(id);
}
tracy_force_inline uint8_t GetContextId() const
{
return m_contextId;
}
uint8_t m_contextId = 255;
id<MTLDevice> m_device = nil;
id<MTLCounterSampleBuffer> m_counterSampleBuffer = nil;
using atomic_counter = std::atomic<uintptr_t>;
static_assert(atomic_counter::is_always_lock_free);
atomic_counter m_queryCounter = 0;
atomic_counter m_previousCheckpoint = 0;
atomic_counter::value_type m_nextCheckpoint = 0;
std::mutex m_collectionMutex;
};
class MetalZoneScope
{
public:
tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLComputePassDescriptor* desc, const SourceLocationData* srcloc, bool is_active )
#ifdef TRACY_ON_DEMAND
: m_active( is_active && GetProfiler().IsConnected() )
#else
: m_active( is_active )
#endif
{
if ( !m_active ) return;
if (desc == nil) TracyMetalPanic("pass descriptor is nil.");
m_ctx = ctx;
auto queryId = m_queryId = ctx->NextQueryId(2);
desc.sampleBufferAttachments[0].sampleBuffer = ctx->m_counterSampleBuffer;
desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = queryId;
desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = queryId+1;
auto* item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial );
MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() );
MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc );
MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() );
Profiler::QueueSerialFinish();
}
tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLBlitPassDescriptor* desc, const SourceLocationData* srcloc, bool is_active )
#ifdef TRACY_ON_DEMAND
: m_active( is_active && GetProfiler().IsConnected() )
#else
: m_active( is_active )
#endif
{
if ( !m_active ) return;
if (desc == nil) TracyMetalPanic("pass descriptor is nil.");
m_ctx = ctx;
auto queryId = m_queryId = ctx->NextQueryId(2);
desc.sampleBufferAttachments[0].sampleBuffer = ctx->m_counterSampleBuffer;
desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = queryId;
desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = queryId+1;
auto* item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial );
MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() );
MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc );
MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() );
Profiler::QueueSerialFinish();
}
tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLRenderPassDescriptor* desc, const SourceLocationData* srcloc, bool is_active )
#ifdef TRACY_ON_DEMAND
: m_active( is_active && GetProfiler().IsConnected() )
#else
: m_active( is_active )
#endif
{
if ( !m_active ) return;
if (desc == nil) TracyMetalPanic("pass descriptor is nil.");
m_ctx = ctx;
auto queryId = m_queryId = ctx->NextQueryId(2);
desc.sampleBufferAttachments[0].sampleBuffer = ctx->m_counterSampleBuffer;
desc.sampleBufferAttachments[0].startOfVertexSampleIndex = queryId;
desc.sampleBufferAttachments[0].endOfVertexSampleIndex = MTLCounterDontSample;
desc.sampleBufferAttachments[0].startOfFragmentSampleIndex = MTLCounterDontSample;
desc.sampleBufferAttachments[0].endOfFragmentSampleIndex = queryId+1;
auto* item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial );
MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() );
MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc );
MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() );
Profiler::QueueSerialFinish();
}
#if 0
tracy_force_inline MetalZoneScope( MetalCtx* ctx, id<MTLComputeCommandEncoder> cmdEncoder, const SourceLocationData* srcloc, bool is_active )
#ifdef TRACY_ON_DEMAND
: m_active( is_active && GetProfiler().IsConnected() )
#else
: m_active( is_active )
#endif
{
if( !m_active ) return;
m_ctx = ctx;
m_cmdEncoder = cmdEncoder;
auto queryId = m_queryId = ctx->NextQueryId();
[m_cmdEncoder sampleCountersInBuffer:m_ctx->m_counterSampleBuffer atSampleIndex:queryId withBarrier:YES];
auto* item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial );
MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() );
MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc );
MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() );
Profiler::QueueSerialFinish();
}
#endif
tracy_force_inline ~MetalZoneScope()
{
if( !m_active ) return;
auto queryId = m_queryId + 1;
auto* item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial );
MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() );
MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneEnd.context, m_ctx->GetContextId() );
Profiler::QueueSerialFinish();
}
private:
const bool m_active;
MetalCtx* m_ctx;
id<MTLComputeCommandEncoder> m_cmdEncoder;
uint32_t m_queryId = 0;
};
}
using TracyMetalCtx = tracy::MetalCtx*;
#define TracyMetalContext(device) tracy::MetalCtx::Create(device)
#define TracyMetalDestroy(ctx) tracy::MetalCtx::Destroy(ctx)
#define TracyMetalContextName(ctx, name, size) ctx->Name(name, size)
#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
# define TracyMetalZone( ctx, name ) TracyMetalNamedZoneS( ctx, ___tracy_gpu_zone, name, TRACY_CALLSTACK, true )
# define TracyMetalZoneC( ctx, name, color ) TracyMetalNamedZoneCS( ctx, ___tracy_gpu_zone, name, color, TRACY_CALLSTACK, true )
# define TracyMetalNamedZone( ctx, varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active );
# define TracyMetalNamedZoneC( ctx, varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active );
# define TracyMetalZoneTransient(ctx, varname, name, active) TracyMetalZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active)
#else
# define TracyMetalZone( ctx, cmdEnc, name ) TracyMetalNamedZone( ctx, ___tracy_gpu_zone, cmdEnc, name, true )
# define TracyMetalZoneC( ctx, name, color ) TracyMetalNamedZoneC( ctx, ___tracy_gpu_zone, name, color, true )
# define TracyMetalNamedZone( ctx, varname, cmdEnc, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, cmdEnc, &TracyConcat(__tracy_gpu_source_location,TracyLine), active );
# define TracyMetalNamedZoneC( ctx, varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active );
# define TracyMetalZoneTransient(ctx, varname, name, active) tracy::MetalZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), active };
#endif
#ifdef TRACY_HAS_CALLSTACK
# define TracyMetalZoneS( ctx, name, depth ) TracyMetalNamedZoneS( ctx, ___tracy_gpu_zone, name, depth, true )
# define TracyMetalZoneCS( ctx, name, color, depth ) TracyMetalNamedZoneCS( ctx, ___tracy_gpu_zone, name, color, depth, true )
# define TracyMetalNamedZoneS( ctx, varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active );
# define TracyMetalNamedZoneCS( ctx, varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active );
# define TracyMetalZoneTransientS(ctx, varname, name, depth, active) tracy::MetalZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), depth, active };
#else
# define TracyMetalZoneS( ctx, name, depth, active ) TracyMetalZone( ctx, name )
# define TracyMetalZoneCS( ctx, name, color, depth, active ) TracyMetalZoneC( name, color )
# define TracyMetalNamedZoneS( ctx, varname, name, depth, active ) TracyMetalNamedZone( ctx, varname, name, active )
# define TracyMetalNamedZoneCS( ctx, varname, name, color, depth, active ) TracyMetalNamedZoneC( ctx, varname, name, color, active )
# define TracyMetalZoneTransientS(ctx, varname, name, depth, active) TracyMetalZoneTransient(ctx, varname, name, active)
#endif
#define TracyMetalCollect( ctx ) ctx->Collect();
#endif
#endif//__TRACYMETAL_HMM__