From 4be5e0bfa13ed980403a9e923d3943a06983459c Mon Sep 17 00:00:00 2001 From: Andrew Depke Date: Sun, 7 Jun 2020 00:25:43 -0600 Subject: [PATCH 01/11] Initial Direct3D 12 profiling implementation --- TracyD3D12.hpp | 311 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 311 insertions(+) create mode 100644 TracyD3D12.hpp diff --git a/TracyD3D12.hpp b/TracyD3D12.hpp new file mode 100644 index 00000000..c9b4b5f7 --- /dev/null +++ b/TracyD3D12.hpp @@ -0,0 +1,311 @@ +#ifndef __TRACYD3D12_HPP__ +#define __TRACYD3D12_HPP__ + +#ifndef TRACY_ENABLE + +#define TracyD3D12Context(device, queue) nullptr +#define TracyD3D12Destroy(ctx) + +#define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) +#define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) +#define TracyD3D12Zone(ctx, cmdList, name) +#define TracyD3D12ZoneC(ctx, cmdList, name, color) + +#define TracyD3D12Collect(ctx) + +namespace tracy +{ + class D3D12ZoneScope {}; +} + +using TracyD3D12Ctx = void*; + +#else + +#include "Tracy.hpp" +#include "client/TracyProfiler.hpp" + +#include +#include +#include +#include +#include + +namespace tracy +{ + + // Command queue context. + class D3D12QueueCtx + { + friend class D3D12ZoneScope; + + static constexpr uint32_t MaxQueries = 64 * 1024; // Queries are begin and end markers, so we can store half as many total time durations. + + bool m_initialized = false; + + ID3D12Device* m_device; + uint8_t m_context; + Microsoft::WRL::ComPtr m_queryHeap; + Microsoft::WRL::ComPtr m_readbackBuffer; + + uint32_t m_queryLimit = MaxQueries; + uint32_t m_queryCounter = 0; + uint32_t m_previousQueryCounter = 0; + + public: + D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue) + : m_device(device) + , m_context(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed)) + { + // Verify we support timestamp queries on this queue. 
+ + if (queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY) + { + D3D12_FEATURE_DATA_D3D12_OPTIONS3 featureData{}; + + if (FAILED(device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &featureData, sizeof(featureData)))) + { + assert(false && "Platform does not support profiling of copy queues."); + } + } + + uint64_t timestampFrequency; + + if (FAILED(queue->GetTimestampFrequency(&timestampFrequency))) + { + assert(false && "Failed to get timestamp frequency."); + } + + uint64_t cpuTimestamp; + uint64_t gpuTimestamp; + + if (FAILED(queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp))) + { + assert(false && "Failed to get queue clock calibration."); + } + + cpuTimestamp = Profiler::GetTime(); + + D3D12_QUERY_HEAP_DESC heapDesc{}; + heapDesc.Type = queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY ? D3D12_QUERY_HEAP_TYPE_COPY_QUEUE_TIMESTAMP : D3D12_QUERY_HEAP_TYPE_TIMESTAMP; + heapDesc.Count = m_queryLimit; + heapDesc.NodeMask = 0; // #TODO: Support multiple adapters. + + while (FAILED(device->CreateQueryHeap(&heapDesc, IID_PPV_ARGS(&m_queryHeap)))) + { + m_queryLimit /= 2; + heapDesc.Count = m_queryLimit; + } + + // Create a readback buffer, which will be used as a destination for the query data. + + D3D12_RESOURCE_DESC readbackBufferDesc{}; + readbackBufferDesc.Alignment = 0; + readbackBufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; + readbackBufferDesc.Width = m_queryLimit * sizeof(uint64_t); + readbackBufferDesc.Height = 1; + readbackBufferDesc.DepthOrArraySize = 1; + readbackBufferDesc.Format = DXGI_FORMAT_UNKNOWN; + readbackBufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; // Buffers are always row major. 
+ readbackBufferDesc.MipLevels = 1; + readbackBufferDesc.SampleDesc.Count = 1; + readbackBufferDesc.SampleDesc.Quality = 0; + readbackBufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE; + + D3D12_HEAP_PROPERTIES readbackHeapProps{}; + readbackHeapProps.Type = D3D12_HEAP_TYPE_READBACK; + readbackHeapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN; + readbackHeapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN; + readbackHeapProps.CreationNodeMask = 0; + readbackHeapProps.VisibleNodeMask = 0; // #TODO: Support multiple adapters. + + if (FAILED(device->CreateCommittedResource(&readbackHeapProps, D3D12_HEAP_FLAG_NONE, &readbackBufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_readbackBuffer)))) + { + assert(false && "Failed to create query readback buffer."); + } + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuNewContext); + MemWrite(&item->gpuNewContext.cpuTime, cpuTimestamp); + MemWrite(&item->gpuNewContext.gpuTime, gpuTimestamp); + memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); + MemWrite(&item->gpuNewContext.period, 1E+09f / static_cast(timestampFrequency)); + MemWrite(&item->gpuNewContext.context, m_context); + MemWrite(&item->gpuNewContext.accuracyBits, uint8_t{ 0 }); + MemWrite(&item->gpuNewContext.type, GpuContextType::Vulkan); // #TEMP: Add a Direct3D12 context type in the server. + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem(*item); +#endif + + Profiler::QueueSerialFinish(); + + m_initialized = true; + } + + ~D3D12QueueCtx() {} + + void Collect() + { + ZoneScopedC(Color::Red4); + + // Check to see if we have any new queries. + if (m_queryCounter == m_previousQueryCounter) return; + +#ifdef TRACY_ON_DEMAND + if (!GetProfiler().IsConnected()) + { + m_queryCounter = 0; + + return; + } +#endif + + // Batch submit all of our query data to the profiler. + + // Map the readback buffer so we can fetch the query data from the GPU. 
+ void* readbackBufferMapping = nullptr; + + if (FAILED(m_readbackBuffer->Map(0, nullptr, &readbackBufferMapping))) + { + assert(false && "Failed to map readback buffer."); + } + + auto* timestampData = static_cast(readbackBufferMapping); + + for (uint32_t index = 0; index < m_queryCounter; ++index) + { + const auto timestamp = timestampData[(m_previousQueryCounter + index) % m_queryLimit]; + const auto queryId = m_previousQueryCounter + index; + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, timestamp); + MemWrite(&item->gpuTime.queryId, static_cast(queryId)); + MemWrite(&item->gpuTime.context, m_context); + + Profiler::QueueSerialFinish(); + } + + m_readbackBuffer->Unmap(0, nullptr); + + m_previousQueryCounter += m_queryCounter; + m_queryCounter = 0; + + if (m_previousQueryCounter >= m_queryLimit) + { + m_previousQueryCounter -= m_queryLimit; + } + } + + private: + tracy_force_inline uint32_t NextQueryId() + { + assert(m_queryCounter < m_queryLimit && "Submitted too many GPU queries! 
Consider increasing MaxQueries."); + + const uint32_t id = (m_previousQueryCounter + m_queryCounter) % m_queryLimit; + ++m_queryCounter; + + return id; + } + + tracy_force_inline uint8_t GetId() const + { + return m_context; + } + }; + + class D3D12ZoneScope + { + const bool m_active; + D3D12QueueCtx* m_ctx = nullptr; + ID3D12GraphicsCommandList* m_cmdList = nullptr; + + public: + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, bool active) +#ifdef TRACY_ON_DEMAND + : m_active(active && GetProfiler().IsConnected()) +#else + : m_active(active) +#endif + { + if (!m_active) return; + + m_ctx = ctx; + m_cmdList = cmdList; + + const auto queryId = ctx->NextQueryId(); + cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, queryId); + + auto* item = Profiler::QueueSerial(); +#if defined(TRACY_HAS_CALLSTACK) && defined(TRACY_CALLSTACK) + MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial); +#else + MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial); +#endif + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, reinterpret_cast(srcLocation)); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, static_cast(queryId)); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + + Profiler::QueueSerialFinish(); + +#if defined(TRACY_HAS_CALLSTACK) && defined(TRACY_CALLSTACK) + GetProfiler().SendCallstack(TRACY_CALLSTACK); +#endif + } + + tracy_force_inline ~D3D12ZoneScope() + { + if (!m_active) return; + + const auto queryId = m_ctx->NextQueryId(); + m_cmdList->EndQuery(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, queryId); + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial); + MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle()); + 
MemWrite(&item->gpuZoneEnd.queryId, static_cast(queryId)); + MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId()); + + Profiler::QueueSerialFinish(); + + m_cmdList->ResolveQueryData(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, queryId - 1, 2, m_ctx->m_readbackBuffer.Get(), (queryId - 1) * sizeof(uint64_t)); + } + }; + + static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue) + { + InitRPMallocThread(); + + auto* ctx = static_cast(tracy_malloc(sizeof(D3D12QueueCtx))); + new (ctx) D3D12QueueCtx{ device, queue }; + + return ctx; + } + + static inline void DestroyD3D12Context(D3D12QueueCtx* ctx) + { + ctx->~D3D12QueueCtx(); + tracy_free(ctx); + } + +} + +using TracyD3D12Ctx = tracy::D3D12QueueCtx*; + +#define TracyD3D12Context(device, queue) tracy::CreateD3D12Context(device, queue); +#define TracyD3D12Destroy(ctx) tracy::DestroyD3D12Context(ctx); + +#define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, __LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, __LINE__), active }; +#define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, __LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, __LINE__), active }; +#define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZone(ctx, ___tracy_gpu_zone, cmdList, name, true) +#define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneC(ctx, ___tracy_gpu_zone, cmdList, name, color, true) + +#define TracyD3D12Collect(ctx) ctx->Collect(); + +#endif + +#endif From 3282a8d27c62ce9ab4df6120b20e5350991235cd Mon Sep 17 00:00:00 2001 From: Andrew Depke Date: Sun, 7 Jun 2020 00:40:08 -0600 Subject: 
[PATCH 02/11] Added server support for D3D12 contexts --- TracyD3D12.hpp | 2 +- common/TracyQueue.hpp | 3 ++- server/TracyView.cpp | 5 +++-- server/TracyWorker.cpp | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/TracyD3D12.hpp b/TracyD3D12.hpp index c9b4b5f7..76bd7a87 100644 --- a/TracyD3D12.hpp +++ b/TracyD3D12.hpp @@ -132,7 +132,7 @@ namespace tracy MemWrite(&item->gpuNewContext.period, 1E+09f / static_cast(timestampFrequency)); MemWrite(&item->gpuNewContext.context, m_context); MemWrite(&item->gpuNewContext.accuracyBits, uint8_t{ 0 }); - MemWrite(&item->gpuNewContext.type, GpuContextType::Vulkan); // #TEMP: Add a Direct3D12 context type in the server. + MemWrite(&item->gpuNewContext.type, GpuContextType::Direct3D12); #ifdef TRACY_ON_DEMAND GetProfiler().DeferItem(*item); diff --git a/common/TracyQueue.hpp b/common/TracyQueue.hpp index f791410b..7ddc9ed6 100644 --- a/common/TracyQueue.hpp +++ b/common/TracyQueue.hpp @@ -263,7 +263,8 @@ enum class GpuContextType : uint8_t { Invalid, OpenGl, - Vulkan + Vulkan, + Direct3D12 }; struct QueueGpuNewContext diff --git a/server/TracyView.cpp b/server/TracyView.cpp index d41ac945..fbf14ccc 100644 --- a/server/TracyView.cpp +++ b/server/TracyView.cpp @@ -75,7 +75,8 @@ constexpr const char* s_tracyStackFrames[] = { constexpr const char* GpuContextNames[] = { "Invalid", "OpenGL", - "Vulkan" + "Vulkan", + "Direct3D 12" }; @@ -2472,7 +2473,7 @@ void View::DrawZones() draw->AddTriangle( wpos + ImVec2( to/2, oldOffset + to/2 ), wpos + ImVec2( to/2, oldOffset + ty - to/2 ), wpos + ImVec2( to/2 + th, oldOffset + ty * 0.5 ), 0xFF886666, 2.0f ); } - const bool isMultithreaded = v->type == GpuContextType::Vulkan; + const bool isMultithreaded = (v->type == GpuContextType::Vulkan || v->type == GpuContextType::Direct3D12); char buf[64]; sprintf( buf, "%s context %zu", GpuContextNames[(int)v->type], i ); DrawTextContrast( draw, wpos + ImVec2( ty, oldOffset ), showFull ? 
0xFFFFAAAA : 0xFF886666, buf ); diff --git a/server/TracyWorker.cpp b/server/TracyWorker.cpp index 3dfc6d0d..8baac096 100644 --- a/server/TracyWorker.cpp +++ b/server/TracyWorker.cpp @@ -5265,7 +5265,7 @@ void Worker::ProcessGpuZoneBeginImpl( GpuEvent* zone, const QueueGpuZoneBegin& e uint64_t ztid; if( ctx->thread == 0 ) { - // Vulkan context is not bound to any single thread. + // Vulkan and Direct3D 12 contexts are not bound to any single thread. zone->SetThread( CompressThread( ev.thread ) ); ztid = ev.thread; } From bffcc52536434ac72926a731bcf697b213465173 Mon Sep 17 00:00:00 2001 From: Andrew Depke Date: Sun, 7 Jun 2020 00:51:52 -0600 Subject: [PATCH 03/11] Updated AUTHORS. --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 2719cf57..02914706 100644 --- a/AUTHORS +++ b/AUTHORS @@ -7,3 +7,4 @@ Till Rathmann (DLL support) Sherief Farouk (compatibility fixes) Dedmen Miller (find zone bug fixes, improvements) Michał Cichoń (OSX call stack decoding backport) +Andrew Depke (Direct3D 12 support) From 03993072c5c9a6a6e4393bd4411e3fe9fa07fe01 Mon Sep 17 00:00:00 2001 From: Andrew Depke Date: Sun, 7 Jun 2020 01:03:43 -0600 Subject: [PATCH 04/11] Added mapping range to prevent debug layer warnings --- TracyD3D12.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/TracyD3D12.hpp b/TracyD3D12.hpp index 76bd7a87..10cb8a1e 100644 --- a/TracyD3D12.hpp +++ b/TracyD3D12.hpp @@ -163,10 +163,12 @@ namespace tracy // Batch submit all of our query data to the profiler. + D3D12_RANGE mapRange{ 0, m_queryLimit * sizeof(uint64_t) }; + // Map the readback buffer so we can fetch the query data from the GPU. 
void* readbackBufferMapping = nullptr; - if (FAILED(m_readbackBuffer->Map(0, nullptr, &readbackBufferMapping))) + if (FAILED(m_readbackBuffer->Map(0, &mapRange, &readbackBufferMapping))) { assert(false && "Failed to map readback buffer."); } From d15b83b669d9b23c9976c377ecc33bd6d41499ec Mon Sep 17 00:00:00 2001 From: Andrew Depke Date: Sun, 7 Jun 2020 02:05:51 -0600 Subject: [PATCH 05/11] Updated manual for Direct3D 12 --- manual/tracy.tex | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/manual/tracy.tex b/manual/tracy.tex index 6fe40bf5..8bda9138 100644 --- a/manual/tracy.tex +++ b/manual/tracy.tex @@ -117,7 +117,7 @@ Hello and welcome to the Tracy Profiler user manual! Here you will find all the \section{A quick look at Tracy Profiler} \label{quicklook} -Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that can be used for remote or embedded telemetry of games and other applications. It can profile CPU (C, C++11, Lua), GPU (OpenGL, Vulkan) and memory. It also can monitor locks held by threads and show where contention does happen. +Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that can be used for remote or embedded telemetry of games and other applications. It can profile CPU (C, C++11, Lua), GPU (OpenGL, Vulkan, Direct3D 12) and memory. It also can monitor locks held by threads and show where contention does happen. While Tracy can perform statistical analysis of sampled call stack data, just like other \emph{statistical profilers} (such as VTune, perf or Very Sleepy), it mainly focuses on manual markup of the source code, which allows frame-by-frame inspection of the program execution. You will be able to see exactly which functions are called, how much time is spent in them, and how do they interact with each other in a multi-threaded environment. 
In contrast, the statistical analysis may show you the hot spots in your code, but it is unable to accurately pinpoint the underlying cause for semi-random frame stutter that may occur every couple of seconds. @@ -1162,7 +1162,7 @@ This requirement is relaxed in the on-demand mode (section~\ref{ondemand}), beca \subsection{GPU profiling} \label{gpuprofiling} -Tracy provides bindings for profiling OpenGL and Vulkan execution time on GPU. +Tracy provides bindings for profiling OpenGL, Vulkan, and Direct3D 12 execution time on GPU. Note that the CPU and GPU timers may be not synchronized. You can correct the resulting desynchronization in the profiler's options (section~\ref{options}). @@ -1197,11 +1197,21 @@ To mark a GPU zone use the \texttt{TracyVkZone(ctx, cmdbuf, name)} macro, where You also need to periodically collect the GPU events using the \texttt{TracyVkCollect(ctx, cmdbuf)} macro\footnote{It is considerably faster than the OpenGL's \texttt{TracyGpuCollect}.}. The provided command buffer must be in the recording state and outside of a render pass instance. +\subsubsection{Direct3D 12} + +To enable Direct3D 12 support, include the \texttt{tracy/TracyD3D12.hpp} header file. Tracing Direct3D 12 queues is on par with the Vulkan implementation, where a \texttt{TracyD3D12Ctx} is returned from a call to \texttt{TracyD3D12Context(device, queue)}, which should be later cleaned up with the \texttt{TracyD3D12Destroy(ctx)} macro. Multiple contexts can be created, each with any queue type. + +The queue must have been created through the specified device, however a command list is not needed for this stage. + +Using GPU zones is the same as the Vulkan implementation, where the \texttt{TracyD3D12Zone(ctx, cmdList, name)} macro is used, with \texttt{name} as a string literal. \texttt{TracyD3D12ZoneC(ctx, cmdList, name, color)} can be used to create a custom-colored zone. The given command list must be in an open state. 
+ +Event data can then be collected and sent to the profiler using the \texttt{TracyD3D12Collect(ctx)} macro. + \subsubsection{Multiple zones in one scope} Putting more than one GPU zone macro in a single scope features the same issue as with the \texttt{ZoneScoped} macros, described in section~\ref{multizone} (but this time the variable name is \texttt{\_\_\_tracy\_gpu\_zone}). -To solve this problem, in case of OpenGL use the \texttt{TracyGpuNamedZone} macro in place of \texttt{TracyGpuZone} (or the color variant). The same applies to Vulkan -- replace \texttt{TracyVkZone} with \texttt{TracyVkNamedZone}. +To solve this problem, in case of OpenGL use the \texttt{TracyGpuNamedZone} macro in place of \texttt{TracyGpuZone} (or the color variant). The same applies to Vulkan and Direct3D 12 -- replace \texttt{TracyVkZone} with \texttt{TracyVkNamedZone} and \texttt{TracyD3D12Zone} with \texttt{TracyD3D12NamedZone}. Remember that you need to provide your own name for the created stack variable as the first parameter to the macros. @@ -1352,7 +1362,7 @@ Even if Tracy is disabled, you still have to pay the no-op function call cost. T In order to profile code written in C programming language, you will need to include the \texttt{tracy/TracyC.h} header file, which exposes the C API. -At the moment there's no support for C API based markup of locks, OpenGL, Vulkan or Lua. +At the moment there's no support for C API based markup of locks, OpenGL, Vulkan, Direct3D 12, or Lua. \begin{bclogo}[ noborder=true, @@ -2119,7 +2129,7 @@ On this combined view you will find the zones with locks and their associated th The left hand side \emph{index area} of the timeline view displays various labels (threads, locks), which can be categorized in the following way: \begin{itemize} -\item \emph{Light blue label} -- OpenGL/Vulkan context. Multi-threaded Vulkan contexts are additionally split into separate threads. +\item \emph{Light blue label} -- OpenGL/Vulkan/Direct3D context. 
Multi-threaded Vulkan and Direct3D 12 contexts are additionally split into separate threads. \item \emph{Pink label} -- CPU data graph. \item \emph{White label} -- A CPU thread. Will be replaced by a bright red label in a thread that has crashed (section~\ref{crashhandling}). If automated sampling was performed, clicking the~\LMB{}~left mouse button on the \emph{\faGhost{}~ghost zones} button will switch zone display mode between 'instrumented' and 'ghost'. \item \emph{Light red label} -- Indicates a lock. @@ -2154,7 +2164,7 @@ At high zoom levels, the zones will be displayed with additional markers, as pre \label{inaccuracy} \end{figure} -The GPU zones are displayed just like CPU zones, with an OpenGL/Vulkan context in place of a thread name. +The GPU zones are displayed just like CPU zones, with an OpenGL/Vulkan/Direct3D context in place of a thread name. Hovering the \faMousePointer{} mouse pointer over a zone will highlight all other zones that have the same source location with a white outline. Clicking the \LMB{}~left mouse button on a zone will open zone information window (section~\ref{zoneinfo}). Holding the \keys{\ctrl} key and clicking the \LMB{}~left mouse button on a zone will open zone statistics window (section~\ref{findzone}). Clicking the \MMB{}~middle mouse button on a zone will zoom the view to the extent of the zone. @@ -2294,7 +2304,7 @@ In this window you can set various trace-related options. The timeline view migh \begin{itemize} \item \emph{\faSignature{} Draw CPU usage graph} -- You can disable drawing of the CPU usage graph here. \end{itemize} -\item \emph{\faEye{} Draw GPU zones} -- Allows disabling display of OpenGL/Vulkan zones. The \emph{GPU zones} drop-down allows disabling individual GPU contexts and setting CPU/GPU drift offsets (see section~\ref{gpuprofiling} for more information). The \emph{\faRobot~Auto} button automatically measures the GPU drift value\footnote{There is an assumption that drift is linear. 
Automated measurement calculates and removes change over time in delay-to-execution of GPU zones. Resulting value may still be incorrect.}. +\item \emph{\faEye{} Draw GPU zones} -- Allows disabling display of OpenGL/Vulkan/Direct3D zones. The \emph{GPU zones} drop-down allows disabling individual GPU contexts and setting CPU/GPU drift offsets (see section~\ref{gpuprofiling} for more information). The \emph{\faRobot~Auto} button automatically measures the GPU drift value\footnote{There is an assumption that drift is linear. Automated measurement calculates and removes change over time in delay-to-execution of GPU zones. Resulting value may still be incorrect.}. \item \emph{\faMicrochip{} Draw CPU zones} -- Determines whether CPU zones are displayed. \begin{itemize} \item \emph{\faGhost{} Draw ghost zones} -- Controls if ghost zones should be displayed in threads which don't have any instrumented zones available. From c70922f3dbfa21d1987c5bef12b05ee34c86b867 Mon Sep 17 00:00:00 2001 From: Andrew Depke Date: Sun, 7 Jun 2020 04:55:20 -0600 Subject: [PATCH 06/11] Work on nested zones support --- TracyD3D12.hpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/TracyD3D12.hpp b/TracyD3D12.hpp index 10cb8a1e..b0436af9 100644 --- a/TracyD3D12.hpp +++ b/TracyD3D12.hpp @@ -39,7 +39,7 @@ namespace tracy { friend class D3D12ZoneScope; - static constexpr uint32_t MaxQueries = 64 * 1024; // Queries are begin and end markers, so we can store half as many total time durations. + static constexpr uint32_t MaxQueries = 64 * 1024; // Queries are begin and end markers, so we can store half as many total time durations. Must be even! bool m_initialized = false; @@ -206,7 +206,7 @@ namespace tracy assert(m_queryCounter < m_queryLimit && "Submitted too many GPU queries! 
Consider increasing MaxQueries."); const uint32_t id = (m_previousQueryCounter + m_queryCounter) % m_queryLimit; - ++m_queryCounter; + m_queryCounter += 2; // Allocate space for a begin and end query. return id; } @@ -222,6 +222,7 @@ namespace tracy const bool m_active; D3D12QueueCtx* m_ctx = nullptr; ID3D12GraphicsCommandList* m_cmdList = nullptr; + uint32_t m_queryId = 0; // Used for tracking in nested zones. public: tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, bool active) @@ -236,8 +237,8 @@ namespace tracy m_ctx = ctx; m_cmdList = cmdList; - const auto queryId = ctx->NextQueryId(); - cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, queryId); + m_queryId = ctx->NextQueryId(); + cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId); auto* item = Profiler::QueueSerial(); #if defined(TRACY_HAS_CALLSTACK) && defined(TRACY_CALLSTACK) @@ -248,7 +249,7 @@ namespace tracy MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); MemWrite(&item->gpuZoneBegin.srcloc, reinterpret_cast(srcLocation)); MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); - MemWrite(&item->gpuZoneBegin.queryId, static_cast(queryId)); + MemWrite(&item->gpuZoneBegin.queryId, static_cast(m_queryId)); MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); Profiler::QueueSerialFinish(); @@ -262,7 +263,7 @@ namespace tracy { if (!m_active) return; - const auto queryId = m_ctx->NextQueryId(); + const auto queryId = m_queryId + 1; // Our end query slot is immediately after the begin slot. 
m_cmdList->EndQuery(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, queryId); auto* item = Profiler::QueueSerial(); @@ -274,7 +275,7 @@ namespace tracy Profiler::QueueSerialFinish(); - m_cmdList->ResolveQueryData(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, queryId - 1, 2, m_ctx->m_readbackBuffer.Get(), (queryId - 1) * sizeof(uint64_t)); + m_cmdList->ResolveQueryData(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId, 2, m_ctx->m_readbackBuffer.Get(), m_queryId * sizeof(uint64_t)); } }; From c8bfa43f22fd37184c10bb8377922cf1bea0b8fc Mon Sep 17 00:00:00 2001 From: Andrew Depke Date: Mon, 8 Jun 2020 04:02:54 -0600 Subject: [PATCH 07/11] Added query data sorting to support out-of-order execution --- TracyD3D12.hpp | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/TracyD3D12.hpp b/TracyD3D12.hpp index b0436af9..39e62cc5 100644 --- a/TracyD3D12.hpp +++ b/TracyD3D12.hpp @@ -30,6 +30,8 @@ using TracyD3D12Ctx = void*; #include #include #include +#include +#include namespace tracy { @@ -175,9 +177,31 @@ namespace tracy auto* timestampData = static_cast(readbackBufferMapping); - for (uint32_t index = 0; index < m_queryCounter; ++index) + // First off we need to sort our query data. Without this, out-of-order command list execution (with respect to CPU timeline recording) + // would cause view artifacts in the viewer (zones disappear, take up the whole timeline, etc.) + + std::vector queryData; + queryData.resize(m_queryCounter); + + if (m_previousQueryCounter + m_queryCounter <= m_queryLimit) // Make sure we don't need to loop over. 
{ - const auto timestamp = timestampData[(m_previousQueryCounter + index) % m_queryLimit]; + std::copy(timestampData + m_previousQueryCounter, timestampData + m_previousQueryCounter + m_queryCounter, queryData.begin()); + } + + else + { + const auto firstBatch = (m_previousQueryCounter + m_queryCounter) - m_queryLimit; + std::copy(timestampData + m_previousQueryCounter, timestampData + m_queryLimit, queryData.begin()); + std::copy(timestampData, timestampData + (m_queryCounter - firstBatch), std::next(queryData.begin(), m_queryCounter - firstBatch)); + } + + std::sort(queryData.begin(), queryData.end(), std::less{}); + + // Data is sorted, send it to the profiler. + + for (uint32_t index = 0; index < queryData.size(); ++index) + { + const auto timestamp = queryData[index]; const auto queryId = m_previousQueryCounter + index; auto* item = Profiler::QueueSerial(); From 6e03bb1c2c5038827423ba03a13c45174585c0cb Mon Sep 17 00:00:00 2001 From: Andrew Depke Date: Mon, 8 Jun 2020 16:24:20 -0600 Subject: [PATCH 08/11] Reverted out-of-order execution sorting --- TracyD3D12.hpp | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/TracyD3D12.hpp b/TracyD3D12.hpp index 39e62cc5..b0436af9 100644 --- a/TracyD3D12.hpp +++ b/TracyD3D12.hpp @@ -30,8 +30,6 @@ using TracyD3D12Ctx = void*; #include #include #include -#include -#include namespace tracy { @@ -177,31 +175,9 @@ namespace tracy auto* timestampData = static_cast(readbackBufferMapping); - // First off we need to sort our query data. Without this, out-of-order command list execution (with respect to CPU timeline recording) - // would cause view artifacts in the viewer (zones disappear, take up the whole timeline, etc.) - - std::vector queryData; - queryData.resize(m_queryCounter); - - if (m_previousQueryCounter + m_queryCounter <= m_queryLimit) // Make sure we don't need to loop over. 
+ for (uint32_t index = 0; index < m_queryCounter; ++index) { - std::copy(timestampData + m_previousQueryCounter, timestampData + m_previousQueryCounter + m_queryCounter, queryData.begin()); - } - - else - { - const auto firstBatch = (m_previousQueryCounter + m_queryCounter) - m_queryLimit; - std::copy(timestampData + m_previousQueryCounter, timestampData + m_queryLimit, queryData.begin()); - std::copy(timestampData, timestampData + (m_queryCounter - firstBatch), std::next(queryData.begin(), m_queryCounter - firstBatch)); - } - - std::sort(queryData.begin(), queryData.end(), std::less{}); - - // Data is sorted, send it to the profiler. - - for (uint32_t index = 0; index < queryData.size(); ++index) - { - const auto timestamp = queryData[index]; + const auto timestamp = timestampData[(m_previousQueryCounter + index) % m_queryLimit]; const auto queryId = m_previousQueryCounter + index; auto* item = Profiler::QueueSerial(); From 94732725129bc91719da5e10c0dbc29831dbfb5d Mon Sep 17 00:00:00 2001 From: Andrew Depke Date: Mon, 8 Jun 2020 16:57:31 -0600 Subject: [PATCH 09/11] Fixed queryId not looping back --- TracyD3D12.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TracyD3D12.hpp b/TracyD3D12.hpp index b0436af9..805fdc5a 100644 --- a/TracyD3D12.hpp +++ b/TracyD3D12.hpp @@ -178,7 +178,7 @@ namespace tracy for (uint32_t index = 0; index < m_queryCounter; ++index) { const auto timestamp = timestampData[(m_previousQueryCounter + index) % m_queryLimit]; - const auto queryId = m_previousQueryCounter + index; + const auto queryId = (m_previousQueryCounter + index) % m_queryLimit; auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuTime); From 501b356b2ba5e8d5515e4c8f394f551e33b6ed99 Mon Sep 17 00:00:00 2001 From: Andrew Depke Date: Mon, 8 Jun 2020 22:57:27 -0600 Subject: [PATCH 10/11] Added semi-automatic query synchronization for N-buffered rendering --- TracyD3D12.hpp | 81 ++++++++++++++++++++++++++++++++++++-------------- 1 
file changed, 59 insertions(+), 22 deletions(-) diff --git a/TracyD3D12.hpp b/TracyD3D12.hpp index 805fdc5a..8f75d2ea 100644 --- a/TracyD3D12.hpp +++ b/TracyD3D12.hpp @@ -30,10 +30,17 @@ using TracyD3D12Ctx = void*; #include #include #include +#include namespace tracy { + struct D3D12QueryPayload + { + uint32_t m_queryIdStart = 0; + uint32_t m_queryCount = 0; + }; + // Command queue context. class D3D12QueueCtx { @@ -44,17 +51,24 @@ namespace tracy bool m_initialized = false; ID3D12Device* m_device; + ID3D12CommandQueue* m_queue; uint8_t m_context; Microsoft::WRL::ComPtr m_queryHeap; Microsoft::WRL::ComPtr m_readbackBuffer; + // In-progress payload. uint32_t m_queryLimit = MaxQueries; uint32_t m_queryCounter = 0; uint32_t m_previousQueryCounter = 0; + uint32_t m_activePayload = 0; + Microsoft::WRL::ComPtr m_payloadFence; + std::queue m_payloadQueue; + public: D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue) : m_device(device) + , m_queue(queue) , m_context(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed)) { // Verify we support timestamp queries on this queue. 
@@ -124,6 +138,11 @@ namespace tracy assert(false && "Failed to create query readback buffer."); } + if (FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_payloadFence)))) + { + assert(false && "Failed to create payload fence."); + } + auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuNewContext); MemWrite(&item->gpuNewContext.cpuTime, cpuTimestamp); @@ -143,15 +162,24 @@ namespace tracy m_initialized = true; } - ~D3D12QueueCtx() {} + void NewFrame() + { + m_payloadQueue.emplace(D3D12QueryPayload{ m_previousQueryCounter, m_queryCounter }); + m_previousQueryCounter += m_queryCounter; + m_queryCounter = 0; + + if (m_previousQueryCounter >= m_queryLimit) + { + m_previousQueryCounter -= m_queryLimit; + } + + m_queue->Signal(m_payloadFence.Get(), ++m_activePayload); + } void Collect() { ZoneScopedC(Color::Red4); - // Check to see if we have any new queries. - if (m_queryCounter == m_previousQueryCounter) return; - #ifdef TRACY_ON_DEMAND if (!GetProfiler().IsConnected()) { @@ -161,7 +189,14 @@ namespace tracy } #endif - // Batch submit all of our query data to the profiler. + // Find out what payloads are available. + const auto newestReadyPayload = m_payloadFence->GetCompletedValue(); + const auto payloadCount = m_payloadQueue.size() - (m_activePayload - newestReadyPayload); + + if (!payloadCount) + { + return; // No payloads are available yet, exit out. 
+ } D3D12_RANGE mapRange{ 0, m_queryLimit * sizeof(uint64_t) }; @@ -175,29 +210,29 @@ namespace tracy auto* timestampData = static_cast(readbackBufferMapping); - for (uint32_t index = 0; index < m_queryCounter; ++index) + for (uint32_t i = 0; i < payloadCount; ++i) { - const auto timestamp = timestampData[(m_previousQueryCounter + index) % m_queryLimit]; - const auto queryId = (m_previousQueryCounter + index) % m_queryLimit; + const auto& payload = m_payloadQueue.front(); - auto* item = Profiler::QueueSerial(); - MemWrite(&item->hdr.type, QueueType::GpuTime); - MemWrite(&item->gpuTime.gpuTime, timestamp); - MemWrite(&item->gpuTime.queryId, static_cast(queryId)); - MemWrite(&item->gpuTime.context, m_context); + for (uint32_t j = 0; j < payload.m_queryCount; ++j) + { + const auto counter = (payload.m_queryIdStart + j) % m_queryLimit; + const auto timestamp = timestampData[counter]; + const auto queryId = counter; - Profiler::QueueSerialFinish(); + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, timestamp); + MemWrite(&item->gpuTime.queryId, static_cast(queryId)); + MemWrite(&item->gpuTime.context, m_context); + + Profiler::QueueSerialFinish(); + } + + m_payloadQueue.pop(); } m_readbackBuffer->Unmap(0, nullptr); - - m_previousQueryCounter += m_queryCounter; - m_queryCounter = 0; - - if (m_previousQueryCounter >= m_queryLimit) - { - m_previousQueryCounter -= m_queryLimit; - } } private: @@ -302,6 +337,8 @@ using TracyD3D12Ctx = tracy::D3D12QueueCtx*; #define TracyD3D12Context(device, queue) tracy::CreateD3D12Context(device, queue); #define TracyD3D12Destroy(ctx) tracy::DestroyD3D12Context(ctx); +#define TracyD3D12NewFrame(ctx) ctx->NewFrame(); + #define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, __LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::D3D12ZoneScope varname{ ctx, 
cmdList, &TracyConcat(__tracy_gpu_source_location, __LINE__), active }; #define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, __LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, __LINE__), active }; #define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZone(ctx, ___tracy_gpu_zone, cmdList, name, true) From 7127e3621730edb2a183fa7c48b5041ba920769e Mon Sep 17 00:00:00 2001 From: Andrew Depke Date: Mon, 8 Jun 2020 23:40:16 -0600 Subject: [PATCH 11/11] Detailed TracyD3D12NewFrame and synchronization --- manual/tracy.tex | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manual/tracy.tex b/manual/tracy.tex index 8bda9138..19801be8 100644 --- a/manual/tracy.tex +++ b/manual/tracy.tex @@ -1199,13 +1199,13 @@ You also need to periodically collect the GPU events using the \texttt{TracyVkCo \subsubsection{Direct3D 12} -To enable Direct3D 12 support, include the \texttt{tracy/TracyD3D12.hpp} header file. Tracing Direct3D 12 queues is on par with the Vulkan implementation, where a \texttt{TracyD3D12Ctx} is returned from a call to \texttt{TracyD3D12Context(device, queue)}, which should be later cleaned up with the \texttt{TracyD3D12Destroy(ctx)} macro. Multiple contexts can be created, each with any queue type. +To enable Direct3D 12 support, include the \texttt{tracy/TracyD3D12.hpp} header file. Tracing Direct3D 12 queues is nearly on par with the Vulkan implementation, where a \texttt{TracyD3D12Ctx} is returned from a call to \texttt{TracyD3D12Context(device, queue)}, which should be later cleaned up with the \texttt{TracyD3D12Destroy(ctx)} macro. Multiple contexts can be created, each with any queue type. The queue must have been created through the specified device; however, a command list is not needed for this stage.
Using GPU zones is the same as the Vulkan implementation, where the \texttt{TracyD3D12Zone(ctx, cmdList, name)} macro is used, with \texttt{name} as a string literal. \texttt{TracyD3D12ZoneC(ctx, cmdList, name, color)} can be used to create a custom-colored zone. The given command list must be in an open state. -Event data can then be collected and sent to the profiler using the \texttt{TracyD3D12Collect(ctx)} macro. +The macro \texttt{TracyD3D12NewFrame(ctx)} marks a new frame and, similar to \texttt{FrameMark}, should appear either before or after recording command lists. This macro is a key component of automatic query data synchronization: with it in place, the user does not have to synchronize GPU execution manually before invoking a collection. Event data can then be collected and sent to the profiler using the \texttt{TracyD3D12Collect(ctx)} macro. \subsubsection{Multiple zones in one scope}