diff --git a/AUTHORS b/AUTHORS index 02914706..826e3e71 100644 --- a/AUTHORS +++ b/AUTHORS @@ -7,4 +7,5 @@ Till Rathmann (DLL support) Sherief Farouk (compatibility fixes) Dedmen Miller (find zone bug fixes, improvements) Michał Cichoń (OSX call stack decoding backport) +Thales Sabino (OpenCL support) Andrew Depke (Direct3D 12 support) diff --git a/NEWS b/NEWS index eba7e3e4..f8cdb644 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,9 @@ a mistake. v0.7 (xxxx-xx-xx) ----------------- +This is the last release which will be able to load pre-v0.6 traces. Use the +update utility to convert your old traces now! + - chrome:tracing importer now imports zone metadata from "args" key. - Added display of statistical mode to find zone menu. - Automatic stack sampling is now available on windows. @@ -61,6 +64,7 @@ v0.7 (xxxx-xx-xx) - Added ability to send simple integral values as extra payload for zones. - Per-frame zone times on the frames plot can now display self time. - Ability to bind only on localhost interface. +- OpenCL profiling. v0.6.3 (2020-02-13) ------------------- diff --git a/README.md b/README.md index db63cac8..fb07b323 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@ [![Sponsor](.github/sponsor.png)](https://github.com/sponsors/wolfpld/) -### A real time, nanosecond resolution, remote telemetry frame profiler for games and other applications. +### A real time, nanosecond resolution, remote telemetry, hybrid frame and sampling profiler for games and other applications. -Tracy supports profiling CPU (C, C++11, Lua), GPU (OpenGL, Vulkan), memory, locks, context switches, per-frame screenshots and more. +Tracy supports profiling CPU (C, C++11, Lua), GPU (OpenGL, Vulkan, OpenCL), memory, locks, context switches, per-frame screenshots and more. For usage **and build process** instructions, consult the user manual [at the following address](https://github.com/wolfpld/tracy/releases). diff --git a/Tracy.hpp b/Tracy.hpp index 7636578e..6425356e 100644 --- a/Tracy.hpp +++ b/Tracy.hpp @@ -150,7 +150,7 @@ # define ZoneScopedS( depth ) ZoneNamedS( ___tracy_scoped_zone, depth, true ) # define ZoneScopedNS( name, depth ) ZoneNamedNS( ___tracy_scoped_zone, name, depth, true ) # define ZoneScopedCS( color, depth ) ZoneNamedCS( ___tracy_scoped_zone, color, depth, true ) -# define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color depth, true ) +# define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color, depth, true ) # define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth ); # define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth ); diff --git a/TracyOpenCL.hpp b/TracyOpenCL.hpp new file mode 100644 index 00000000..77c7de98 --- /dev/null +++ b/TracyOpenCL.hpp @@ -0,0 +1,333 @@ +#ifndef __TRACYOPENCL_HPP__ +#define __TRACYOPENCL_HPP__ + +#if !defined TRACY_ENABLE + +#define TracyCLContext(x, y) nullptr +#define TracyCLDestroy(x) +#define TracyCLNamedZone(c, x, y, z, w) +#define TracyCLNamedZoneC(c, x, y, z, w, a) +#define TracyCLZone(c, x, y) +#define TracyCLZoneC(c, x, y, z) +#define TracyCLCollect(c) + +#define TracyCLNamedZoneS(c, x, y, z, w, a) +#define TracyCLNamedZoneCS(c, x, y, z, w, v, a) +#define TracyCLZoneS(c, x, y, z) +#define TracyCLZoneCS(c, x, y, z, w) + +namespace tracy +{ + class OpenCLCtxScope {}; +} + +using TracyCLCtx = void*; + +#else + +#include + +#include +#include + +#include "Tracy.hpp" +#include "client/TracyCallstack.hpp" +#include "client/TracyProfiler.hpp" +#include "common/TracyAlloc.hpp" + +namespace tracy { + + enum class EventPhase : uint8_t + { + Begin, + End + }; + + struct EventInfo + { + cl_event event; + EventPhase phase; + }; + + class OpenCLCtx + { + public: + enum { QueryCount = 64 * 1024 }; + + OpenCLCtx(cl_context context, cl_device_id device) + : m_contextId(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed)) + , m_head(0) + , m_tail(0) + { + assert(m_contextId != 255); + + m_hostStartTime = Profiler::GetTime(); + m_deviceStartTime = GetDeviceTimestamp(context, device); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuNewContext); + MemWrite(&item->gpuNewContext.cpuTime, m_hostStartTime); + MemWrite(&item->gpuNewContext.gpuTime, m_hostStartTime); + memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); + MemWrite(&item->gpuNewContext.period, 1.0f); + MemWrite(&item->gpuNewContext.type, GpuContextType::OpenCL); + MemWrite(&item->gpuNewContext.context, (uint8_t) m_contextId); + MemWrite(&item->gpuNewContext.accuracyBits, (uint8_t)0); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem(*item); +#endif + Profiler::QueueSerialFinish(); + } + + void Collect() + { + ZoneScopedC(Color::Red4); + + if (m_tail == m_head) return; + +#ifdef TRACY_ON_DEMAND + if (!GetProfiler().IsConnected()) + { + m_head = m_tail = 0; + } +#endif + + while (m_tail != m_head) + { + EventInfo eventInfo = m_query[m_tail]; + cl_event event = eventInfo.event; + cl_int eventStatus; + cl_int err = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr); + assert(err == CL_SUCCESS); + if (eventStatus != CL_COMPLETE) return; + + cl_int eventInfoQuery = (eventInfo.phase == EventPhase::Begin) + ? CL_PROFILING_COMMAND_START + : CL_PROFILING_COMMAND_END; + + cl_ulong eventTimeStamp = 0; + err = clGetEventProfilingInfo(event, eventInfoQuery, sizeof(cl_ulong), &eventTimeStamp, nullptr); + assert(err == CL_SUCCESS); + assert(eventTimeStamp != 0); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, TimestampOffset(eventTimeStamp)); + MemWrite(&item->gpuTime.queryId, (uint16_t)m_tail); + MemWrite(&item->gpuTime.context, m_contextId); + Profiler::QueueSerialFinish(); + + if (eventInfo.phase == EventPhase::End) + { + // Done with the event, so release it + assert(clReleaseEvent(event) == CL_SUCCESS); + } + + m_tail = (m_tail + 1) % QueryCount; + } + } + + tracy_force_inline uint8_t GetId() const + { + return m_contextId; + } + + tracy_force_inline unsigned int NextQueryId(EventInfo eventInfo) + { + const auto id = m_head; + m_head = (m_head + 1) % QueryCount; + assert(m_head != m_tail); + m_query[id] = eventInfo; + return id; + } + + tracy_force_inline EventInfo& GetQuery(unsigned int id) + { + assert(id < QueryCount); + return m_query[id]; + } + + private: + tracy_force_inline int64_t GetHostStartTime() const + { + return m_hostStartTime; + } + + tracy_force_inline int64_t GetDeviceStartTime() const + { + return m_deviceStartTime; + } + + tracy_force_inline int64_t TimestampOffset(int64_t deviceTimestamp) const + { + return m_hostStartTime + (deviceTimestamp - m_deviceStartTime); + } + + tracy_force_inline int64_t GetDeviceTimestamp(cl_context context, cl_device_id device) const + { + cl_ulong deviceTimestamp = 0; + cl_int err = CL_SUCCESS; + cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); + assert(err == CL_SUCCESS); + uint32_t dummyValue = 42; + cl_mem dummyBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(uint32_t), nullptr, &err); + assert(err == CL_SUCCESS); + cl_event writeBufferEvent; + err = clEnqueueWriteBuffer(queue, dummyBuffer, CL_TRUE, 0, sizeof(uint32_t), &dummyValue, 0, nullptr, &writeBufferEvent); + assert(err == CL_SUCCESS); + err = clWaitForEvents(1, &writeBufferEvent); + assert(err == CL_SUCCESS); + cl_int eventStatus; + err = clGetEventInfo(writeBufferEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr); + assert(err == CL_SUCCESS); + assert(eventStatus == CL_COMPLETE); + err = clGetEventProfilingInfo(writeBufferEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &deviceTimestamp, nullptr); + assert(err == CL_SUCCESS); + err = clReleaseEvent(writeBufferEvent); + assert(err == CL_SUCCESS); + err = clReleaseMemObject(dummyBuffer); + assert(err == CL_SUCCESS); + err = clReleaseCommandQueue(queue); + assert(err == CL_SUCCESS); + + return (int64_t)deviceTimestamp; + } + + unsigned int m_contextId; + + EventInfo m_query[QueryCount]; + unsigned int m_head; + unsigned int m_tail; + + int64_t m_hostStartTime; + int64_t m_deviceStartTime; + }; + + class OpenCLCtxScope { + public: + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, bool is_active) +#ifdef TRACY_ON_DEMAND + : m_active(is_active&& GetProfiler().IsConnected()) +#else + : m_active(is_active) +#endif + , m_ctx(ctx) + , m_event(nullptr) + { + if (!m_active) return; + + m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin }); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, int depth, bool is_active) +#ifdef TRACY_ON_DEMAND + : m_active(is_active&& GetProfiler().IsConnected()) +#else + : m_active(is_active) +#endif + , m_ctx(ctx) + , m_event(nullptr) + { + if (!m_active) return; + + m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin }); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + Profiler::QueueSerialFinish(); + + GetProfiler().SendCallstack(depth); + } + + tracy_force_inline void SetEvent(cl_event event) + { + m_event = event; + assert(clRetainEvent(m_event) == CL_SUCCESS); + m_ctx->GetQuery(m_beginQueryId).event = m_event; + } + + tracy_force_inline ~OpenCLCtxScope() + { + const auto queryId = m_ctx->NextQueryId(EventInfo{ m_event, EventPhase::End }); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial); + MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneEnd.queryId, (uint16_t)queryId); + MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + const bool m_active; + OpenCLCtx* m_ctx; + cl_event m_event; + unsigned int m_beginQueryId; + }; + + static inline OpenCLCtx* CreateCLContext(cl_context context, cl_device_id device) + { + InitRPMallocThread(); + auto ctx = (OpenCLCtx*)tracy_malloc(sizeof(OpenCLCtx)); + new (ctx) OpenCLCtx(context, device); + return ctx; + } + + static inline void DestroyCLContext(OpenCLCtx* ctx) + { + ctx->~OpenCLCtx(); + tracy_free(ctx); + } + +} // namespace tracy + +using TracyCLCtx = tracy::OpenCLCtx*; + +#define TracyCLContext(context, device) tracy::CreateCLContext(context, device); +#define TracyCLDestroy(ctx) tracy::DestroyCLContext(ctx); +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyCLNamedZone(ctx, varname, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define TracyCLNamedZoneC(ctx, varname, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define TracyCLZone(ctx, name) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, TRACY_CALLSTACK, true) +# define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, TRACY_CALLSTACK, true) +#else +# define TracyCLNamedZone(ctx, varname, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active); +# define TracyCLNamedZoneC(ctx, varname, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active); +# define TracyCLZone(ctx, name) TracyCLNamedZone(ctx, __tracy_gpu_zone, name, true) +# define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneC(ctx, __tracy_gpu_zone, name, color, true ) +#endif + +#ifdef TRACY_HAS_CALLSTACK +# define TracyCLNamedZoneS(ctx, varname, name, depth, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active); +# define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active); +# define TracyCLZoneS(ctx, name, depth) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, depth, true) +# define TracyCLZoneCS(ctx, name, color, depth) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, depth, true) +#else +#define TracyCLNamedZoneS(ctx, varname, name, depth, active) TracyCLNamedZone(ctx, varname, name, active) +#define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) TracyCLNamedZoneC(ctx, varname, name, color, active) +#define TracyCLZoneS(ctx, name, depth) TracyCLZone(ctx, name) +#define TracyCLZoneCS(ctx, name, color, depth) TracyCLZoneC(ctx, name, color) +#endif + +#define TracyCLNamedZoneSetEvent(varname, event) varname.SetEvent(event) +#define TracyCLZoneSetEvent(event) __tracy_gpu_zone.SetEvent(event) + +#define TracyCLCollect(ctx) ctx->Collect() + +#endif + +#endif diff --git a/client/TracyProfiler.cpp b/client/TracyProfiler.cpp index 105411a5..812eef2e 100644 --- a/client/TracyProfiler.cpp +++ b/client/TracyProfiler.cpp @@ -224,7 +224,7 @@ static int64_t SetupHwTimer() #ifndef TRACY_TIMER_QPC uint32_t regs[4]; CpuId( regs, 0x80000001 ); - if( !( regs[3] & ( 1 << 27 ) ) ) InitFailure( "CPU doesn't support RDTSCP instruction." ); + if( !( regs[3] & ( 1 << 4 ) ) ) InitFailure( "CPU doesn't support RDTSC instruction." ); CpuId( regs, 0x80000007 ); if( !( regs[3] & ( 1 << 8 ) ) ) { diff --git a/common/TracyProtocol.hpp b/common/TracyProtocol.hpp index d30aad0e..14deebc2 100644 --- a/common/TracyProtocol.hpp +++ b/common/TracyProtocol.hpp @@ -9,7 +9,7 @@ namespace tracy constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; } -enum : uint32_t { ProtocolVersion = 33 }; +enum : uint32_t { ProtocolVersion = 34 }; enum : uint32_t { BroadcastVersion = 1 }; using lz4sz_t = uint32_t; diff --git a/common/TracyQueue.hpp b/common/TracyQueue.hpp index 7ddc9ed6..4bd48b54 100644 --- a/common/TracyQueue.hpp +++ b/common/TracyQueue.hpp @@ -264,6 +264,7 @@ enum class GpuContextType : uint8_t Invalid, OpenGl, Vulkan, + OpenCL, Direct3D12 }; diff --git a/examples/OpenCLVectorAdd/CMakeLists.txt b/examples/OpenCLVectorAdd/CMakeLists.txt new file mode 100644 index 00000000..e5b0bfca --- /dev/null +++ b/examples/OpenCLVectorAdd/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.0) + +project(OpenCLVectorAdd) + +find_package(OpenCL REQUIRED) + +add_executable(OpenCLVectorAdd OpenCLVectorAdd.cpp) + +add_library(TracyClient STATIC ../../TracyClient.cpp + ../../TracyOpenCL.hpp) +target_include_directories(TracyClient PUBLIC ../../) +target_compile_definitions(TracyClient PUBLIC TRACY_ENABLE=1) + +target_link_libraries(OpenCLVectorAdd PUBLIC OpenCL::OpenCL TracyClient) diff --git a/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp b/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp new file mode 100644 index 00000000..d499424f --- /dev/null +++ b/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp @@ -0,0 +1,190 @@ +#include +#include +#include +#include +#include + +#include + +#include +#include + +#define CL_ASSERT(err) \ + if((err) != CL_SUCCESS) \ + { \ + std::cerr << "OpenCL Call Returned " << err << std::endl; \ + assert(false); \ + } + +const char kernelSource[] = +" void __kernel vectorAdd(global float* C, global float* A, global float* B, int N) " +" { " +" int i = get_global_id(0); " +" if (i < N) { " +" C[i] = A[i] + B[i]; " +" } " +" } "; + +int main() +{ + cl_platform_id platform; + cl_device_id device; + cl_context context; + cl_command_queue commandQueue; + cl_kernel vectorAddKernel; + cl_program program; + cl_int err; + cl_mem bufferA, bufferB, bufferC; + + TracyCLCtx tracyCLCtx; + + { + ZoneScopedN("OpenCL Init"); + + cl_uint numPlatforms = 0; + CL_ASSERT(clGetPlatformIDs(0, nullptr, &numPlatforms)); + + if (numPlatforms == 0) + { + std::cerr << "Cannot find OpenCL platform to run this application" << std::endl; + return 1; + } + + CL_ASSERT(clGetPlatformIDs(1, &platform, nullptr)); + + size_t platformNameBufferSize = 0; + CL_ASSERT(clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, nullptr, &platformNameBufferSize)); + std::string platformName(platformNameBufferSize, '\0'); + CL_ASSERT(clGetPlatformInfo(platform, CL_PLATFORM_NAME, platformNameBufferSize, &platformName[0], nullptr)); + + std::cout << "OpenCL Platform: " << platformName << std::endl; + + CL_ASSERT(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr)); + size_t deviceNameBufferSize = 0; + CL_ASSERT(clGetDeviceInfo(device, CL_DEVICE_NAME, 0, nullptr, &deviceNameBufferSize)); + std::string deviceName(deviceNameBufferSize, '\0'); + CL_ASSERT(clGetDeviceInfo(device, CL_DEVICE_NAME, deviceNameBufferSize, &deviceName[0], nullptr)); + + std::cout << "OpenCL Device: " << deviceName << std::endl; + + err = CL_SUCCESS; + context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err); + CL_ASSERT(err); + + size_t kernelSourceLength = sizeof(kernelSource); + const char* kernelSourceArray = { kernelSource }; + program = clCreateProgramWithSource(context, 1, &kernelSourceArray, &kernelSourceLength, &err); + CL_ASSERT(err); + + if (clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr) != CL_SUCCESS) + { + size_t programBuildLogBufferSize = 0; + CL_ASSERT(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr, &programBuildLogBufferSize)); + std::string programBuildLog(programBuildLogBufferSize, '\0'); + CL_ASSERT(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, programBuildLogBufferSize, &programBuildLog[0], nullptr)); + std::clog << programBuildLog << std::endl; + return 1; + } + + vectorAddKernel = clCreateKernel(program, "vectorAdd", &err); + CL_ASSERT(err); + + commandQueue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); + CL_ASSERT(err); + } + + tracyCLCtx = TracyCLContext(context, device); + + size_t N = 10 * 1024 * 1024 / sizeof(float); // 10MB of floats + std::vector hostA, hostB, hostC; + + { + ZoneScopedN("Host Data Init"); + hostA.resize(N); + hostB.resize(N); + hostC.resize(N); + + std::iota(std::begin(hostA), std::end(hostA), 0); + std::iota(std::begin(hostB), std::end(hostB), 0); + } + + { + ZoneScopedN("Host to Device Memory Copy"); + + bufferA = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err); + CL_ASSERT(err); + bufferB = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err); + CL_ASSERT(err); + bufferC = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err); + CL_ASSERT(err); + + cl_event writeBufferAEvent, writeBufferBEvent; + { + ZoneScopedN("Write Buffer A"); + TracyCLZoneS(tracyCLCtx, "Write BufferA", 5); + + CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferA, CL_TRUE, 0, N * sizeof(float), hostA.data(), 0, nullptr, &writeBufferAEvent)); + + TracyCLZoneSetEvent(writeBufferAEvent); + } + { + ZoneScopedN("Write Buffer B"); + TracyCLZone(tracyCLCtx, "Write BufferB"); + + CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferB, CL_TRUE, 0, N * sizeof(float), hostB.data(), 0, nullptr, &writeBufferBEvent)); + + TracyCLZoneSetEvent(writeBufferBEvent); + } + } + + for (int i = 0; i < 10; ++i) + { + ZoneScopedN("VectorAdd Kernel Launch"); + TracyCLZoneC(tracyCLCtx, "VectorAdd Kernel", tracy::Color::Blue4); + + CL_ASSERT(clSetKernelArg(vectorAddKernel, 0, sizeof(cl_mem), &bufferC)); + CL_ASSERT(clSetKernelArg(vectorAddKernel, 1, sizeof(cl_mem), &bufferA)); + CL_ASSERT(clSetKernelArg(vectorAddKernel, 2, sizeof(cl_mem), &bufferB)); + CL_ASSERT(clSetKernelArg(vectorAddKernel, 3, sizeof(int), &static_cast(N))); + + cl_event vectorAddKernelEvent; + CL_ASSERT(clEnqueueNDRangeKernel(commandQueue, vectorAddKernel, 1, nullptr, &N, nullptr, 0, nullptr, &vectorAddKernelEvent)); + + CL_ASSERT(clWaitForEvents(1, &vectorAddKernelEvent)); + + TracyCLZoneSetEvent(vectorAddKernelEvent); + + cl_ulong kernelStartTime, kernelEndTime; + CL_ASSERT(clGetEventProfilingInfo(vectorAddKernelEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelStartTime, nullptr)); + CL_ASSERT(clGetEventProfilingInfo(vectorAddKernelEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, nullptr)); + std::cout << "VectorAdd Kernel Elapsed: " << ((kernelEndTime - kernelStartTime) / 1000) << " us" << std::endl; + } + + { + ZoneScopedN("Device to Host Memory Copy"); + TracyCLZone(tracyCLCtx, "Read Buffer C"); + + cl_event readbufferCEvent; + CL_ASSERT(clEnqueueReadBuffer(commandQueue, bufferC, CL_TRUE, 0, N * sizeof(float), hostC.data(), 0, nullptr, &readbufferCEvent)); + TracyCLZoneSetEvent(readbufferCEvent); + } + + CL_ASSERT(clFinish(commandQueue)); + + TracyCLCollect(tracyCLCtx); + + { + ZoneScopedN("Checking results"); + + for (int i = 0; i < N; ++i) + { + assert(hostC[i] == hostA[i] + hostB[i]); + } + } + + std::cout << "Results are correct!" << std::endl; + + TracyCLDestroy(tracyCLCtx); + + return 0; +} diff --git a/manual/tracy.tex b/manual/tracy.tex index 19801be8..70df46f0 100644 --- a/manual/tracy.tex +++ b/manual/tracy.tex @@ -117,7 +117,7 @@ Hello and welcome to the Tracy Profiler user manual! Here you will find all the \section{A quick look at Tracy Profiler} \label{quicklook} -Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that can be used for remote or embedded telemetry of games and other applications. It can profile CPU (C, C++11, Lua), GPU (OpenGL, Vulkan, Direct3D 12) and memory. It also can monitor locks held by threads and show where contention does happen. +Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that can be used for remote or embedded telemetry of games and other applications. It can profile CPU (C, C++11, Lua), GPU (OpenGL, Vulkan, Direct3D 12, OpenCL) and memory. It also can monitor locks held by threads and show where contention does happen. While Tracy can perform statistical analysis of sampled call stack data, just like other \emph{statistical profilers} (such as VTune, perf or Very Sleepy), it mainly focuses on manual markup of the source code, which allows frame-by-frame inspection of the program execution. You will be able to see exactly which functions are called, how much time is spent in them, and how do they interact with each other in a multi-threaded environment. In contrast, the statistical analysis may show you the hot spots in your code, but it is unable to accurately pinpoint the underlying cause for semi-random frame stutter that may occur every couple of seconds. @@ -452,7 +452,7 @@ The first command will allow access to system CPU statistics. The second one wil \paragraph{Cloud service providers} -In some cases you actually don't own the hardware, but lend it from someone else. In such circumstances you might be running inside a virtual machine, which may be configured to prohibit you from using the bare metal facilities needed by Tracy\footnote{Or you might just be using a quite old CPU, which doesn't have support for required features.}. One example of such limitation would be lack of access to a reliable time stamp register readings, which will prevent the application from starting with either 'CPU doesn't support RDTSCP instruction' or 'CPU doesn't support invariant TSC' error message. If you are using Windows, you may workaround this issue by rebuilding the profiled application with the \texttt{TRACY\_TIMER\_QPC} macro, but be aware that it will severely lower the resolution of timer readings. +In some cases you actually don't own the hardware, but lend it from someone else. In such circumstances you might be running inside a virtual machine, which may be configured to prohibit you from using the bare metal facilities needed by Tracy\footnote{Or you might just be using a quite old CPU, which doesn't have support for required features.}. One example of such limitation would be lack of access to a reliable time stamp register readings, which will prevent the application from starting with either 'CPU doesn't support RDTSC instruction' or 'CPU doesn't support invariant TSC' error message. If you are using Windows, you may workaround this issue by rebuilding the profiled application with the \texttt{TRACY\_TIMER\_QPC} macro, but be aware that it will severely lower the resolution of timer readings. \subsubsection{Changing network port} @@ -691,7 +691,7 @@ Memory & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\ GPU zones (OpenGL) & \faCheck & \faCheck & \faCheck & \faPoo & \faPoo & \\ GPU zones (Vulkan) & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \\ Call stacks & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\ -Symbol resolution & \faCheck & \faCheck & \faCheck & \faCheck & \faPoo & \faCheck \\ +Symbol resolution & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\ Crash handling & \faCheck & \faCheck & \faCheck & \faTimes & \faTimes & \faTimes \\ CPU usage probing & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\ Context switches & \faCheck & \faCheck & \faCheck & \faTimes & \faPoo & \faTimes \\ @@ -1207,6 +1207,16 @@ Using GPU zones is the same as the Vulkan implementation, where the \texttt{Trac The macro \texttt{TracyD3D12NewFrame(ctx)} is used to mark a new frame, and should appear before or after recording command lists, similar to \texttt{FrameMark}. This macro is a key component that enables automatic query data synchronization, so the user doesn't have to worry about synchronizing GPU execution before invoking a collection. Event data can then be collected and sent to the profiler using the \texttt{TracyD3D12Collect(ctx)} macro. +\subsubsection{OpenCL} + +OpenCL support is achieved by including the \texttt{tracy/TracyOpenCL.hpp} header file. Tracing OpenCL requires the creation of a Tracy OpenCL context using the macro \texttt{TracyCLContext(context, device)}, which will return an instance of \texttt{TracyCLCtx} object that must be used when creating zones. The specified \texttt{device} must be part of the \texttt{context}. Cleanup is performed using the \texttt{TracyCLDestroy(ctx)} macro. Although not common, it is possible to create multiple OpenCL contexts for the same application. + +To mark an OpenCL zone one must make sure that a valid OpenCL \texttt{cl\_event} object is available. The event will be the object that Tracy will use to query profiling information from the OpenCL driver. For this to work, all OpenCL queues must be created with the \texttt{CL\_QUEUE\_PROFILING\_ENABLE} property. + +OpenCL zones can be created with the \texttt{TracyCLZone(ctx, name)} where \texttt{name} will usually be a descriptive name for the operation represented by the \texttt{cl\_event}. Within the scope of the zone, you must call \texttt{TracyCLSetEvent(event)} for the event to be registered in Tracy. + +Similarly to Vulkan and OpenGL, you also need to periodically collect the OpenCL events using the \texttt{TracyCLCollect(ctx)} macro. A good place to perform this operation is after a \texttt{clFinish}, since this will ensure that any previous queued OpenCL commands will have finished by this point. + \subsubsection{Multiple zones in one scope} Putting more than one GPU zone macro in a single scope features the same issue as with the \texttt{ZoneScoped} macros, described in section~\ref{multizone} (but this time the variable name is \texttt{\_\_\_tracy\_gpu\_zone}). @@ -1279,6 +1289,15 @@ To have proper call stack information, the profiled application must be compiled \item On MSVC open the project properties and go to \emph{Linker\textrightarrow Debugging\textrightarrow Generate Debug Info}, where the \emph{Generate Debug Information} option should be selected. \item On gcc or clang remember to specify the debugging information \texttt{-g} parameter during compilation and omit the strip symbols \texttt{-s} parameter. Link the executable with an additional option \texttt{-rdynamic} (or \texttt{-{}-export-dynamic}, if you are passing parameters directly to the linker). \item On OSX you may need to run \texttt{dsymutil} to extract the debugging data out of the executable binary. +\item On iOS you will have to add a \emph{New Run Script Phase} to your XCode project, which will execute the following shell script: + +\begin{lstlisting}[language=sh] +cp -rf ${TARGET_BUILD_DIR}/${WRAPPER_NAME}.dSYM/* ${TARGET_BUILD_DIR}/${UNLOCALIZED_RESOURCES_FOLDER_PATH}/${PRODUCT_NAME}.dSYM +\end{lstlisting} + +You will also need to setup proper dependencies, by setting the following input file:\\ +\texttt{\$\{TARGET\_BUILD\_DIR\}/\$\{WRAPPER\_NAME\}.dSYM}, and the following output file:\\ +\texttt{\$\{TARGET\_BUILD\_DIR\}/\$\{UNLOCALIZED\_RESOURCES\_FOLDER\_PATH\}/\$\{PRODUCT\_NAME\}.dSYM}. \end{itemize} You may also be interested in symbols from external libraries, especially if you have sampling profiling enabled (section~\ref{sampling}). In MSVC you can retrieve such symbols by going to \emph{Tools\textrightarrow Options\textrightarrow Debugging\textrightarrow Symbols} and selecting appropriate \emph{Symbol file (.pdb) location} servers. Note that additional symbols may significantly increase application startup times. diff --git a/server/TracySourceView.cpp b/server/TracySourceView.cpp index b9a74c32..74c96397 100644 --- a/server/TracySourceView.cpp +++ b/server/TracySourceView.cpp @@ -436,6 +436,7 @@ void SourceView::ParseSource( const char* fileName, const Worker& worker, const { if( m_file != fileName ) { + m_srcWidth = 0; m_file = fileName; m_fileStringIdx = worker.FindStringIdx( fileName ); m_lines.clear(); @@ -510,6 +511,7 @@ bool SourceView::Disassemble( uint64_t symAddr, const Worker& worker ) m_maxJumpLevel = 0; m_asmSelected = -1; m_asmCountBase = -1; + m_asmWidth = 0; if( symAddr == 0 ) return false; m_cpuArch = worker.GetCpuArch(); if( m_cpuArch == CpuArchUnknown ) return false; @@ -889,6 +891,7 @@ void SourceView::Render( const Worker& worker, View& view ) void SourceView::RenderSimpleSourceView() { + ImGui::SetNextWindowContentSize( ImVec2( m_srcWidth, 0 ) ); ImGui::BeginChild( "##sourceView", ImVec2( 0, 0 ), true, ImGuiWindowFlags_HorizontalScrollbar ); if( m_font ) ImGui::PushFont( m_font ); @@ -915,6 +918,8 @@ void SourceView::RenderSimpleSourceView() } RenderLine( line, lineNum++, 0, 0, 0, nullptr ); } + const auto win = ImGui::GetCurrentWindowRead(); + m_srcWidth = win->DC.CursorMaxPos.x - win->DC.CursorStartPos.x; } else { @@ -1376,11 +1381,12 @@ void SourceView::RenderSymbolSourceView( uint32_t iptotal, unordered_flat_mapScroll.x, 0 ); const auto wh = ImGui::GetWindowHeight(); const auto ty = ImGui::GetFontSize(); const auto ts = ImGui::CalcTextSize( " " ).x; @@ -1410,6 +1416,8 @@ void SourceView::RenderSymbolSourceView( uint32_t iptotal, unordered_flat_mapDC.CursorMaxPos.x - win->DC.CursorStartPos.x; } else { @@ -1435,7 +1443,7 @@ void SourceView::RenderSymbolSourceView( uint32_t iptotal, unordered_flat_mapScrollbarY ) { auto draw = ImGui::GetWindowDrawList(); @@ -1654,6 +1662,7 @@ uint64_t SourceView::RenderSymbolAsmView( uint32_t iptotal, unordered_flat_mapDC.CursorMaxPos.x - win->DC.CursorStartPos.x; } else { @@ -1848,7 +1859,7 @@ uint64_t SourceView::RenderSymbolAsmView( uint32_t iptotal, unordered_flat_mapScrollbarY ) { auto draw = ImGui::GetWindowDrawList(); @@ -2080,7 +2091,7 @@ void SourceView::RenderLine( const Line& line, int lineNum, uint32_t ipcnt, uint { const auto ty = ImGui::GetFontSize(); auto draw = ImGui::GetWindowDrawList(); - const auto w = ImGui::GetWindowWidth(); + const auto w = std::max( m_srcWidth, ImGui::GetWindowWidth() ); const auto wpos = ImGui::GetCursorScreenPos(); if( m_fileStringIdx == m_hoveredSource && lineNum == m_hoveredLine ) { @@ -2257,7 +2268,7 @@ void SourceView::RenderAsmLine( AsmLine& line, uint32_t ipcnt, uint32_t iptotal, { const auto ty = ImGui::GetFontSize(); auto draw = ImGui::GetWindowDrawList(); - const auto w = ImGui::GetWindowWidth(); + const auto w = std::max( m_asmWidth, ImGui::GetWindowWidth() ); const auto wpos = ImGui::GetCursorScreenPos(); if( m_selectedAddressesHover.find( line.addr ) != m_selectedAddressesHover.end() ) { diff --git a/server/TracySourceView.hpp b/server/TracySourceView.hpp index 3fce239b..75b80c54 100644 --- a/server/TracySourceView.hpp +++ b/server/TracySourceView.hpp @@ -239,6 +239,9 @@ private: unordered_flat_set m_srcSampleSelect; uint32_t m_asmGroupSelect = -1; uint32_t m_srcGroupSelect = -1; + + float m_srcWidth; + float m_asmWidth; }; } diff --git a/server/TracyVersion.hpp b/server/TracyVersion.hpp index d040147a..eb2542f9 100644 --- a/server/TracyVersion.hpp +++ b/server/TracyVersion.hpp @@ -7,7 +7,7 @@ namespace Version { enum { Major = 0 }; enum { Minor = 6 }; -enum { Patch = 14 }; +enum { Patch = 15 }; } } diff --git a/server/TracyView.cpp b/server/TracyView.cpp index fbf14ccc..2a725c4a 100644 --- a/server/TracyView.cpp +++ b/server/TracyView.cpp @@ -76,6 +76,7 @@ constexpr const char* GpuContextNames[] = { "Invalid", "OpenGL", "Vulkan", + "OpenCL", "Direct3D 12" }; @@ -1293,7 +1294,7 @@ void View::DrawFrames() enum { MaxFrameTime = 50 * 1000 * 1000 }; // 50ms - ImGuiWindow* window = ImGui::GetCurrentWindow(); + ImGuiWindow* window = ImGui::GetCurrentWindowRead(); if( window->SkipItems ) return; auto& io = ImGui::GetIO(); @@ -2300,7 +2301,7 @@ void View::DrawZones() if( m_vd.zvStart == m_vd.zvEnd ) return; assert( m_vd.zvStart < m_vd.zvEnd ); - if( ImGui::GetCurrentWindow()->SkipItems ) return; + if( ImGui::GetCurrentWindowRead()->SkipItems ) return; m_gpuThread = 0; m_gpuStart = 0; @@ -2473,7 +2474,8 @@ void View::DrawZones() draw->AddTriangle( wpos + ImVec2( to/2, oldOffset + to/2 ), wpos + ImVec2( to/2, oldOffset + ty - to/2 ), wpos + ImVec2( to/2 + th, oldOffset + ty * 0.5 ), 0xFF886666, 2.0f ); } - const bool isMultithreaded = (v->type == GpuContextType::Vulkan || v->type == GpuContextType::Direct3D12); + const bool isMultithreaded = (v->type == GpuContextType::Vulkan) || (v->type == GpuContextType::OpenCL) || (v->type == GpuContextType::Direct3D12); + char buf[64]; sprintf( buf, "%s context %zu", GpuContextNames[(int)v->type], i ); DrawTextContrast( draw, wpos + ImVec2( ty, oldOffset ), showFull ? 0xFFFFAAAA : 0xFF886666, buf );