From a46f83364e71bb2ae7e9b54d3029858cd8c8f815 Mon Sep 17 00:00:00 2001 From: Thales Sabino Date: Wed, 27 May 2020 16:26:52 +0100 Subject: [PATCH 01/13] Add OpenCL trace support - Adds the file TracyOpenCL.hpp which contains the API to annotate OpenCL applications - It works in a similar fashion to the Vulkan annotations - Adds an example OpenCL application in examples/OpenCLVectorAdd - Adds "OpenCL Context" to the UI - Manual entry for annotating OpenCL zones --- AUTHORS | 1 + TracyOpenCL.hpp | 333 +++++++++++++++++++ common/TracyQueue.hpp | 3 +- examples/OpenCLVectorAdd/CMakeLists.txt | 14 + examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp | 190 +++++++++++ manual/tracy.tex | 10 + server/TracyView.cpp | 5 +- 7 files changed, 553 insertions(+), 3 deletions(-) create mode 100644 TracyOpenCL.hpp create mode 100644 examples/OpenCLVectorAdd/CMakeLists.txt create mode 100644 examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp diff --git a/AUTHORS b/AUTHORS index 2719cf57..ca3b476e 100644 --- a/AUTHORS +++ b/AUTHORS @@ -7,3 +7,4 @@ Till Rathmann (DLL support) Sherief Farouk (compatibility fixes) Dedmen Miller (find zone bug fixes, improvements) Michał Cichoń (OSX call stack decoding backport) +Thales Sabino (OpenCL support) diff --git a/TracyOpenCL.hpp b/TracyOpenCL.hpp new file mode 100644 index 00000000..77c7de98 --- /dev/null +++ b/TracyOpenCL.hpp @@ -0,0 +1,333 @@ +#ifndef __TRACYOPENCL_HPP__ +#define __TRACYOPENCL_HPP__ + +#if !defined TRACY_ENABLE + +#define TracyCLContext(x, y) nullptr +#define TracyCLDestroy(x) +#define TracyCLNamedZone(c, x, y, z, w) +#define TracyCLNamedZoneC(c, x, y, z, w, a) +#define TracyCLZone(c, x, y) +#define TracyCLZoneC(c, x, y, z) +#define TracyCLCollect(c) + +#define TracyCLNamedZoneS(c, x, y, z, w, a) +#define TracyCLNamedZoneCS(c, x, y, z, w, v, a) +#define TracyCLZoneS(c, x, y, z) +#define TracyCLZoneCS(c, x, y, z, w) + +namespace tracy +{ + class OpenCLCtxScope {}; +} + +using TracyCLCtx = void*; + +#else + +#include + +#include +#include + +#include "Tracy.hpp" +#include "client/TracyCallstack.hpp" +#include "client/TracyProfiler.hpp" +#include "common/TracyAlloc.hpp" + +namespace tracy { + + enum class EventPhase : uint8_t + { + Begin, + End + }; + + struct EventInfo + { + cl_event event; + EventPhase phase; + }; + + class OpenCLCtx + { + public: + enum { QueryCount = 64 * 1024 }; + + OpenCLCtx(cl_context context, cl_device_id device) + : m_contextId(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed)) + , m_head(0) + , m_tail(0) + { + assert(m_contextId != 255); + + m_hostStartTime = Profiler::GetTime(); + m_deviceStartTime = GetDeviceTimestamp(context, device); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuNewContext); + MemWrite(&item->gpuNewContext.cpuTime, m_hostStartTime); + MemWrite(&item->gpuNewContext.gpuTime, m_hostStartTime); + memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); + MemWrite(&item->gpuNewContext.period, 1.0f); + MemWrite(&item->gpuNewContext.type, GpuContextType::OpenCL); + MemWrite(&item->gpuNewContext.context, (uint8_t) m_contextId); + MemWrite(&item->gpuNewContext.accuracyBits, (uint8_t)0); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem(*item); +#endif + Profiler::QueueSerialFinish(); + } + + void Collect() + { + ZoneScopedC(Color::Red4); + + if (m_tail == m_head) return; + +#ifdef TRACY_ON_DEMAND + if (!GetProfiler().IsConnected()) + { + m_head = m_tail = 0; + } +#endif + + while (m_tail != m_head) + { + EventInfo eventInfo = m_query[m_tail]; + cl_event event = eventInfo.event; + cl_int eventStatus; + cl_int err = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr); + assert(err == CL_SUCCESS); + if (eventStatus != CL_COMPLETE) return; + + cl_int eventInfoQuery = (eventInfo.phase == EventPhase::Begin) + ? CL_PROFILING_COMMAND_START + : CL_PROFILING_COMMAND_END; + + cl_ulong eventTimeStamp = 0; + err = clGetEventProfilingInfo(event, eventInfoQuery, sizeof(cl_ulong), &eventTimeStamp, nullptr); + assert(err == CL_SUCCESS); + assert(eventTimeStamp != 0); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, TimestampOffset(eventTimeStamp)); + MemWrite(&item->gpuTime.queryId, (uint16_t)m_tail); + MemWrite(&item->gpuTime.context, m_contextId); + Profiler::QueueSerialFinish(); + + if (eventInfo.phase == EventPhase::End) + { + // Done with the event, so release it + assert(clReleaseEvent(event) == CL_SUCCESS); + } + + m_tail = (m_tail + 1) % QueryCount; + } + } + + tracy_force_inline uint8_t GetId() const + { + return m_contextId; + } + + tracy_force_inline unsigned int NextQueryId(EventInfo eventInfo) + { + const auto id = m_head; + m_head = (m_head + 1) % QueryCount; + assert(m_head != m_tail); + m_query[id] = eventInfo; + return id; + } + + tracy_force_inline EventInfo& GetQuery(unsigned int id) + { + assert(id < QueryCount); + return m_query[id]; + } + + private: + tracy_force_inline int64_t GetHostStartTime() const + { + return m_hostStartTime; + } + + tracy_force_inline int64_t GetDeviceStartTime() const + { + return m_deviceStartTime; + } + + tracy_force_inline int64_t TimestampOffset(int64_t deviceTimestamp) const + { + return m_hostStartTime + (deviceTimestamp - m_deviceStartTime); + } + + tracy_force_inline int64_t GetDeviceTimestamp(cl_context context, cl_device_id device) const + { + cl_ulong deviceTimestamp = 0; + cl_int err = CL_SUCCESS; + cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); + assert(err == CL_SUCCESS); + uint32_t dummyValue = 42; + cl_mem dummyBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(uint32_t), nullptr, &err); + assert(err == CL_SUCCESS); + cl_event writeBufferEvent; + err = clEnqueueWriteBuffer(queue, dummyBuffer, CL_TRUE, 0, sizeof(uint32_t), &dummyValue, 0, nullptr, &writeBufferEvent); + assert(err == CL_SUCCESS); + err = clWaitForEvents(1, &writeBufferEvent); + assert(err == CL_SUCCESS); + cl_int eventStatus; + err = clGetEventInfo(writeBufferEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr); + assert(err == CL_SUCCESS); + assert(eventStatus == CL_COMPLETE); + err = clGetEventProfilingInfo(writeBufferEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &deviceTimestamp, nullptr); + assert(err == CL_SUCCESS); + err = clReleaseEvent(writeBufferEvent); + assert(err == CL_SUCCESS); + err = clReleaseMemObject(dummyBuffer); + assert(err == CL_SUCCESS); + err = clReleaseCommandQueue(queue); + assert(err == CL_SUCCESS); + + return (int64_t)deviceTimestamp; + } + + unsigned int m_contextId; + + EventInfo m_query[QueryCount]; + unsigned int m_head; + unsigned int m_tail; + + int64_t m_hostStartTime; + int64_t m_deviceStartTime; + }; + + class OpenCLCtxScope { + public: + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, bool is_active) +#ifdef TRACY_ON_DEMAND + : m_active(is_active&& GetProfiler().IsConnected()) +#else + : m_active(is_active) +#endif + , m_ctx(ctx) + , m_event(nullptr) + { + if (!m_active) return; + + m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin }); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, int depth, bool is_active) +#ifdef TRACY_ON_DEMAND + : m_active(is_active&& GetProfiler().IsConnected()) +#else + : m_active(is_active) +#endif + , m_ctx(ctx) + , m_event(nullptr) + { + if (!m_active) return; + + m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin }); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + Profiler::QueueSerialFinish(); + + GetProfiler().SendCallstack(depth); + } + + tracy_force_inline void SetEvent(cl_event event) + { + m_event = event; + assert(clRetainEvent(m_event) == CL_SUCCESS); + m_ctx->GetQuery(m_beginQueryId).event = m_event; + } + + tracy_force_inline ~OpenCLCtxScope() + { + const auto queryId = m_ctx->NextQueryId(EventInfo{ m_event, EventPhase::End }); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial); + MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneEnd.queryId, (uint16_t)queryId); + MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + const bool m_active; + OpenCLCtx* m_ctx; + cl_event m_event; + unsigned int m_beginQueryId; + }; + + static inline OpenCLCtx* CreateCLContext(cl_context context, cl_device_id device) + { + InitRPMallocThread(); + auto ctx = (OpenCLCtx*)tracy_malloc(sizeof(OpenCLCtx)); + new (ctx) OpenCLCtx(context, device); + return ctx; + } + + static inline void DestroyCLContext(OpenCLCtx* ctx) + { + ctx->~OpenCLCtx(); + tracy_free(ctx); + } + +} // namespace tracy + +using TracyCLCtx = tracy::OpenCLCtx*; + +#define TracyCLContext(context, device) tracy::CreateCLContext(context, device); +#define TracyCLDestroy(ctx) tracy::DestroyCLContext(ctx); +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyCLNamedZone(ctx, varname, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define TracyCLNamedZoneC(ctx, varname, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define TracyCLZone(ctx, name) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, TRACY_CALLSTACK, true) +# define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, TRACY_CALLSTACK, true) +#else +# define TracyCLNamedZone(ctx, varname, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active); +# define TracyCLNamedZoneC(ctx, varname, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active); +# define TracyCLZone(ctx, name) TracyCLNamedZone(ctx, __tracy_gpu_zone, name, true) +# define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneC(ctx, __tracy_gpu_zone, name, color, true ) +#endif + +#ifdef TRACY_HAS_CALLSTACK +# define TracyCLNamedZoneS(ctx, varname, name, depth, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active); +# define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active); +# define TracyCLZoneS(ctx, name, depth) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, depth, true) +# define TracyCLZoneCS(ctx, name, color, depth) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, depth, true) +#else +#define TracyCLNamedZoneS(ctx, varname, name, depth, active) TracyCLNamedZone(ctx, varname, name, active) +#define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) TracyCLNamedZoneC(ctx, varname, name, color, active) +#define TracyCLZoneS(ctx, name, depth) TracyCLZone(ctx, name) +#define TracyCLZoneCS(ctx, name, color, depth) TracyCLZoneC(ctx, name, color) +#endif + +#define TracyCLNamedZoneSetEvent(varname, event) varname.SetEvent(event) +#define TracyCLZoneSetEvent(event) __tracy_gpu_zone.SetEvent(event) + +#define TracyCLCollect(ctx) ctx->Collect() + +#endif + +#endif diff --git a/common/TracyQueue.hpp b/common/TracyQueue.hpp index f791410b..9257f5a6 100644 --- a/common/TracyQueue.hpp +++ b/common/TracyQueue.hpp @@ -263,7 +263,8 @@ enum class GpuContextType : uint8_t { Invalid, OpenGl, - Vulkan + Vulkan, + OpenCL }; struct QueueGpuNewContext diff --git a/examples/OpenCLVectorAdd/CMakeLists.txt b/examples/OpenCLVectorAdd/CMakeLists.txt new file mode 100644 index 00000000..e5b0bfca --- /dev/null +++ b/examples/OpenCLVectorAdd/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.0) + +project(OpenCLVectorAdd) + +find_package(OpenCL REQUIRED) + +add_executable(OpenCLVectorAdd OpenCLVectorAdd.cpp) + +add_library(TracyClient STATIC ../../TracyClient.cpp + ../../TracyOpenCL.hpp) +target_include_directories(TracyClient PUBLIC ../../) +target_compile_definitions(TracyClient PUBLIC TRACY_ENABLE=1) + +target_link_libraries(OpenCLVectorAdd PUBLIC OpenCL::OpenCL TracyClient) diff --git a/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp b/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp new file mode 100644 index 00000000..d499424f --- /dev/null +++ b/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp @@ -0,0 +1,190 @@ +#include +#include +#include +#include +#include + +#include + +#include +#include + +#define CL_ASSERT(err) \ + if((err) != CL_SUCCESS) \ + { \ + std::cerr << "OpenCL Call Returned " << err << std::endl; \ + assert(false); \ + } + +const char kernelSource[] = +" void __kernel vectorAdd(global float* C, global float* A, global float* B, int N) " +" { " +" int i = get_global_id(0); " +" if (i < N) { " +" C[i] = A[i] + B[i]; " +" } " +" } "; + +int main() +{ + cl_platform_id platform; + cl_device_id device; + cl_context context; + cl_command_queue commandQueue; + cl_kernel vectorAddKernel; + cl_program program; + cl_int err; + cl_mem bufferA, bufferB, bufferC; + + TracyCLCtx tracyCLCtx; + + { + ZoneScopedN("OpenCL Init"); + + cl_uint numPlatforms = 0; + CL_ASSERT(clGetPlatformIDs(0, nullptr, &numPlatforms)); + + if (numPlatforms == 0) + { + std::cerr << "Cannot find OpenCL platform to run this application" << std::endl; + return 1; + } + + CL_ASSERT(clGetPlatformIDs(1, &platform, nullptr)); + + size_t platformNameBufferSize = 0; + CL_ASSERT(clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, nullptr, &platformNameBufferSize)); + std::string platformName(platformNameBufferSize, '\0'); + CL_ASSERT(clGetPlatformInfo(platform, CL_PLATFORM_NAME, platformNameBufferSize, &platformName[0], nullptr)); + + std::cout << "OpenCL Platform: " << platformName << std::endl; + + CL_ASSERT(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr)); + size_t deviceNameBufferSize = 0; + CL_ASSERT(clGetDeviceInfo(device, CL_DEVICE_NAME, 0, nullptr, &deviceNameBufferSize)); + std::string deviceName(deviceNameBufferSize, '\0'); + CL_ASSERT(clGetDeviceInfo(device, CL_DEVICE_NAME, deviceNameBufferSize, &deviceName[0], nullptr)); + + std::cout << "OpenCL Device: " << deviceName << std::endl; + + err = CL_SUCCESS; + context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err); + CL_ASSERT(err); + + size_t kernelSourceLength = sizeof(kernelSource); + const char* kernelSourceArray = { kernelSource }; + program = clCreateProgramWithSource(context, 1, &kernelSourceArray, &kernelSourceLength, &err); + CL_ASSERT(err); + + if (clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr) != CL_SUCCESS) + { + size_t programBuildLogBufferSize = 0; + CL_ASSERT(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr, &programBuildLogBufferSize)); + std::string programBuildLog(programBuildLogBufferSize, '\0'); + CL_ASSERT(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, programBuildLogBufferSize, &programBuildLog[0], nullptr)); + std::clog << programBuildLog << std::endl; + return 1; + } + + vectorAddKernel = clCreateKernel(program, "vectorAdd", &err); + CL_ASSERT(err); + + commandQueue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); + CL_ASSERT(err); + } + + tracyCLCtx = TracyCLContext(context, device); + + size_t N = 10 * 1024 * 1024 / sizeof(float); // 10MB of floats + std::vector hostA, hostB, hostC; + + { + ZoneScopedN("Host Data Init"); + hostA.resize(N); + hostB.resize(N); + hostC.resize(N); + + std::iota(std::begin(hostA), std::end(hostA), 0); + std::iota(std::begin(hostB), std::end(hostB), 0); + } + + { + ZoneScopedN("Host to Device Memory Copy"); + + bufferA = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err); + CL_ASSERT(err); + bufferB = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err); + CL_ASSERT(err); + bufferC = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err); + CL_ASSERT(err); + + cl_event writeBufferAEvent, writeBufferBEvent; + { + ZoneScopedN("Write Buffer A"); + TracyCLZoneS(tracyCLCtx, "Write BufferA", 5); + + CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferA, CL_TRUE, 0, N * sizeof(float), hostA.data(), 0, nullptr, &writeBufferAEvent)); + + TracyCLZoneSetEvent(writeBufferAEvent); + } + { + ZoneScopedN("Write Buffer B"); + TracyCLZone(tracyCLCtx, "Write BufferB"); + + CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferB, CL_TRUE, 0, N * sizeof(float), hostB.data(), 0, nullptr, &writeBufferBEvent)); + + TracyCLZoneSetEvent(writeBufferBEvent); + } + } + + for (int i = 0; i < 10; ++i) + { + ZoneScopedN("VectorAdd Kernel Launch"); + TracyCLZoneC(tracyCLCtx, "VectorAdd Kernel", tracy::Color::Blue4); + + CL_ASSERT(clSetKernelArg(vectorAddKernel, 0, sizeof(cl_mem), &bufferC)); + CL_ASSERT(clSetKernelArg(vectorAddKernel, 1, sizeof(cl_mem), &bufferA)); + CL_ASSERT(clSetKernelArg(vectorAddKernel, 2, sizeof(cl_mem), &bufferB)); + CL_ASSERT(clSetKernelArg(vectorAddKernel, 3, sizeof(int), &static_cast(N))); + + cl_event vectorAddKernelEvent; + CL_ASSERT(clEnqueueNDRangeKernel(commandQueue, vectorAddKernel, 1, nullptr, &N, nullptr, 0, nullptr, &vectorAddKernelEvent)); + + CL_ASSERT(clWaitForEvents(1, &vectorAddKernelEvent)); + + TracyCLZoneSetEvent(vectorAddKernelEvent); + + cl_ulong kernelStartTime, kernelEndTime; + CL_ASSERT(clGetEventProfilingInfo(vectorAddKernelEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelStartTime, nullptr)); + CL_ASSERT(clGetEventProfilingInfo(vectorAddKernelEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, nullptr)); + std::cout << "VectorAdd Kernel Elapsed: " << ((kernelEndTime - kernelStartTime) / 1000) << " us" << std::endl; + } + + { + ZoneScopedN("Device to Host Memory Copy"); + TracyCLZone(tracyCLCtx, "Read Buffer C"); + + cl_event readbufferCEvent; + CL_ASSERT(clEnqueueReadBuffer(commandQueue, bufferC, CL_TRUE, 0, N * sizeof(float), hostC.data(), 0, nullptr, &readbufferCEvent)); + TracyCLZoneSetEvent(readbufferCEvent); + } + + CL_ASSERT(clFinish(commandQueue)); + + TracyCLCollect(tracyCLCtx); + + { + ZoneScopedN("Checking results"); + + for (int i = 0; i < N; ++i) + { + assert(hostC[i] == hostA[i] + hostB[i]); + } + } + + std::cout << "Results are correct!" << std::endl; + + TracyCLDestroy(tracyCLCtx); + + return 0; +} diff --git a/manual/tracy.tex b/manual/tracy.tex index 6fe40bf5..b53ad6a3 100644 --- a/manual/tracy.tex +++ b/manual/tracy.tex @@ -1197,6 +1197,16 @@ To mark a GPU zone use the \texttt{TracyVkZone(ctx, cmdbuf, name)} macro, where You also need to periodically collect the GPU events using the \texttt{TracyVkCollect(ctx, cmdbuf)} macro\footnote{It is considerably faster than the OpenGL's \texttt{TracyGpuCollect}.}. The provided command buffer must be in the recording state and outside of a render pass instance. +\subsubsection{OpenCL} + +OpenCL support is achieved by including the \texttt{tracy/TracyOpenCL.hpp} header file. Tracing OpenCL requires the creation of a Tracy OpenCL context using the macro \texttt{TracyCLContext(context, device)}, which will return an instance of \texttt{TracyCLCtx} object that must be used when creating zones. The specified \texttt{device} must be part of the \texttt{context}. Cleanup is performed using the \texttt{TracyCLDestroy(ctx)} macro. Although not common, it is possible to create multiple OpenCL contexts for the same application. + +To mark an OpenCL zone one must make sure that a valid OpenCL \texttt{cl\_event} object is available. The event will be the object that Tracy will use to query profiling information from the OpenCL driver. For this to work, all OpenCL queues must be created with the \texttt{CL\_QUEUE\_PROFILING\_ENABLE} property. + +OpenCL zones can be created with the \texttt{TracyCLZone(ctx, name)} where \texttt{name} will usually be a descriptive name for the operation represented by the \texttt{cl\_event}. Within the scope of the zone, you must call \texttt{TracyCLSetEvent(event)} for the event to be registered in Tracy. + +Similarly to Vulkan and OpenGL, you also need to periodically collect the OpenCL events using the \texttt{TracyCLCollect(ctx)} macro. A good place to perform this operation is after a \texttt{clFinish}, since this will ensure that any previous queued OpenCL commands will have finished by this point. + \subsubsection{Multiple zones in one scope} Putting more than one GPU zone macro in a single scope features the same issue as with the \texttt{ZoneScoped} macros, described in section~\ref{multizone} (but this time the variable name is \texttt{\_\_\_tracy\_gpu\_zone}). diff --git a/server/TracyView.cpp b/server/TracyView.cpp index 57581311..646c6267 100644 --- a/server/TracyView.cpp +++ b/server/TracyView.cpp @@ -75,7 +75,8 @@ constexpr const char* s_tracyStackFrames[] = { constexpr const char* GpuContextNames[] = { "Invalid", "OpenGL", - "Vulkan" + "Vulkan", + "OpenCL" }; @@ -2472,7 +2473,7 @@ void View::DrawZones() draw->AddTriangle( wpos + ImVec2( to/2, oldOffset + to/2 ), wpos + ImVec2( to/2, oldOffset + ty - to/2 ), wpos + ImVec2( to/2 + th, oldOffset + ty * 0.5 ), 0xFF886666, 2.0f ); } - const bool isMultithreaded = v->type == GpuContextType::Vulkan; + const bool isMultithreaded = (v->type == GpuContextType::Vulkan) || (v->type == GpuContextType::OpenCL); char buf[64]; sprintf( buf, "%s context %zu", GpuContextNames[(int)v->type], i ); DrawTextContrast( draw, wpos + ImVec2( ty, oldOffset ), showFull ? 0xFFFFAAAA : 0xFF886666, buf ); From 22ef78333d308adff39e2cc098a58766dd0bac85 Mon Sep 17 00:00:00 2001 From: Logan Buchy Date: Fri, 5 Jun 2020 21:30:16 -0700 Subject: [PATCH 02/13] Fix missing comma in ZoneScopedNCS macro --- Tracy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tracy.hpp b/Tracy.hpp index 7636578e..6425356e 100644 --- a/Tracy.hpp +++ b/Tracy.hpp @@ -150,7 +150,7 @@ # define ZoneScopedS( depth ) ZoneNamedS( ___tracy_scoped_zone, depth, true ) # define ZoneScopedNS( name, depth ) ZoneNamedNS( ___tracy_scoped_zone, name, depth, true ) # define ZoneScopedCS( color, depth ) ZoneNamedCS( ___tracy_scoped_zone, color, depth, true ) -# define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color depth, true ) +# define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color, depth, true ) # define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth ); # define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth ); From ecfeb01aad1fd06cf126a9a23cc72293994bfb02 Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Sat, 6 Jun 2020 12:31:47 +0200 Subject: [PATCH 03/13] Set source view content width to max value, regardless of clipping. --- server/TracySourceView.cpp | 15 +++++++++++++-- server/TracySourceView.hpp | 3 +++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/server/TracySourceView.cpp b/server/TracySourceView.cpp index b9a74c32..5da3835c 100644 --- a/server/TracySourceView.cpp +++ b/server/TracySourceView.cpp @@ -436,6 +436,7 @@ void SourceView::ParseSource( const char* fileName, const Worker& worker, const { if( m_file != fileName ) { + m_srcWidth = 0; m_file = fileName; m_fileStringIdx = worker.FindStringIdx( fileName ); m_lines.clear(); @@ -510,6 +511,7 @@ bool SourceView::Disassemble( uint64_t symAddr, const Worker& worker ) m_maxJumpLevel = 0; m_asmSelected = -1; m_asmCountBase = -1; + m_asmWidth = 0; if( symAddr == 0 ) return false; m_cpuArch = worker.GetCpuArch(); if( m_cpuArch == CpuArchUnknown ) return false; @@ -889,6 +891,7 @@ void SourceView::Render( const Worker& worker, View& view ) void SourceView::RenderSimpleSourceView() { + ImGui::SetNextWindowContentSize( ImVec2( m_srcWidth, 0 ) ); ImGui::BeginChild( "##sourceView", ImVec2( 0, 0 ), true, ImGuiWindowFlags_HorizontalScrollbar ); if( m_font ) ImGui::PushFont( m_font ); @@ -915,6 +918,8 @@ void SourceView::RenderSimpleSourceView() } RenderLine( line, lineNum++, 0, 0, 0, nullptr ); } + const auto& win = ImGui::GetCurrentWindow(); + m_srcWidth = win->DC.CursorMaxPos.x - win->DC.CursorStartPos.x; } else { @@ -1376,6 +1381,7 @@ void SourceView::RenderSymbolSourceView( uint32_t iptotal, unordered_flat_mapDC.CursorMaxPos.x - win->DC.CursorStartPos.x; } else { @@ -1654,6 +1662,7 @@ uint64_t SourceView::RenderSymbolAsmView( uint32_t iptotal, unordered_flat_mapDC.CursorMaxPos.x - win->DC.CursorStartPos.x; } else { @@ -2080,7 +2091,7 @@ void SourceView::RenderLine( const Line& line, int lineNum, uint32_t ipcnt, uint { const auto ty = ImGui::GetFontSize(); auto draw = ImGui::GetWindowDrawList(); - const auto w = ImGui::GetWindowWidth(); + const auto w = m_srcWidth; const auto wpos = ImGui::GetCursorScreenPos(); if( m_fileStringIdx == m_hoveredSource && lineNum == m_hoveredLine ) { @@ -2257,7 +2268,7 @@ void SourceView::RenderAsmLine( AsmLine& line, uint32_t ipcnt, uint32_t iptotal, { const auto ty = ImGui::GetFontSize(); auto draw = ImGui::GetWindowDrawList(); - const auto w = ImGui::GetWindowWidth(); + const auto w = m_asmWidth; const auto wpos = ImGui::GetCursorScreenPos(); if( m_selectedAddressesHover.find( line.addr ) != m_selectedAddressesHover.end() ) { diff --git a/server/TracySourceView.hpp b/server/TracySourceView.hpp index 3fce239b..75b80c54 100644 --- a/server/TracySourceView.hpp +++ b/server/TracySourceView.hpp @@ -239,6 +239,9 @@ private: unordered_flat_set m_srcSampleSelect; uint32_t m_asmGroupSelect = -1; uint32_t m_srcGroupSelect = -1; + + float m_srcWidth; + float m_asmWidth; }; } From 06158de6da69519a6d6335719930c6d91f3cfa9e Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Sat, 6 Jun 2020 12:36:44 +0200 Subject: [PATCH 04/13] Update README. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index db63cac8..b5933048 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Sponsor](.github/sponsor.png)](https://github.com/sponsors/wolfpld/) -### A real time, nanosecond resolution, remote telemetry frame profiler for games and other applications. +### A real time, nanosecond resolution, remote telemetry, hybrid frame and sampling profiler for games and other applications. Tracy supports profiling CPU (C, C++11, Lua), GPU (OpenGL, Vulkan), memory, locks, context switches, per-frame screenshots and more. From d35d9b60ffb28becbe1a0e8b556e206f62bd9f80 Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Sat, 6 Jun 2020 14:57:48 +0200 Subject: [PATCH 05/13] Bump protocol and version for OpenCL support. --- common/TracyProtocol.hpp | 2 +- server/TracyVersion.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/TracyProtocol.hpp b/common/TracyProtocol.hpp index d30aad0e..14deebc2 100644 --- a/common/TracyProtocol.hpp +++ b/common/TracyProtocol.hpp @@ -9,7 +9,7 @@ namespace tracy constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; } -enum : uint32_t { ProtocolVersion = 33 }; +enum : uint32_t { ProtocolVersion = 34 }; enum : uint32_t { BroadcastVersion = 1 }; using lz4sz_t = uint32_t; diff --git a/server/TracyVersion.hpp b/server/TracyVersion.hpp index d040147a..eb2542f9 100644 --- a/server/TracyVersion.hpp +++ b/server/TracyVersion.hpp @@ -7,7 +7,7 @@ namespace Version { enum { Major = 0 }; enum { Minor = 6 }; -enum { Patch = 14 }; +enum { Patch = 15 }; } } From eb497f2b9fe60dedf7a2a48597453316214fe132 Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Sat, 6 Jun 2020 15:00:57 +0200 Subject: [PATCH 06/13] Symbol resolution should be possible on iOS. --- manual/tracy.tex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manual/tracy.tex b/manual/tracy.tex index b53ad6a3..33f8140e 100644 --- a/manual/tracy.tex +++ b/manual/tracy.tex @@ -691,7 +691,7 @@ Memory & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\ GPU zones (OpenGL) & \faCheck & \faCheck & \faCheck & \faPoo & \faPoo & \\ GPU zones (Vulkan) & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \\ Call stacks & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\ -Symbol resolution & \faCheck & \faCheck & \faCheck & \faCheck & \faPoo & \faCheck \\ +Symbol resolution & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\ Crash handling & \faCheck & \faCheck & \faCheck & \faTimes & \faTimes & \faTimes \\ CPU usage probing & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\ Context switches & \faCheck & \faCheck & \faCheck & \faTimes & \faPoo & \faTimes \\ From e78bbf3492ca864ab1db2f860613c86b564ae45c Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Sat, 6 Jun 2020 15:35:58 +0200 Subject: [PATCH 07/13] Update NEWS. --- NEWS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NEWS b/NEWS index eba7e3e4..f8cdb644 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,9 @@ a mistake. v0.7 (xxxx-xx-xx) ----------------- +This is the last release which will be able to load pre-v0.6 traces. Use the +update utility to convert your old traces now! + - chrome:tracing importer now imports zone metadata from "args" key. - Added display of statistical mode to find zone menu. - Automatic stack sampling is now available on windows. @@ -61,6 +64,7 @@ v0.7 (xxxx-xx-xx) - Added ability to send simple integral values as extra payload for zones. - Per-frame zone times on the frames plot can now display self time. - Ability to bind only on localhost interface. +- OpenCL profiling. v0.6.3 (2020-02-13) ------------------- From bee70ee72bff055d9fdc9738fa56cf943e93d41d Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Sat, 6 Jun 2020 15:37:16 +0200 Subject: [PATCH 08/13] Add OpenCL to description. --- README.md | 2 +- manual/tracy.tex | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b5933048..fb07b323 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ### A real time, nanosecond resolution, remote telemetry, hybrid frame and sampling profiler for games and other applications. -Tracy supports profiling CPU (C, C++11, Lua), GPU (OpenGL, Vulkan), memory, locks, context switches, per-frame screenshots and more. +Tracy supports profiling CPU (C, C++11, Lua), GPU (OpenGL, Vulkan, OpenCL), memory, locks, context switches, per-frame screenshots and more. For usage **and build process** instructions, consult the user manual [at the following address](https://github.com/wolfpld/tracy/releases). diff --git a/manual/tracy.tex b/manual/tracy.tex index 33f8140e..a7a23299 100644 --- a/manual/tracy.tex +++ b/manual/tracy.tex @@ -117,7 +117,7 @@ Hello and welcome to the Tracy Profiler user manual! Here you will find all the \section{A quick look at Tracy Profiler} \label{quicklook} -Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that can be used for remote or embedded telemetry of games and other applications. It can profile CPU (C, C++11, Lua), GPU (OpenGL, Vulkan) and memory. It also can monitor locks held by threads and show where contention does happen. +Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that can be used for remote or embedded telemetry of games and other applications. It can profile CPU (C, C++11, Lua), GPU (OpenGL, Vulkan, OpenCL) and memory. It also can monitor locks held by threads and show where contention does happen. While Tracy can perform statistical analysis of sampled call stack data, just like other \emph{statistical profilers} (such as VTune, perf or Very Sleepy), it mainly focuses on manual markup of the source code, which allows frame-by-frame inspection of the program execution. You will be able to see exactly which functions are called, how much time is spent in them, and how do they interact with each other in a multi-threaded environment. In contrast, the statistical analysis may show you the hot spots in your code, but it is unable to accurately pinpoint the underlying cause for semi-random frame stutter that may occur every couple of seconds. From 9c49ee3dd3c51a02325a2f4d72deea6d49ca9217 Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Sat, 6 Jun 2020 20:46:46 +0200 Subject: [PATCH 09/13] Don't mark windows as write-modified when only reading data. --- server/TracySourceView.cpp | 10 +++++----- server/TracyView.cpp | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/server/TracySourceView.cpp b/server/TracySourceView.cpp index 5da3835c..8fb30fb5 100644 --- a/server/TracySourceView.cpp +++ b/server/TracySourceView.cpp @@ -918,7 +918,7 @@ void SourceView::RenderSimpleSourceView() } RenderLine( line, lineNum++, 0, 0, 0, nullptr ); } - const auto& win = ImGui::GetCurrentWindow(); + const auto win = ImGui::GetCurrentWindowRead(); m_srcWidth = win->DC.CursorMaxPos.x - win->DC.CursorStartPos.x; } else @@ -1416,7 +1416,7 @@ void SourceView::RenderSymbolSourceView( uint32_t iptotal, unordered_flat_mapDC.CursorMaxPos.x - win->DC.CursorStartPos.x; } else @@ -1443,7 +1443,7 @@ void SourceView::RenderSymbolSourceView( uint32_t iptotal, unordered_flat_mapScrollbarY ) { auto draw = ImGui::GetWindowDrawList(); @@ -1689,7 +1689,7 @@ uint64_t SourceView::RenderSymbolAsmView( uint32_t iptotal, unordered_flat_mapDC.CursorMaxPos.x - win->DC.CursorStartPos.x; } else @@ -1859,7 +1859,7 @@ uint64_t SourceView::RenderSymbolAsmView( uint32_t iptotal, unordered_flat_mapScrollbarY ) { auto draw = ImGui::GetWindowDrawList(); diff --git a/server/TracyView.cpp b/server/TracyView.cpp index dce59576..63580b12 100644 --- a/server/TracyView.cpp +++ b/server/TracyView.cpp @@ -1293,7 +1293,7 @@ void View::DrawFrames() enum { MaxFrameTime = 50 * 1000 * 1000 }; // 50ms - ImGuiWindow* window = ImGui::GetCurrentWindow(); + ImGuiWindow* window = ImGui::GetCurrentWindowRead(); if( window->SkipItems ) return; auto& io = ImGui::GetIO(); @@ -2300,7 +2300,7 @@ void View::DrawZones() if( m_vd.zvStart == m_vd.zvEnd ) return; assert( m_vd.zvStart < m_vd.zvEnd ); - if( ImGui::GetCurrentWindow()->SkipItems ) return; + if( ImGui::GetCurrentWindowRead()->SkipItems ) return; m_gpuThread = 0; m_gpuStart = 0; From c384ec132f4d2cb7cb79b0ee622f5141fd01f3dd Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Sat, 6 Jun 2020 20:50:24 +0200 Subject: [PATCH 10/13] Fix position of source separator line. --- server/TracySourceView.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/TracySourceView.cpp b/server/TracySourceView.cpp index 8fb30fb5..f8decbe0 100644 --- a/server/TracySourceView.cpp +++ b/server/TracySourceView.cpp @@ -1386,7 +1386,7 @@ void SourceView::RenderSymbolSourceView( uint32_t iptotal, unordered_flat_mapScroll.x, 0 ); const auto wh = ImGui::GetWindowHeight(); const auto ty = ImGui::GetFontSize(); const auto ts = ImGui::CalcTextSize( " " ).x; From 929d399995554d5a3750091f6b8b81264eef111f Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Mon, 8 Jun 2020 14:02:11 +0200 Subject: [PATCH 11/13] Fix determination of line width. --- server/TracySourceView.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/TracySourceView.cpp b/server/TracySourceView.cpp index f8decbe0..74c96397 100644 --- a/server/TracySourceView.cpp +++ b/server/TracySourceView.cpp @@ -2091,7 +2091,7 @@ void SourceView::RenderLine( const Line& line, int lineNum, uint32_t ipcnt, uint { const auto ty = ImGui::GetFontSize(); auto draw = ImGui::GetWindowDrawList(); - const auto w = m_srcWidth; + const auto w = std::max( m_srcWidth, ImGui::GetWindowWidth() ); const auto wpos = ImGui::GetCursorScreenPos(); if( m_fileStringIdx == m_hoveredSource && lineNum == m_hoveredLine ) { @@ -2268,7 +2268,7 @@ void SourceView::RenderAsmLine( AsmLine& line, uint32_t ipcnt, uint32_t iptotal, { const auto ty = ImGui::GetFontSize(); auto draw = ImGui::GetWindowDrawList(); - const auto w = m_asmWidth; + const auto w = std::max( m_asmWidth, ImGui::GetWindowWidth() ); const auto wpos = ImGui::GetCursorScreenPos(); if( m_selectedAddressesHover.find( line.addr ) != m_selectedAddressesHover.end() ) { From 1e8c842444260fc844a5e47049e29135135baeac Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Mon, 8 Jun 2020 18:27:41 +0200 Subject: [PATCH 12/13] Update manual. --- manual/tracy.tex | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/manual/tracy.tex b/manual/tracy.tex index a7a23299..e44f376b 100644 --- a/manual/tracy.tex +++ b/manual/tracy.tex @@ -1279,6 +1279,15 @@ To have proper call stack information, the profiled application must be compiled \item On MSVC open the project properties and go to \emph{Linker\textrightarrow Debugging\textrightarrow Generate Debug Info}, where the \emph{Generate Debug Information} option should be selected. \item On gcc or clang remember to specify the debugging information \texttt{-g} parameter during compilation and omit the strip symbols \texttt{-s} parameter. Link the executable with an additional option \texttt{-rdynamic} (or \texttt{-{}-export-dynamic}, if you are passing parameters directly to the linker). \item On OSX you may need to run \texttt{dsymutil} to extract the debugging data out of the executable binary. +\item On iOS you will have to add a \emph{New Run Script Phase} to your XCode project, which will execute the following shell script: + +\begin{lstlisting}[language=sh] +cp -rf ${TARGET_BUILD_DIR}/${WRAPPER_NAME}.dSYM/* ${TARGET_BUILD_DIR}/${UNLOCALIZED_RESOURCES_FOLDER_PATH}/${PRODUCT_NAME}.dSYM +\end{lstlisting} + +You will also need to setup proper dependencies, by setting the following input file:\\ +\texttt{\$\{TARGET\_BUILD\_DIR\}/\$\{WRAPPER\_NAME\}.dSYM}, and the following output file:\\ +\texttt{\$\{TARGET\_BUILD\_DIR\}/\$\{UNLOCALIZED\_RESOURCES\_FOLDER\_PATH\}/\$\{PRODUCT\_NAME\}.dSYM}. \end{itemize} You may also be interested in symbols from external libraries, especially if you have sampling profiling enabled (section~\ref{sampling}). In MSVC you can retrieve such symbols by going to \emph{Tools\textrightarrow Options\textrightarrow Debugging\textrightarrow Symbols} and selecting appropriate \emph{Symbol file (.pdb) location} servers. Note that additional symbols may significantly increase application startup times. From 284d49b34ba160b3f8e7b555bf162bf767e978ff Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Mon, 8 Jun 2020 19:35:42 +0200 Subject: [PATCH 13/13] Change rdtscp check to rdtsc check. --- client/TracyProfiler.cpp | 2 +- manual/tracy.tex | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/client/TracyProfiler.cpp b/client/TracyProfiler.cpp index 105411a5..812eef2e 100644 --- a/client/TracyProfiler.cpp +++ b/client/TracyProfiler.cpp @@ -224,7 +224,7 @@ static int64_t SetupHwTimer() #ifndef TRACY_TIMER_QPC uint32_t regs[4]; CpuId( regs, 0x80000001 ); - if( !( regs[3] & ( 1 << 27 ) ) ) InitFailure( "CPU doesn't support RDTSCP instruction." ); + if( !( regs[3] & ( 1 << 4 ) ) ) InitFailure( "CPU doesn't support RDTSC instruction." ); CpuId( regs, 0x80000007 ); if( !( regs[3] & ( 1 << 8 ) ) ) { diff --git a/manual/tracy.tex b/manual/tracy.tex index e44f376b..65882ca2 100644 --- a/manual/tracy.tex +++ b/manual/tracy.tex @@ -452,7 +452,7 @@ The first command will allow access to system CPU statistics. The second one wil \paragraph{Cloud service providers} -In some cases you actually don't own the hardware, but lend it from someone else. In such circumstances you might be running inside a virtual machine, which may be configured to prohibit you from using the bare metal facilities needed by Tracy\footnote{Or you might just be using a quite old CPU, which doesn't have support for required features.}. One example of such limitation would be lack of access to a reliable time stamp register readings, which will prevent the application from starting with either 'CPU doesn't support RDTSCP instruction' or 'CPU doesn't support invariant TSC' error message. If you are using Windows, you may workaround this issue by rebuilding the profiled application with the \texttt{TRACY\_TIMER\_QPC} macro, but be aware that it will severely lower the resolution of timer readings. +In some cases you actually don't own the hardware, but lend it from someone else. In such circumstances you might be running inside a virtual machine, which may be configured to prohibit you from using the bare metal facilities needed by Tracy\footnote{Or you might just be using a quite old CPU, which doesn't have support for required features.}. One example of such limitation would be lack of access to a reliable time stamp register readings, which will prevent the application from starting with either 'CPU doesn't support RDTSC instruction' or 'CPU doesn't support invariant TSC' error message. If you are using Windows, you may workaround this issue by rebuilding the profiled application with the \texttt{TRACY\_TIMER\_QPC} macro, but be aware that it will severely lower the resolution of timer readings. \subsubsection{Changing network port}