mirror of
https://github.com/wolfpld/tracy
synced 2025-04-29 20:33:52 +00:00
commit
f3f5f1dab8
@ -35,12 +35,28 @@ using TracyCLCtx = void*;
|
|||||||
|
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
#include "Tracy.hpp"
|
#include "Tracy.hpp"
|
||||||
#include "client/TracyCallstack.hpp"
|
#include "client/TracyCallstack.hpp"
|
||||||
#include "client/TracyProfiler.hpp"
|
#include "client/TracyProfiler.hpp"
|
||||||
#include "common/TracyAlloc.hpp"
|
#include "common/TracyAlloc.hpp"
|
||||||
|
|
||||||
|
#define TRACY_CL_TO_STRING_INDIRECT(T) #T
|
||||||
|
#define TRACY_CL_TO_STRING(T) TRACY_CL_TO_STRING_INDIRECT(T)
|
||||||
|
#define TRACY_CL_ASSERT(p) if(!(p)) { \
|
||||||
|
TracyMessageL( "TRACY_CL_ASSERT failed on " __FILE__ ":" TRACY_CL_TO_STRING(__LINE__) ); \
|
||||||
|
assert(false && "TRACY_CL_ASSERT failed"); \
|
||||||
|
}
|
||||||
|
#define TRACY_CL_CHECK_ERROR(err) if(err != CL_SUCCESS) { \
|
||||||
|
std::ostringstream oss; \
|
||||||
|
oss << "TRACY_CL_CHECK_ERROR failed on " << __FILE__ << ":" << __LINE__ \
|
||||||
|
<< ": error code " << err; \
|
||||||
|
auto msg = oss.str(); \
|
||||||
|
TracyMessage(msg.data(), msg.size()); \
|
||||||
|
assert(false && "TRACY_CL_CHECK_ERROR failed"); \
|
||||||
|
}
|
||||||
|
|
||||||
namespace tracy {
|
namespace tracy {
|
||||||
|
|
||||||
enum class EventPhase : uint8_t
|
enum class EventPhase : uint8_t
|
||||||
@ -66,34 +82,27 @@ namespace tracy {
|
|||||||
, m_tail(0)
|
, m_tail(0)
|
||||||
{
|
{
|
||||||
int64_t tcpu, tgpu;
|
int64_t tcpu, tgpu;
|
||||||
assert(m_contextId != 255);
|
TRACY_CL_ASSERT(m_contextId != 255);
|
||||||
|
|
||||||
cl_int err = CL_SUCCESS;
|
cl_int err = CL_SUCCESS;
|
||||||
cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
|
cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
|
||||||
assert(err == CL_SUCCESS);
|
TRACY_CL_CHECK_ERROR(err)
|
||||||
uint32_t dummyValue = 42;
|
uint32_t dummyValue = 42;
|
||||||
cl_mem dummyBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(uint32_t), nullptr, &err);
|
cl_mem dummyBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(uint32_t), nullptr, &err);
|
||||||
assert(err == CL_SUCCESS);
|
TRACY_CL_CHECK_ERROR(err)
|
||||||
cl_event writeBufferEvent;
|
cl_event writeBufferEvent;
|
||||||
err = clEnqueueWriteBuffer(queue, dummyBuffer, CL_FALSE, 0, sizeof(uint32_t), &dummyValue, 0, nullptr, &writeBufferEvent);
|
TRACY_CL_CHECK_ERROR(clEnqueueWriteBuffer(queue, dummyBuffer, CL_FALSE, 0, sizeof(uint32_t), &dummyValue, 0, nullptr, &writeBufferEvent));
|
||||||
assert(err == CL_SUCCESS);
|
TRACY_CL_CHECK_ERROR(clWaitForEvents(1, &writeBufferEvent));
|
||||||
err = clWaitForEvents(1, &writeBufferEvent);
|
|
||||||
|
|
||||||
tcpu = Profiler::GetTime();
|
tcpu = Profiler::GetTime();
|
||||||
|
|
||||||
assert(err == CL_SUCCESS);
|
|
||||||
cl_int eventStatus;
|
cl_int eventStatus;
|
||||||
err = clGetEventInfo(writeBufferEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
|
TRACY_CL_CHECK_ERROR(clGetEventInfo(writeBufferEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr));
|
||||||
assert(err == CL_SUCCESS);
|
TRACY_CL_ASSERT(eventStatus == CL_COMPLETE);
|
||||||
assert(eventStatus == CL_COMPLETE);
|
TRACY_CL_CHECK_ERROR(clGetEventProfilingInfo(writeBufferEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &tgpu, nullptr));
|
||||||
err = clGetEventProfilingInfo(writeBufferEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &tgpu, nullptr);
|
TRACY_CL_CHECK_ERROR(clReleaseEvent(writeBufferEvent));
|
||||||
assert(err == CL_SUCCESS);
|
TRACY_CL_CHECK_ERROR(clReleaseMemObject(dummyBuffer));
|
||||||
err = clReleaseEvent(writeBufferEvent);
|
TRACY_CL_CHECK_ERROR(clReleaseCommandQueue(queue));
|
||||||
assert(err == CL_SUCCESS);
|
|
||||||
err = clReleaseMemObject(dummyBuffer);
|
|
||||||
assert(err == CL_SUCCESS);
|
|
||||||
err = clReleaseCommandQueue(queue);
|
|
||||||
assert(err == CL_SUCCESS);
|
|
||||||
|
|
||||||
auto item = Profiler::QueueSerial();
|
auto item = Profiler::QueueSerial();
|
||||||
MemWrite(&item->hdr.type, QueueType::GpuNewContext);
|
MemWrite(&item->hdr.type, QueueType::GpuNewContext);
|
||||||
@ -139,13 +148,23 @@ namespace tracy {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
while (m_tail != m_head)
|
for (; m_tail != m_head; m_tail = (m_tail + 1) % QueryCount)
|
||||||
{
|
{
|
||||||
EventInfo eventInfo = m_query[m_tail];
|
EventInfo eventInfo = GetQuery(m_tail);
|
||||||
cl_event event = eventInfo.event;
|
|
||||||
cl_int eventStatus;
|
cl_int eventStatus;
|
||||||
cl_int err = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
|
cl_int err = clGetEventInfo(eventInfo.event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
|
||||||
assert(err == CL_SUCCESS);
|
if (err != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
std::ostringstream oss;
|
||||||
|
oss << "clGetEventInfo falied with error code " << err << ", on event " << eventInfo.event << ", skipping...";
|
||||||
|
auto msg = oss.str();
|
||||||
|
TracyMessage(msg.data(), msg.size());
|
||||||
|
if (eventInfo.event == nullptr) {
|
||||||
|
TracyMessageL("A TracyCLZone must be paird with a TracyCLZoneSetEvent, check your code!");
|
||||||
|
}
|
||||||
|
assert(false && "clGetEventInfo failed, maybe a TracyCLZone is not paired with TracyCLZoneSetEvent");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
if (eventStatus != CL_COMPLETE) return;
|
if (eventStatus != CL_COMPLETE) return;
|
||||||
|
|
||||||
cl_int eventInfoQuery = (eventInfo.phase == EventPhase::Begin)
|
cl_int eventInfoQuery = (eventInfo.phase == EventPhase::Begin)
|
||||||
@ -153,9 +172,16 @@ namespace tracy {
|
|||||||
: CL_PROFILING_COMMAND_END;
|
: CL_PROFILING_COMMAND_END;
|
||||||
|
|
||||||
cl_ulong eventTimeStamp = 0;
|
cl_ulong eventTimeStamp = 0;
|
||||||
err = clGetEventProfilingInfo(event, eventInfoQuery, sizeof(cl_ulong), &eventTimeStamp, nullptr);
|
err = clGetEventProfilingInfo(eventInfo.event, eventInfoQuery, sizeof(cl_ulong), &eventTimeStamp, nullptr);
|
||||||
assert(err == CL_SUCCESS);
|
if (err == CL_PROFILING_INFO_NOT_AVAILABLE)
|
||||||
assert(eventTimeStamp != 0);
|
{
|
||||||
|
TracyMessageL("command queue is not created with CL_QUEUE_PROFILING_ENABLE flag, check your code!");
|
||||||
|
assert(false && "command queue is not created with CL_QUEUE_PROFILING_ENABLE flag");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
TRACY_CL_CHECK_ERROR(err);
|
||||||
|
|
||||||
|
TRACY_CL_ASSERT(eventTimeStamp != 0);
|
||||||
|
|
||||||
auto item = Profiler::QueueSerial();
|
auto item = Profiler::QueueSerial();
|
||||||
MemWrite(&item->hdr.type, QueueType::GpuTime);
|
MemWrite(&item->hdr.type, QueueType::GpuTime);
|
||||||
@ -167,11 +193,8 @@ namespace tracy {
|
|||||||
if (eventInfo.phase == EventPhase::End)
|
if (eventInfo.phase == EventPhase::End)
|
||||||
{
|
{
|
||||||
// Done with the event, so release it
|
// Done with the event, so release it
|
||||||
err = clReleaseEvent(event);
|
TRACY_CL_CHECK_ERROR(clReleaseEvent(eventInfo.event));
|
||||||
assert(err == CL_SUCCESS);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
m_tail = (m_tail + 1) % QueryCount;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -184,14 +207,14 @@ namespace tracy {
|
|||||||
{
|
{
|
||||||
const auto id = m_head;
|
const auto id = m_head;
|
||||||
m_head = (m_head + 1) % QueryCount;
|
m_head = (m_head + 1) % QueryCount;
|
||||||
assert(m_head != m_tail);
|
TRACY_CL_ASSERT(m_head != m_tail);
|
||||||
m_query[id] = eventInfo;
|
m_query[id] = eventInfo;
|
||||||
return id;
|
return id;
|
||||||
}
|
}
|
||||||
|
|
||||||
tracy_force_inline EventInfo& GetQuery(unsigned int id)
|
tracy_force_inline EventInfo& GetQuery(unsigned int id)
|
||||||
{
|
{
|
||||||
assert(id < QueryCount);
|
TRACY_CL_ASSERT(id < QueryCount);
|
||||||
return m_query[id];
|
return m_query[id];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -200,8 +223,8 @@ namespace tracy {
|
|||||||
unsigned int m_contextId;
|
unsigned int m_contextId;
|
||||||
|
|
||||||
EventInfo m_query[QueryCount];
|
EventInfo m_query[QueryCount];
|
||||||
unsigned int m_head;
|
unsigned int m_head; // index at which a new event should be inserted
|
||||||
unsigned int m_tail;
|
unsigned int m_tail; // oldest event
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -259,8 +282,7 @@ namespace tracy {
|
|||||||
{
|
{
|
||||||
if (!m_active) return;
|
if (!m_active) return;
|
||||||
m_event = event;
|
m_event = event;
|
||||||
cl_int err = clRetainEvent(m_event);
|
TRACY_CL_CHECK_ERROR(clRetainEvent(m_event));
|
||||||
assert(err == CL_SUCCESS);
|
|
||||||
m_ctx->GetQuery(m_beginQueryId).event = m_event;
|
m_ctx->GetQuery(m_beginQueryId).event = m_event;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include <algorithm>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <string>
|
#include <string>
|
||||||
@ -104,8 +105,8 @@ int main()
|
|||||||
hostB.resize(N);
|
hostB.resize(N);
|
||||||
hostC.resize(N);
|
hostC.resize(N);
|
||||||
|
|
||||||
std::iota(std::begin(hostA), std::end(hostA), 0);
|
std::iota(std::begin(hostA), std::end(hostA), 0.0f);
|
||||||
std::iota(std::begin(hostB), std::end(hostB), 0);
|
std::iota(std::begin(hostB), std::end(hostB), 0.0f);
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
@ -123,7 +124,7 @@ int main()
|
|||||||
ZoneScopedN("Write Buffer A");
|
ZoneScopedN("Write Buffer A");
|
||||||
TracyCLZoneS(tracyCLCtx, "Write BufferA", 5);
|
TracyCLZoneS(tracyCLCtx, "Write BufferA", 5);
|
||||||
|
|
||||||
CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferA, CL_TRUE, 0, N * sizeof(float), hostA.data(), 0, nullptr, &writeBufferAEvent));
|
CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferA, CL_FALSE, 0, N * sizeof(float), hostA.data(), 0, nullptr, &writeBufferAEvent));
|
||||||
|
|
||||||
TracyCLZoneSetEvent(writeBufferAEvent);
|
TracyCLZoneSetEvent(writeBufferAEvent);
|
||||||
}
|
}
|
||||||
@ -131,34 +132,44 @@ int main()
|
|||||||
ZoneScopedN("Write Buffer B");
|
ZoneScopedN("Write Buffer B");
|
||||||
TracyCLZone(tracyCLCtx, "Write BufferB");
|
TracyCLZone(tracyCLCtx, "Write BufferB");
|
||||||
|
|
||||||
CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferB, CL_TRUE, 0, N * sizeof(float), hostB.data(), 0, nullptr, &writeBufferBEvent));
|
CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferB, CL_FALSE, 0, N * sizeof(float), hostB.data(), 0, nullptr, &writeBufferBEvent));
|
||||||
|
|
||||||
TracyCLZoneSetEvent(writeBufferBEvent);
|
TracyCLZoneSetEvent(writeBufferBEvent);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < 10; ++i)
|
cl_int clN = static_cast<cl_int>(N);
|
||||||
|
const int numFrames = 10;
|
||||||
|
const int launchsPerFrame = 10;
|
||||||
|
constexpr int numLaunchs = numFrames * launchsPerFrame;
|
||||||
|
std::vector<cl_event> kernelLaunchEvts;
|
||||||
|
kernelLaunchEvts.reserve(numLaunchs);
|
||||||
|
for (int i = 0; i < numFrames; ++i)
|
||||||
{
|
{
|
||||||
int n_value = static_cast<int>(N);
|
FrameMark;
|
||||||
ZoneScopedN("VectorAdd Kernel Launch");
|
for (int j = 0; j < launchsPerFrame; ++j) {
|
||||||
TracyCLZoneC(tracyCLCtx, "VectorAdd Kernel", tracy::Color::Blue4);
|
ZoneScopedN("VectorAdd Kernel Launch");
|
||||||
|
TracyCLZoneC(tracyCLCtx, "VectorAdd Kernel", tracy::Color::Blue4);
|
||||||
|
|
||||||
CL_ASSERT(clSetKernelArg(vectorAddKernel, 0, sizeof(cl_mem), &bufferC));
|
CL_ASSERT(clSetKernelArg(vectorAddKernel, 0, sizeof(cl_mem), &bufferC));
|
||||||
CL_ASSERT(clSetKernelArg(vectorAddKernel, 1, sizeof(cl_mem), &bufferA));
|
CL_ASSERT(clSetKernelArg(vectorAddKernel, 1, sizeof(cl_mem), &bufferA));
|
||||||
CL_ASSERT(clSetKernelArg(vectorAddKernel, 2, sizeof(cl_mem), &bufferB));
|
CL_ASSERT(clSetKernelArg(vectorAddKernel, 2, sizeof(cl_mem), &bufferB));
|
||||||
CL_ASSERT(clSetKernelArg(vectorAddKernel, 3, sizeof(int), &n_value));
|
CL_ASSERT(clSetKernelArg(vectorAddKernel, 3, sizeof(cl_int), &clN));
|
||||||
|
|
||||||
cl_event vectorAddKernelEvent;
|
cl_event vectorAddKernelEvent;
|
||||||
CL_ASSERT(clEnqueueNDRangeKernel(commandQueue, vectorAddKernel, 1, nullptr, &N, nullptr, 0, nullptr, &vectorAddKernelEvent));
|
CL_ASSERT(clEnqueueNDRangeKernel(commandQueue, vectorAddKernel, 1, nullptr, &N, nullptr, 0, nullptr, &vectorAddKernelEvent));
|
||||||
|
TracyCLZoneSetEvent(vectorAddKernelEvent);
|
||||||
CL_ASSERT(clWaitForEvents(1, &vectorAddKernelEvent));
|
CL_ASSERT(clRetainEvent(vectorAddKernelEvent));
|
||||||
|
kernelLaunchEvts.push_back(vectorAddKernelEvent);
|
||||||
TracyCLZoneSetEvent(vectorAddKernelEvent);
|
std::cout << "VectorAdd Kernel Enqueued" << std::endl;
|
||||||
|
}
|
||||||
cl_ulong kernelStartTime, kernelEndTime;
|
{
|
||||||
CL_ASSERT(clGetEventProfilingInfo(vectorAddKernelEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelStartTime, nullptr));
|
// Wait frame events to be finished
|
||||||
CL_ASSERT(clGetEventProfilingInfo(vectorAddKernelEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, nullptr));
|
ZoneScopedN("clFinish");
|
||||||
std::cout << "VectorAdd Kernel Elapsed: " << ((kernelEndTime - kernelStartTime) / 1000) << " us" << std::endl;
|
CL_ASSERT(clFinish(commandQueue));
|
||||||
|
}
|
||||||
|
// You should collect on each 'frame' ends, so that streaming can be achieved.
|
||||||
|
TracyCLCollect(tracyCLCtx);
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
@ -171,7 +182,25 @@ int main()
|
|||||||
}
|
}
|
||||||
|
|
||||||
CL_ASSERT(clFinish(commandQueue));
|
CL_ASSERT(clFinish(commandQueue));
|
||||||
|
std::vector<float> durations(kernelLaunchEvts.size());
|
||||||
|
for (int i=0; i<kernelLaunchEvts.size(); i++) {
|
||||||
|
cl_event evt = kernelLaunchEvts[i];
|
||||||
|
cl_ulong start;
|
||||||
|
cl_ulong end;
|
||||||
|
CL_ASSERT(clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr));
|
||||||
|
CL_ASSERT(clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, nullptr));
|
||||||
|
CL_ASSERT(clReleaseEvent(evt));
|
||||||
|
durations[i] = (end - start) * 0.001f;
|
||||||
|
std::cout << "VectorAdd Kernel " << i << " tooks " << static_cast<int>(durations[i]) << "us" << std::endl;
|
||||||
|
};
|
||||||
|
float avg = std::accumulate(durations.cbegin(), durations.cend(), 0.0f) / durations.size();
|
||||||
|
float stddev2 = std::accumulate(durations.cbegin(), durations.cend(), 0.0f, [avg](const float& acc, const float& v) {
|
||||||
|
auto d = v - avg;
|
||||||
|
return acc + d*d;
|
||||||
|
}) / (durations.size() - 1.0f);
|
||||||
|
std::cout << "VectorAdd runtime avg: " << avg << "us, std: " << sqrt(stddev2) << "us over " << numLaunchs << " runs." << std::endl;
|
||||||
|
|
||||||
|
// User should ensure all events are finished, in this case, collect after the clFinish will do the trick.
|
||||||
TracyCLCollect(tracyCLCtx);
|
TracyCLCollect(tracyCLCtx);
|
||||||
|
|
||||||
{
|
{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user