Merge remote-tracking branch 'origin/master' into hw

This commit is contained in:
Bartosz Taudul 2021-06-02 01:12:28 +02:00
commit b7c5939bb8
No known key found for this signature in database
GPG Key ID: B7FE2008B7575DF3
21 changed files with 112 additions and 104 deletions

4
CMakeLists.txt Normal file
View File

@ -0,0 +1,4 @@
# Minimal CMake integration layer: defines the TracyClient target that
# consumers link against to pick up the include path of this directory.
cmake_minimum_required(VERSION 3.10)
project(TracyClient LANGUAGES CXX)
# INTERFACE library: nothing is compiled here, the target only carries usage requirements.
add_library(TracyClient INTERFACE)
# Every target linking to TracyClient gets this source directory added to its include path.
target_include_directories(TracyClient INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})

3
NEWS
View File

@ -9,6 +9,9 @@ v0.x.x (xxxx-xx-xx)
- Added TRACY_NO_CALLSTACK_INLINES macro to disable inline functions
resolution in call stacks on Windows.
- Limited client query response rate.
- Improved function matching algorithm in compare traces view.
- Added minimal CMake integration layer.
- Reworked rpmalloc initialization.
v0.7.8 (2021-05-19)

View File

@ -101,7 +101,6 @@ struct ___tracy_c_zone_context
// This struct, as visible to user, is immutable, so treat it as if const was declared here.
typedef /*const*/ struct ___tracy_c_zone_context TracyCZoneCtx;
TRACY_API void ___tracy_init_thread(void);
TRACY_API uint64_t ___tracy_alloc_srcloc( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz );
TRACY_API uint64_t ___tracy_alloc_srcloc_name( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz );

View File

@ -27,6 +27,7 @@
#include "common/TracySocket.cpp"
#include "client/tracy_rpmalloc.cpp"
#include "client/TracyDxt1.cpp"
#include "client/TracyAlloc.cpp"
#if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6
# include "libbacktrace/alloc.cpp"

View File

@ -271,7 +271,7 @@ public:
MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneBegin.context, ctx->GetId() );
Profiler::QueueSerialFinish();
}
@ -376,7 +376,7 @@ public:
MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneEnd.context, m_ctx->GetId() );
Profiler::QueueSerialFinish();
}
@ -389,7 +389,6 @@ private:
static inline D3D11Ctx* CreateD3D11Context( ID3D11Device* device, ID3D11DeviceContext* devicectx )
{
InitRPMallocThread();
auto ctx = (D3D11Ctx*)tracy_malloc( sizeof( D3D11Ctx ) );
new(ctx) D3D11Ctx( device, devicectx );
return ctx;

View File

@ -451,8 +451,6 @@ namespace tracy
static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue)
{
InitRPMallocThread();
auto* ctx = static_cast<D3D12QueueCtx*>(tracy_malloc(sizeof(D3D12QueueCtx)));
new (ctx) D3D12QueueCtx{ device, queue };

View File

@ -286,7 +286,6 @@ namespace tracy {
static inline OpenCLCtx* CreateCLContext(cl_context context, cl_device_id device)
{
InitRPMallocThread();
auto ctx = (OpenCLCtx*)tracy_malloc(sizeof(OpenCLCtx));
new (ctx) OpenCLCtx(context, device);
return ctx;

View File

@ -1,10 +1,6 @@
#ifndef __TRACYOPENGL_HPP__
#define __TRACYOPENGL_HPP__
#if !defined GL_TIMESTAMP && !defined GL_TIMESTAMP_EXT
# error "You must include OpenGL 3.2 headers before including TracyOpenGL.hpp"
#endif
#if !defined TRACY_ENABLE || defined __APPLE__
#define TracyGpuContext
@ -35,6 +31,10 @@ public:
#else
#if !defined GL_TIMESTAMP && !defined GL_TIMESTAMP_EXT
# error "You must include OpenGL 3.2 headers before including TracyOpenGL.hpp"
#endif
#include <atomic>
#include <assert.h>
#include <stdlib.h>
@ -53,7 +53,7 @@ public:
# define glQueryCounter glQueryCounterEXT
#endif
#define TracyGpuContext tracy::InitRPMallocThread(); tracy::GetGpuCtx().ptr = (tracy::GpuCtx*)tracy::tracy_malloc( sizeof( tracy::GpuCtx ) ); new(tracy::GetGpuCtx().ptr) tracy::GpuCtx;
#define TracyGpuContext tracy::GetGpuCtx().ptr = (tracy::GpuCtx*)tracy::tracy_malloc( sizeof( tracy::GpuCtx ) ); new(tracy::GetGpuCtx().ptr) tracy::GpuCtx;
#define TracyGpuContextName( name, size ) tracy::GetGpuCtx().ptr->Name( name, size );
#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
# define TracyGpuNamedZone( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active );

View File

@ -456,7 +456,6 @@ private:
static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct )
{
InitRPMallocThread();
auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) );
new(ctx) VkCtx( physdev, device, queue, cmdbuf, gpdctd, gct );
return ctx;

35
client/TracyAlloc.cpp Normal file
View File

@ -0,0 +1,35 @@
#ifdef TRACY_ENABLE
#include <atomic>
#include "../common/TracyAlloc.hpp"
#include "../common/TracyYield.hpp"
namespace tracy
{
extern std::atomic<int> RpInitDone;
extern std::atomic<int> RpInitLock;
// Slow path of the allocator setup. Runs rpmalloc_initialize() once, guarded by
// the RpInitDone flag and the RpInitLock spin lock, then initializes rpmalloc for
// the calling thread and records that in the thread-local RpThreadInitDone flag.
TRACY_API void InitRpmallocPlumbing()
{
    // Fast check: skip the lock entirely once global initialization has been published.
    if( !RpInitDone.load( std::memory_order_acquire ) )
    {
        // RpInitLock is a spin lock. Taking it has to be an acquire operation so that
        // it synchronizes with the release-store performed by the previous holder when
        // unlocking; otherwise the re-check below is not guaranteed to observe the
        // RpInitDone value written inside the previous critical section.
        int expected = 0;
        while( !RpInitLock.compare_exchange_weak( expected, 1, std::memory_order_acquire, std::memory_order_relaxed ) )
        {
            // A failed exchange overwrites 'expected' with the observed value; reset it before retrying.
            expected = 0;
            YieldThread();
        }

        // Re-check under the lock: another thread may have completed the initialization
        // while this one was waiting.
        if( !RpInitDone.load( std::memory_order_acquire ) )
        {
            rpmalloc_initialize();
            RpInitDone.store( 1, std::memory_order_release );
        }
        RpInitLock.store( 0, std::memory_order_release );
    }

    // Per-thread part, executed on every call of this function.
    rpmalloc_thread_initialize();
    RpThreadInitDone = true;
}
}
#endif

View File

@ -64,6 +64,7 @@
#include "../common/TracyAlign.hpp"
#include "../common/TracySocket.hpp"
#include "../common/TracySystem.hpp"
#include "../common/TracyYield.hpp"
#include "../common/tracy_lz4.hpp"
#include "tracy_rpmalloc.hpp"
#include "TracyCallstack.hpp"
@ -118,45 +119,6 @@ extern "C" typedef BOOL (WINAPI *t_GetLogicalProcessorInformationEx)( LOGICAL_PR
namespace tracy
{
// File-local helpers that wrap rpmalloc_initialize() in a callback matching the
// platform's run-once primitive, together with that primitive's state object.
namespace
{
# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA
// Callback with the signature taken by InitOnceExecuteOnce(); all three arguments are unused.
BOOL CALLBACK InitOnceCallback( PINIT_ONCE /*initOnce*/, PVOID /*Parameter*/, PVOID* /*Context*/)
{
rpmalloc_initialize();
return TRUE;
}
// One-time initialization state passed to InitOnceExecuteOnce().
INIT_ONCE InitOnce = INIT_ONCE_STATIC_INIT;
# elif defined __linux__
// Callback with the signature taken by pthread_once().
void InitOnceCallback()
{
rpmalloc_initialize();
}
// Control variable passed to pthread_once().
pthread_once_t once_control = PTHREAD_ONCE_INIT;
# else
// Callback passed to std::call_once() on all remaining platforms.
void InitOnceCallback()
{
rpmalloc_initialize();
}
// Flag passed to std::call_once().
std::once_flag once_flag;
# endif
}
// Constructing an RPMallocInit runs InitOnceCallback (which calls rpmalloc_initialize())
// through the platform's run-once primitive and then calls rpmalloc_thread_initialize().
struct RPMallocInit
{
RPMallocInit()
{
// Global initialization, routed through the run-once primitive selected for this platform.
# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA
InitOnceExecuteOnce( &InitOnce, InitOnceCallback, nullptr, nullptr );
# elif defined __linux__
pthread_once( &once_control, InitOnceCallback );
# else
std::call_once( once_flag, InitOnceCallback );
# endif
// Per-thread initialization; executed on every construction, not only the first one.
rpmalloc_thread_initialize();
}
};
#ifndef TRACY_DELAYED_INIT
struct InitTimeWrapper
@ -964,12 +926,6 @@ TRACY_API int64_t GetFrequencyQpc()
#ifdef TRACY_DELAYED_INIT
struct ThreadNameData;
TRACY_API moodycamel::ConcurrentQueue<QueueItem>& GetQueue();
TRACY_API void InitRPMallocThread();
// Creates a short-lived RPMallocInit local so that its constructor runs;
// the object itself is not used afterwards.
void InitRPMallocThread()
{
RPMallocInit rpinit;
}
struct ProfilerData
{
@ -991,7 +947,6 @@ struct ProducerWrapper
struct ProfilerThreadData
{
ProfilerThreadData( ProfilerData& data ) : token( data ), gpuCtx( { nullptr } ) {}
RPMallocInit rpmalloc_init;
ProducerWrapper token;
GpuCtxWrapper gpuCtx;
# ifdef TRACY_ON_DEMAND
@ -999,11 +954,14 @@ struct ProfilerThreadData
# endif
};
std::atomic<int> RpInitDone { 0 };
std::atomic<int> RpInitLock { 0 };
thread_local bool RpThreadInitDone = false;
# ifdef TRACY_MANUAL_LIFETIME
ProfilerData* s_profilerData = nullptr;
TRACY_API void StartupProfiler()
{
RPMallocInit init;
s_profilerData = (ProfilerData*)tracy_malloc( sizeof( ProfilerData ) );
new (s_profilerData) ProfilerData();
s_profilerData->profiler.SpawnWorkerThreads();
@ -1030,11 +988,10 @@ static ProfilerData& GetProfilerData()
if( !ptr )
{
int expected = 0;
while( !profilerDataLock.compare_exchange_strong( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; }
while( !profilerDataLock.compare_exchange_weak( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; YieldThread(); }
ptr = profilerData.load( std::memory_order_acquire );
if( !ptr )
{
RPMallocInit init;
ptr = (ProfilerData*)tracy_malloc( sizeof( ProfilerData ) );
new (ptr) ProfilerData();
profilerData.store( ptr, std::memory_order_release );
@ -1071,7 +1028,6 @@ public:
void* p = pthread_getspecific(m_key);
if (!p)
{
RPMallocInit init;
p = (ProfilerThreadData*)tracy_malloc( sizeof( ProfilerThreadData ) );
new (p) ProfilerThreadData(GetProfilerData());
pthread_setspecific(m_key, p);
@ -1123,18 +1079,12 @@ namespace
# endif
#else
// This variant only calls rpmalloc_thread_initialize(); it does not call rpmalloc_initialize().
TRACY_API void InitRPMallocThread()
{
rpmalloc_thread_initialize();
}
// MSVC static initialization order solution. gcc/clang uses init_order() to avoid all this.
// 1a. But s_queue is needed for initialization of variables in point 2.
extern moodycamel::ConcurrentQueue<QueueItem> s_queue;
thread_local RPMallocInit init_order(106) s_rpmalloc_thread_init;
// 2. If these variables would be in the .CRT$XCB section, they would be initialized only in main thread.
thread_local moodycamel::ProducerToken init_order(107) s_token_detail( s_queue );
thread_local ProducerWrapper init_order(108) s_token { s_queue.get_explicit_producer( s_token_detail ) };
@ -1147,7 +1097,9 @@ thread_local ThreadHandleWrapper init_order(104) s_threadHandle { detail::GetThr
# endif
static InitTimeWrapper init_order(101) s_initTime { SetupHwTimer() };
static RPMallocInit init_order(102) s_rpmalloc_init;
std::atomic<int> init_order(102) RpInitDone( 0 );
std::atomic<int> init_order(102) RpInitLock( 0 );
thread_local bool RpThreadInitDone = false;
moodycamel::ConcurrentQueue<QueueItem> init_order(103) s_queue( QueuePrealloc );
std::atomic<uint32_t> init_order(104) s_lockCounter( 0 );
std::atomic<uint8_t> init_order(104) s_gpuCtxCounter( 0 );
@ -3613,19 +3565,6 @@ TRACY_API uint64_t ___tracy_alloc_srcloc_name( uint32_t line, const char* source
return tracy::Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz );
}
// thread_locals are not initialized on thread creation, at least on GNU/Linux. Instead they are
// initialized on their first ODR-use. This means that the allocator is not automatically
// initialized every time a thread is created. Therefore, expose to the C API users a simple
// function to call every time they create a thread. All sorts of per-thread initialization
// can then be placed here.
TRACY_API void ___tracy_init_thread(void) {
#ifdef TRACY_DELAYED_INIT
// Only the call matters here; the returned value is discarded.
(void)tracy::GetProfilerThreadData();
#else
// Presumably this mention of the thread_local variable serves as the ODR-use described
// above; the value itself is discarded.
(void)tracy::s_rpmalloc_thread_init;
#endif
}
#ifdef __cplusplus
}
#endif

View File

@ -63,7 +63,6 @@ TRACY_API std::atomic<uint32_t>& GetLockCounter();
TRACY_API std::atomic<uint8_t>& GetGpuCtxCounter();
TRACY_API GpuCtxWrapper& GetGpuCtx();
TRACY_API uint64_t GetThreadHandle();
TRACY_API void InitRPMallocThread();
TRACY_API bool ProfilerAvailable();
TRACY_API int64_t GetFrequencyQpc();
@ -295,7 +294,6 @@ public:
#endif
if( callstack != 0 )
{
InitRPMallocThread();
tracy::GetProfiler().SendCallstack( callstack );
}
@ -315,7 +313,6 @@ public:
#endif
if( callstack != 0 )
{
InitRPMallocThread();
tracy::GetProfiler().SendCallstack( callstack );
}
@ -333,7 +330,6 @@ public:
#endif
if( callstack != 0 )
{
InitRPMallocThread();
tracy::GetProfiler().SendCallstack( callstack );
}
@ -356,7 +352,6 @@ public:
#endif
if( callstack != 0 )
{
InitRPMallocThread();
tracy::GetProfiler().SendCallstack( callstack );
}
@ -372,7 +367,6 @@ public:
static tracy_force_inline void MessageAppInfo( const char* txt, size_t size )
{
assert( size < std::numeric_limits<uint16_t>::max() );
InitRPMallocThread();
auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, txt, size );
TracyLfqPrepare( QueueType::MessageAppInfo );
@ -423,7 +417,6 @@ public:
# endif
const auto thread = GetThreadHandle();
InitRPMallocThread();
auto callstack = Callstack( depth );
profiler.m_serialLock.lock();
@ -445,7 +438,6 @@ public:
# endif
const auto thread = GetThreadHandle();
InitRPMallocThread();
auto callstack = Callstack( depth );
profiler.m_serialLock.lock();
@ -495,7 +487,6 @@ public:
# endif
const auto thread = GetThreadHandle();
InitRPMallocThread();
auto callstack = Callstack( depth );
profiler.m_serialLock.lock();
@ -518,7 +509,6 @@ public:
# endif
const auto thread = GetThreadHandle();
InitRPMallocThread();
auto callstack = Callstack( depth );
profiler.m_serialLock.lock();

View File

@ -130,7 +130,9 @@
# include <stdio.h>
# include <stdlib.h>
# if defined(__APPLE__)
# include <mach/mach_vm.h>
# if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
# include <mach/mach_vm.h>
# endif
# include <mach/vm_statistics.h>
# include <pthread.h>
# endif

View File

@ -4,15 +4,28 @@
#include <stdlib.h>
#ifdef TRACY_ENABLE
# include "TracyApi.h"
# include "TracyForceInline.hpp"
# include "../client/tracy_rpmalloc.hpp"
#endif
namespace tracy
{
#ifdef TRACY_ENABLE
extern thread_local bool RpThreadInitDone;
TRACY_API void InitRpmallocPlumbing();
// Inline guard in front of InitRpmallocPlumbing(): the call is made only while
// the thread-local RpThreadInitDone flag is still false for the calling thread.
static tracy_force_inline void InitRpmalloc()
{
    // Flag already set for this thread: nothing left to do.
    if( RpThreadInitDone ) return;
    InitRpmallocPlumbing();
}
#endif
static inline void* tracy_malloc( size_t size )
{
#ifdef TRACY_ENABLE
InitRpmalloc();
return rpmalloc( size );
#else
return malloc( size );
@ -22,6 +35,7 @@ static inline void* tracy_malloc( size_t size )
static inline void tracy_free( void* ptr )
{
#ifdef TRACY_ENABLE
InitRpmalloc();
rpfree( ptr );
#else
free( ptr );
@ -31,6 +45,7 @@ static inline void tracy_free( void* ptr )
static inline void* tracy_realloc( void* ptr, size_t size )
{
#ifdef TRACY_ENABLE
InitRpmalloc();
return rprealloc( ptr, size );
#else
return realloc( ptr, size );

View File

@ -96,7 +96,6 @@ struct ThreadNameData
ThreadNameData* next;
};
std::atomic<ThreadNameData*>& GetThreadNameData();
TRACY_API void InitRPMallocThread();
#endif
#ifdef _MSC_VER
@ -161,7 +160,6 @@ TRACY_API void SetThreadName( const char* name )
#endif
#ifdef TRACY_ENABLE
{
InitRPMallocThread();
const auto sz = strlen( name );
char* buf = (char*)tracy_malloc( sz+1 );
memcpy( buf, name, sz );

View File

@ -7,7 +7,7 @@
# include <thread>
#endif
#include "../common/TracyForceInline.hpp"
#include "TracyForceInline.hpp"
namespace tracy
{

View File

@ -425,6 +425,33 @@ The application you want to profile should be compiled with all the usual optimi
Finally, on Unix make sure that the application is linked with libraries \texttt{libpthread} and \texttt{libdl}. BSD systems will also need to be linked with \texttt{libexecinfo}.
\begin{bclogo}[
noborder=true,
couleur=black!5,
logo=\bclampe
]{CMake FetchContent}
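When using CMake 3.14 or newer, you can use Tracy via CMake FetchContent (the \texttt{FetchContent\_MakeAvailable} command used below is not available in earlier versions). In this case, you do not need to manually add a git submodule for Tracy. After calling \texttt{include(FetchContent)}, add this to your CMakeLists.txt: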
\begin{lstlisting}
FetchContent_Declare(
tracy
GIT_REPOSITORY https://github.com/wolfpld/tracy.git
GIT_TAG master
GIT_SHALLOW TRUE
GIT_PROGRESS TRUE
)
FetchContent_MakeAvailable(tracy)
\end{lstlisting}
Then add this to any target where you use Tracy for profiling:
\begin{lstlisting}
target_link_libraries(${_target} PUBLIC TracyClient)
\end{lstlisting}
\end{bclogo}
\subsubsection{Short-lived applications}
In case you want to profile a short-lived program (for example, a compression utility that finishes its work in one second), set the \texttt{TRACY\_NO\_EXIT} environment variable to $1$. With this option enabled, Tracy will not exit until an incoming connection is made, even if the application has already finished executing. If your platform doesn't support easy setup of environment variables, you may also add the \texttt{TRACY\_NO\_EXIT} define to your build configuration, which has the same effect.

View File

@ -186,6 +186,7 @@
<ClInclude Include="..\..\..\common\TracySocket.hpp" />
<ClInclude Include="..\..\..\common\TracyStackFrames.hpp" />
<ClInclude Include="..\..\..\common\TracySystem.hpp" />
<ClInclude Include="..\..\..\common\TracyYield.hpp" />
<ClInclude Include="..\..\..\common\tracy_lz4.hpp" />
<ClInclude Include="..\..\..\common\tracy_lz4hc.hpp" />
<ClInclude Include="..\..\..\imgui\imconfig.h" />
@ -236,7 +237,6 @@
<ClInclude Include="..\..\..\server\TracyView.hpp" />
<ClInclude Include="..\..\..\server\TracyViewData.hpp" />
<ClInclude Include="..\..\..\server\TracyWorker.hpp" />
<ClInclude Include="..\..\..\server\TracyYield.hpp" />
<ClInclude Include="..\..\..\server\tracy_pdqsort.h" />
<ClInclude Include="..\..\..\server\tracy_robin_hood.h" />
<ClInclude Include="..\..\..\server\tracy_xxh3.h" />

View File

@ -425,9 +425,6 @@
<ClInclude Include="..\..\..\server\tracy_xxh3.h">
<Filter>server</Filter>
</ClInclude>
<ClInclude Include="..\..\..\server\TracyYield.hpp">
<Filter>server</Filter>
</ClInclude>
<ClInclude Include="..\..\..\server\TracySort.hpp">
<Filter>server</Filter>
</ClInclude>
@ -578,6 +575,9 @@
<ClInclude Include="..\..\..\common\TracyStackFrames.hpp">
<Filter>common</Filter>
</ClInclude>
<ClInclude Include="..\..\..\common\TracyYield.hpp">
<Filter>common</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Natvis Include="DebugVis.natvis" />

View File

@ -22,7 +22,7 @@
#include "TracyFileHeader.hpp"
#include "TracyMmap.hpp"
#include "TracyYield.hpp"
#include "../common/TracyYield.hpp"
#include "../common/tracy_lz4.hpp"
#include "../common/TracyForceInline.hpp"
#include "../zstd/zstd.h"

View File

@ -26,14 +26,14 @@
#include "../common/TracyProtocol.hpp"
#include "../common/TracySystem.hpp"
#include "../common/TracyYield.hpp"
#include "../common/TracyStackFrames.hpp"
#include "TracyFileRead.hpp"
#include "TracyFileWrite.hpp"
#include "TracySort.hpp"
#include "TracyTaskDispatch.hpp"
#include "TracyVersion.hpp"
#include "TracyWorker.hpp"
#include "TracyYield.hpp"
#include "../common/TracyStackFrames.hpp"
namespace tracy
{