materialsystem: threaded optimizations, fix mat_queue_mode on some android devices

This commit is contained in:
nillerusr 2023-01-14 14:48:22 +03:00
parent 3493fe9b0e
commit 8f1156442e
8 changed files with 66 additions and 140 deletions

View file

@ -1029,7 +1029,7 @@ bool CMaterialSystem::AllowThreading( bool bAllow, int nServiceThread )
bool bOldAllow = m_bAllowQueuedRendering; bool bOldAllow = m_bAllowQueuedRendering;
if ( GetCPUInformation()->m_nPhysicalProcessors >= 2 ) if ( GetCPUInformation()->m_nLogicalProcessors >= 2 )
{ {
m_bAllowQueuedRendering = bAllow; m_bAllowQueuedRendering = bAllow;
bool bQueued = m_IdealThreadMode != MATERIAL_SINGLE_THREADED; bool bQueued = m_IdealThreadMode != MATERIAL_SINGLE_THREADED;
@ -1806,11 +1806,7 @@ static ConVar mat_normalmaps( "mat_normalmaps", "0", FCVAR_CHEAT );
static ConVar mat_measurefillrate( "mat_measurefillrate", "0", FCVAR_CHEAT ); static ConVar mat_measurefillrate( "mat_measurefillrate", "0", FCVAR_CHEAT );
static ConVar mat_fillrate( "mat_fillrate", "0", FCVAR_CHEAT ); static ConVar mat_fillrate( "mat_fillrate", "0", FCVAR_CHEAT );
static ConVar mat_reversedepth( "mat_reversedepth", "0", FCVAR_CHEAT ); static ConVar mat_reversedepth( "mat_reversedepth", "0", FCVAR_CHEAT );
#ifdef DX_TO_GL_ABSTRACTION
static ConVar mat_bufferprimitives( "mat_bufferprimitives", "0" ); // I'm not seeing any benefit speed wise for buffered primitives on GLM/POSIX (checked via TF2 timedemo) - default to zero
#else
static ConVar mat_bufferprimitives( "mat_bufferprimitives", "1" ); static ConVar mat_bufferprimitives( "mat_bufferprimitives", "1" );
#endif
static ConVar mat_drawflat( "mat_drawflat","0", FCVAR_CHEAT ); static ConVar mat_drawflat( "mat_drawflat","0", FCVAR_CHEAT );
static ConVar mat_softwarelighting( "mat_softwarelighting", "0", FCVAR_ALLOWED_IN_COMPETITIVE ); static ConVar mat_softwarelighting( "mat_softwarelighting", "0", FCVAR_ALLOWED_IN_COMPETITIVE );
static ConVar mat_proxy( "mat_proxy", "0", FCVAR_CHEAT, "", MatProxyCallback ); static ConVar mat_proxy( "mat_proxy", "0", FCVAR_CHEAT, "", MatProxyCallback );
@ -2780,8 +2776,8 @@ IMaterial* CMaterialSystem::FindMaterialEx( char const* pMaterialName, const cha
{ {
// We need lower-case symbols for this to work // We need lower-case symbols for this to work
int nLen = Q_strlen( pMaterialName ) + 1; int nLen = Q_strlen( pMaterialName ) + 1;
char *pFixedNameTemp = (char*)malloc( nLen ); char *pFixedNameTemp = (char*)stackalloc( nLen );
char *pTemp = (char*)malloc( nLen ); char *pTemp = (char*)stackalloc( nLen );
Q_strncpy( pFixedNameTemp, pMaterialName, nLen ); Q_strncpy( pFixedNameTemp, pMaterialName, nLen );
Q_strlower( pFixedNameTemp ); Q_strlower( pFixedNameTemp );
#ifdef POSIX #ifdef POSIX
@ -2883,9 +2879,6 @@ IMaterial* CMaterialSystem::FindMaterialEx( char const* pMaterialName, const cha
} }
} }
free(pTemp);
free(pFixedNameTemp);
return g_pErrorMaterial->GetRealTimeVersion(); return g_pErrorMaterial->GetRealTimeVersion();
} }
@ -3103,20 +3096,12 @@ void CMaterialSystem::ResetTempHWMemory( bool bExitingLevel )
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
void CMaterialSystem::CacheUsedMaterials( ) void CMaterialSystem::CacheUsedMaterials( )
{ {
printf("Cache materials\n");
g_pShaderAPI->EvictManagedResources(); g_pShaderAPI->EvictManagedResources();
size_t count = 0;
for (MaterialHandle_t i = FirstMaterial(); i != InvalidMaterial(); i = NextMaterial(i) ) for (MaterialHandle_t i = FirstMaterial(); i != InvalidMaterial(); i = NextMaterial(i) )
{ {
// Some (mac) drivers (amd) seem to keep extra resources around on uploads until the next frame swap. This
// injects pointless synthetic swaps (between already-static load frames)
if ( mat_texture_reload_frame_swap_workaround.GetBool() )
{
if ( count++ % 20 == 0 )
{
Flush(true);
SwapBuffers(); // Not the right thing to call
}
}
IMaterialInternal* pMat = GetMaterialInternal(i); IMaterialInternal* pMat = GetMaterialInternal(i);
Assert( pMat->GetReferenceCount() >= 0 ); Assert( pMat->GetReferenceCount() >= 0 );
if( pMat->GetReferenceCount() > 0 ) if( pMat->GetReferenceCount() > 0 )
@ -3703,9 +3688,13 @@ void CMaterialSystem::EndFrame( void )
ThreadAcquire( true ); ThreadAcquire( true );
} }
IThreadPool* pThreadPool = CreateMatQueueThreadPool();
if ( m_pActiveAsyncJob && !m_pActiveAsyncJob->IsFinished() ) if ( m_pActiveAsyncJob && !m_pActiveAsyncJob->IsFinished() )
{ {
m_pActiveAsyncJob->WaitForFinish(); m_pActiveAsyncJob->WaitForFinish(TT_INFINITE, pThreadPool);
// Sync with GPU if we had a job for it, even if it finished early on CPU!
if ( !IsPC() && g_config.ForceHWSync() ) if ( !IsPC() && g_config.ForceHWSync() )
{ {
g_pShaderAPI->ForceHardwareSync(); g_pShaderAPI->ForceHardwareSync();
@ -3730,7 +3719,6 @@ void CMaterialSystem::EndFrame( void )
} }
} }
IThreadPool *pThreadPool = CreateMatQueueThreadPool();
pThreadPool->AddJob( m_pActiveAsyncJob ); pThreadPool->AddJob( m_pActiveAsyncJob );
break; break;
} }
@ -4664,20 +4652,9 @@ void CMaterialSystem::BeginRenderTargetAllocation( void )
void CMaterialSystem::EndRenderTargetAllocation( void ) void CMaterialSystem::EndRenderTargetAllocation( void )
{ {
// Any GPU newer than 2005 doesn't need to do this, and it eats up ~40% of our level load time!
const bool cbRequiresRenderTargetAllocationFirst = mat_requires_rt_alloc_first.GetBool();
g_pShaderAPI->FlushBufferedPrimitives(); g_pShaderAPI->FlushBufferedPrimitives();
m_bAllocatingRenderTargets = false; m_bAllocatingRenderTargets = false;
if ( IsPC() && cbRequiresRenderTargetAllocationFirst && g_pShaderAPI->CanDownloadTextures() )
{
// Simulate an Alt-Tab...will cause RTs to be allocated first
g_pShaderDevice->ReleaseResources();
g_pShaderDevice->ReacquireResources();
}
TextureManager()->CacheExternalStandardRenderTargets(); TextureManager()->CacheExternalStandardRenderTargets();
} }

View file

@ -455,14 +455,11 @@ public:
} }
else else
{ {
ALIGN16 uint16 tempIndices[16]; static ALIGN16 uint16 tempIndices[256];
// original method
int i = 0; int i = 0;
if ( (size_t)desc.m_pIndices % 4 == 2 )
{
desc.m_pIndices[i] = pIndexData[i] + desc.m_nFirstVertex;
i++;
}
while ( i < nIndices ) while ( i < nIndices )
{ {
int nToCopy = min( (int)ARRAYSIZE(tempIndices), nIndices - i ); int nToCopy = min( (int)ARRAYSIZE(tempIndices), nIndices - i );

View file

@ -2458,15 +2458,8 @@ bool CTexture::AsyncReadTextureFromFile( IVTFTexture* pVTFTexture, unsigned int
return false; return false;
} }
if ( V_strstr( GetName(), "c_sniperrifle_scope" ) )
{
int i = 0;
i = 3;
}
tmZone( TELEMETRY_LEVEL0, TMZF_NONE, "%s - %s", __FUNCTION__, tmDynamicString( TELEMETRY_LEVEL0, pCacheFileName ) ); tmZone( TELEMETRY_LEVEL0, TMZF_NONE, "%s - %s", __FUNCTION__, tmDynamicString( TELEMETRY_LEVEL0, pCacheFileName ) );
// OSX hackery // OSX hackery
int nPreserveFlags = nAdditionalCreationFlags; int nPreserveFlags = nAdditionalCreationFlags;
if ( m_nFlags & TEXTUREFLAGS_SRGB ) if ( m_nFlags & TEXTUREFLAGS_SRGB )
@ -4189,12 +4182,6 @@ bool SLoadTextureBitsFromFile( IVTFTexture **ppOutVtfTexture, FileHandle_t hFile
// NOTE! NOTE! NOTE! or by the streaming texture code! // NOTE! NOTE! NOTE! or by the streaming texture code!
Assert( ppOutVtfTexture != NULL && *ppOutVtfTexture != NULL ); Assert( ppOutVtfTexture != NULL && *ppOutVtfTexture != NULL );
if ( V_strstr( pName, "c_rocketlauncher/c_rocketlauncher" ) )
{
int i = 0;
i = 3;
}
CUtlBuffer buf; CUtlBuffer buf;
{ {

View file

@ -52,6 +52,12 @@
#pragma once #pragma once
#pragma warning(push) #pragma warning(push)
#pragma warning(disable:4251) #pragma warning(disable:4251)
// Hand-written prototype for the Win32 Sleep() routine, which the inline
// ThreadSleep() further down in this header calls on _WIN32 builds.
// NOTE(review): presumably declared here so this header does not have to pull
// in the full Windows headers - confirm against the include list.
extern "C"
{
// Takes the sleep duration in milliseconds (see ThreadSleep's call site).
void __declspec(dllimport) __stdcall Sleep( unsigned long );
}
#endif #endif
#ifdef COMPILER_MSVC64 #ifdef COMPILER_MSVC64
@ -194,8 +200,6 @@ PLATFORM_INTERFACE bool ReleaseThreadHandle( ThreadHandle_t );
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
PLATFORM_INTERFACE void ThreadSleep(unsigned duration = 0);
PLATFORM_INTERFACE void ThreadNanoSleep(unsigned ns);
PLATFORM_INTERFACE ThreadId_t ThreadGetCurrentId(); PLATFORM_INTERFACE ThreadId_t ThreadGetCurrentId();
PLATFORM_INTERFACE ThreadHandle_t ThreadGetCurrentHandle(); PLATFORM_INTERFACE ThreadHandle_t ThreadGetCurrentHandle();
PLATFORM_INTERFACE int ThreadGetPriority( ThreadHandle_t hThread = NULL ); PLATFORM_INTERFACE int ThreadGetPriority( ThreadHandle_t hThread = NULL );
@ -229,10 +233,10 @@ inline void ThreadPause()
{ {
#if defined( COMPILER_PS3 ) #if defined( COMPILER_PS3 )
__db16cyc(); __db16cyc();
#elif defined(__arm__) || defined(__aarch64__) #elif defined( COMPILER_GCC ) && (defined( __i386__ ) || defined( __x86_64__ ))
sched_yield();
#elif defined( COMPILER_GCC )
__asm __volatile( "pause" ); __asm __volatile( "pause" );
#elif defined( POSIX )
sched_yield();
#elif defined ( COMPILER_MSVC64 ) #elif defined ( COMPILER_MSVC64 )
_mm_pause(); _mm_pause();
#elif defined( COMPILER_MSVC32 ) #elif defined( COMPILER_MSVC32 )
@ -247,6 +251,36 @@ inline void ThreadPause()
#endif #endif
} }
//-----------------------------------------------------------------------------
// Suspends the calling thread for roughly nMilliseconds.
//
// A duration of zero never reaches the platform sleep call: it only issues a
// single ThreadPause() and returns.
//
// NOTE(review): the microsecond conversions below are done in unsigned
// arithmetic, so very large durations wrap - confirm no caller passes
// TT_INFINITE-sized values here.
//-----------------------------------------------------------------------------
inline void ThreadSleep(unsigned nMilliseconds = 0)
{
	if ( nMilliseconds != 0 )
	{
#ifdef _WIN32
#ifdef _WIN32_PC
		// One-time setup: request a 1 ms system timer resolution. The default
		// differs between machines (10.0, 15.6, 2.5, 1.0 ms or something else
		// again), and without this a Sleep( 1 ) could overshoot far enough to
		// cost us our frame rate while trying to save CPU time.
		static bool s_bTimerResolutionRaised = false;
		if ( !s_bTimerResolutionRaised )
		{
			s_bTimerResolutionRaised = true;
			timeBeginPeriod( 1 );
		}
#endif
		Sleep( nMilliseconds );
#elif PS3
		// Timer API works in microseconds.
		sys_timer_usleep( nMilliseconds * 1000 );
#elif defined(POSIX)
		// usleep works in microseconds.
		usleep( nMilliseconds * 1000 );
#endif
		return;
	}

	// Zero-length request: just hint the CPU / scheduler and carry on.
	ThreadPause();
}
PLATFORM_INTERFACE bool ThreadJoin( ThreadHandle_t, unsigned timeout = TT_INFINITE ); PLATFORM_INTERFACE bool ThreadJoin( ThreadHandle_t, unsigned timeout = TT_INFINITE );
PLATFORM_INTERFACE void ThreadSetDebugName( ThreadHandle_t hThread, const char *pszName ); PLATFORM_INTERFACE void ThreadSetDebugName( ThreadHandle_t hThread, const char *pszName );

View file

@ -11,21 +11,15 @@ namespace memutils
template<typename T> template<typename T>
inline void copy( T *dest, const T *src, size_t n ) inline void copy( T *dest, const T *src, size_t n )
{ {
do for(; n; n--)
{ *(dest++) = *(src++);
--n;
*(dest+n) = *(src+n);
} while( n );
} }
template<typename T> template<typename T>
inline void set( T *dest, T value, size_t n ) inline void set( T *dest, const T& value, size_t n )
{ {
do for(; n; n--)
{ *(dest++) = value;
--n;
*(dest+n) = value;
} while( n );
} }
} }

View file

@ -492,8 +492,8 @@ public:
//----------------------------------------------------- //-----------------------------------------------------
// Thread event support (safe for NULL this to simplify code ) // Thread event support (safe for NULL this to simplify code )
//----------------------------------------------------- //-----------------------------------------------------
bool WaitForFinish( uint32 dwTimeout = TT_INFINITE ) { if (!this) return true; return ( !IsFinished() ) ? g_pThreadPool->YieldWait( this, dwTimeout ) : true; } inline bool WaitForFinish( uint32 dwTimeout = TT_INFINITE, IThreadPool *pool = g_pThreadPool ) { if (!this) return true; return ( !IsFinished() ) ? pool->YieldWait( this, dwTimeout ) : true; }
bool WaitForFinishAndRelease( uint32 dwTimeout = TT_INFINITE ) { if (!this) return true; bool bResult = WaitForFinish( dwTimeout); Release(); return bResult; } inline bool WaitForFinishAndRelease( uint32 dwTimeout = TT_INFINITE ) { if (!this) return true; bool bResult = WaitForFinish( dwTimeout); Release(); return bResult; }
CThreadEvent *AccessEvent() { return &m_CompleteEvent; } CThreadEvent *AccessEvent() { return &m_CompleteEvent; }
//----------------------------------------------------- //-----------------------------------------------------

View file

@ -485,59 +485,6 @@ bool ReleaseThreadHandle( ThreadHandle_t hThread )
// //
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
// Puts the calling thread to sleep for nMilliseconds using whichever
// primitive the target platform offers (Sleep, sys_timer_usleep or usleep).
void ThreadSleep(unsigned nMilliseconds)
{
#ifdef _WIN32
#ifdef _WIN32_PC
	// One-time setup: ask for a 1 ms system timer resolution. The default
	// varies by hardware and software (10.0, 15.6, 2.5, 1.0 ms, ...); with a
	// coarse timer a Sleep( 1 ) meant to save CPU time could make us miss our
	// frame rate.
	static bool s_bTimerResolutionRaised = false;
	if ( !s_bTimerResolutionRaised )
	{
		s_bTimerResolutionRaised = true;
		timeBeginPeriod( 1 );
	}
#endif
	Sleep( nMilliseconds );
#elif PS3
	if ( nMilliseconds != 0 )
	{
		// Timer API works in microseconds.
		sys_timer_usleep( nMilliseconds * 1000 );
	}
	else
	{
		// Zero-length request: give up the rest of the time slice.
		// NOTE(review): an older remark here said sys_ppu_thread_yield did not
		// seem to work properly and suggested sys_timer_usleep( 60 ) instead,
		// yet the yield is what is actually called - confirm which is wanted.
		sys_ppu_thread_yield();
	}
#elif defined(POSIX)
	// usleep works in microseconds.
	usleep( nMilliseconds * 1000 );
#endif
}
//-----------------------------------------------------------------------------
// Suspends the calling thread for approximately ns nanoseconds.
//
// Each platform primitive has a coarser or differently-scaled unit than the
// argument, so the request is converted explicitly and rounded UP: a non-zero
// request never turns into a shorter sleep than asked for, and a request of
// zero stays zero.
void ThreadNanoSleep(unsigned ns)
{
#ifdef _WIN32
	// Sleep() takes milliseconds. The previous ( ns + 999 ) / 1000 produced
	// microseconds (and could wrap near UINT_MAX), sleeping ~1000x too long.
	const unsigned nNsPerMs = 1000000;
	Sleep( ns / nNsPerMs + ( ( ns % nNsPerMs ) ? 1 : 0 ) );
#elif PS3
	// sys_timer_usleep() takes microseconds (ThreadSleep feeds it ms * 1000),
	// so the nanosecond count must be scaled down rather than passed raw.
	const unsigned nNsPerUs = 1000;
	sys_timer_usleep( ns / nNsPerUs + ( ( ns % nNsPerUs ) ? 1 : 0 ) );
#elif defined(POSIX)
	// nanosleep() rejects tv_nsec values outside [0, 999999999] with EINVAL,
	// and an unsigned can hold up to ~4.29 seconds worth of nanoseconds, so
	// split the request into whole seconds plus the remainder.
	const unsigned nNsPerSec = 1000000000;
	struct timespec tm;
	tm.tv_sec = ns / nNsPerSec;
	tm.tv_nsec = ns % nNsPerSec;
	nanosleep( &tm, NULL );
#endif
}
//-----------------------------------------------------------------------------
#ifndef ThreadGetCurrentId #ifndef ThreadGetCurrentId
ThreadId_t ThreadGetCurrentId() ThreadId_t ThreadGetCurrentId()
{ {

View file

@ -214,7 +214,11 @@ public:
//----------------------------------------------------- //-----------------------------------------------------
virtual int YieldWait( CThreadEvent **pEvents, int nEvents, bool bWaitAll = true, unsigned timeout = TT_INFINITE ); virtual int YieldWait( CThreadEvent **pEvents, int nEvents, bool bWaitAll = true, unsigned timeout = TT_INFINITE );
virtual int YieldWait( CJob **, int nJobs, bool bWaitAll = true, unsigned timeout = TT_INFINITE ); virtual int YieldWait( CJob **, int nJobs, bool bWaitAll = true, unsigned timeout = TT_INFINITE );
void Yield( unsigned timeout ); inline void Yield( unsigned timeout )
{
Assert( ThreadInMainThread() );
ThreadSleep( timeout );
}
//----------------------------------------------------- //-----------------------------------------------------
// Add a native job to the queue (master thread) // Add a native job to the queue (master thread)
@ -656,20 +660,6 @@ int CThreadPool::YieldWait( CJob **ppJobs, int nJobs, bool bWaitAll, unsigned ti
return YieldWait( handles.Base(), handles.Count(), bWaitAll, timeout); return YieldWait( handles.Base(), handles.Count(), bWaitAll, timeout);
} }
//---------------------------------------------------------
// Gives up the calling thread's time by sleeping for `timeout` (passed
// straight through to ThreadSleep).
//
// Intended to be called from the main thread: debug builds assert on that.
// The sleep itself happens regardless of which thread calls, which is what
// the former if/else did with two identical ThreadSleep() arms.
void CThreadPool::Yield( unsigned timeout )
{
	// @MULTICORE (toml 10/24/2006): not implemented
	Assert( ThreadInMainThread() );
	ThreadSleep( timeout );
}
//--------------------------------------------------------- //---------------------------------------------------------
// Add a job to the queue // Add a job to the queue
//--------------------------------------------------------- //---------------------------------------------------------