Optimized cmatqueuedrendercontext

This commit is contained in:
Kamay Xutax 2024-08-29 08:59:44 +02:00 committed by unknown
parent 651e3ca823
commit 494411928e
5 changed files with 86 additions and 24 deletions

View file

@ -367,32 +367,61 @@ public:
}
Assert( pDest );
memcpy( (byte *)pDest, pVertexData, nBytesVerts );
auto pAlignedDest = assume_aligned<16>((byte*)pDest);
auto pAlignedVertexData = assume_aligned<16>(pVertexData);
for (int i = 0; i < nBytesVerts; i++)
{
pAlignedDest[i] = pAlignedVertexData[i];
}
}
if ( pIndexData && pIndexData != &gm_ScratchIndexBuffer[0] && desc.m_nIndexSize )
{
{
static constexpr auto INDICES_TO_AUTOVECTORIZE = 256;
int i = 0;
auto pAlignedIndexData = assume_aligned<16>(pIndexData);
auto pAlignedIndces = (((size_t)desc.m_pIndices) % 16) == 0 ? assume_aligned<16>(desc.m_pIndices) : desc.m_pIndices;
if ( !desc.m_nFirstVertex )
{
// AssertMsg(desc.m_pIndices & 0x03 == 0,"desc.m_pIndices is misaligned in CMatQueuedMesh::ExecuteDefferedBuild\n");
// memcpy( (byte *)desc.m_pIndices, (byte *)pIndexData, nIndices * sizeof(*pIndexData) );
// Let it autovectorize.
for (int i = 0; i < nIndices; i++)
while ( i < nIndices )
{
desc.m_pIndices[i] = pIndexData[i];
int nToCopy = nIndices - i;
auto pSrc = pAlignedIndexData + i;
auto pDst = pAlignedIndces + i;
if (nToCopy < INDICES_TO_AUTOVECTORIZE)
{
for (int j = 0; j < nToCopy; j++)
{
pDst[j] = pSrc[j];
}
i += nToCopy;
}
else
{
for (int j = 0; j < INDICES_TO_AUTOVECTORIZE; j++)
{
pDst[j] = pSrc[j];
}
i += INDICES_TO_AUTOVECTORIZE;
}
}
}
else
{
static constexpr auto INDICES_TO_AUTOVECTORIZE = 256;
// original method
int i = 0;
int firstVertex = desc.m_nFirstVertex;
uint16 DECL_ALIGN(512) firstVertexMatrix[INDICES_TO_AUTOVECTORIZE];
uint16 firstVertexMatrix alignas(16) [INDICES_TO_AUTOVECTORIZE];
for (int i = 0; i < INDICES_TO_AUTOVECTORIZE; i++)
{
@ -402,9 +431,8 @@ public:
while ( i < nIndices )
{
int nToCopy = nIndices - i;
auto pSrc = pIndexData + i;
auto pDst = desc.m_pIndices + i;
auto pSrc = pAlignedIndexData + i;
auto pDst = pAlignedIndces + i;
if (nToCopy < INDICES_TO_AUTOVECTORIZE)
{
@ -641,10 +669,10 @@ private:
IMesh *m_pVertexOverride;
IMesh *m_pIndexOverride;
static unsigned short gm_ScratchIndexBuffer[6];
alignas(16) static unsigned short gm_ScratchIndexBuffer [6];
};
unsigned short CMatQueuedMesh::gm_ScratchIndexBuffer[6];
alignas(16) unsigned short CMatQueuedMesh::gm_ScratchIndexBuffer[6];
//-----------------------------------------------------------------------------
//

View file

@ -27,7 +27,7 @@ CDummyTexture g_DummyTexture;
// ---------------------------------------------------------------------------------------- //
// IMaterialSystem and IMesh stub classes.
// ---------------------------------------------------------------------------------------- //
static unsigned short g_DummyIndices[6];
alignas(16) static unsigned short g_DummyIndices[6];
class CDummyMesh : public IMesh
{

View file

@ -7,6 +7,7 @@
#ifndef DYNAMICIB_H
#define DYNAMICIB_H
#include "memalloc.h"
#ifdef _WIN32
#pragma once
#endif
@ -310,7 +311,7 @@ inline CIndexBuffer::CIndexBuffer( IDirect3DDevice9 *pD3D, int count,
if ( g_pShaderUtil->GetThreadMode() != MATERIAL_SINGLE_THREADED || !ThreadInMainThread() )
{
m_pSysmemBuffer = ( byte * )malloc( count * IndexSize() );
m_pSysmemBuffer = ( byte * )_aligned_malloc( count * IndexSize(), 16 );
m_nSysmemBufferStartBytes = 0;
}
else
@ -509,7 +510,7 @@ inline CIndexBuffer::~CIndexBuffer()
if ( m_pSysmemBuffer )
{
free( m_pSysmemBuffer );
_aligned_free( m_pSysmemBuffer );
m_pSysmemBuffer = NULL;
}
@ -800,7 +801,7 @@ inline unsigned short* CIndexBuffer::Lock( bool bReadOnly, int numIndices, int&
# endif
m_bFlush = false;
}
#endif
#endif
}
else
{
@ -829,12 +830,13 @@ inline unsigned short* CIndexBuffer::Lock( bool bReadOnly, int numIndices, int&
HRESULT hr = D3D_OK;
static ConVar cl_materialsystem_ignore_thread_safe("cl_materialsystem_ignore_thread_safe", "0");
#if !defined( _X360 )
// If the caller isn't in the thread that owns the render lock, need to return a system memory pointer--cannot talk to GL from
// the non-current thread.
if ( !m_pSysmemBuffer && !g_pShaderUtil->IsRenderThreadSafe() )
if ( !m_pSysmemBuffer && (!g_pShaderUtil->IsRenderThreadSafe() || cl_materialsystem_ignore_thread_safe.GetBool()) )
{
m_pSysmemBuffer = ( byte * )malloc( m_IndexCount * IndexSize() );
m_pSysmemBuffer = ( byte * )_aligned_malloc( m_IndexCount * IndexSize(), 16 );
m_nSysmemBufferStartBytes = position * IndexSize();
}
@ -1022,7 +1024,7 @@ inline void CIndexBuffer::HandleLateCreation( )
memcpy( pWritePtr, m_pSysmemBuffer + m_nSysmemBufferStartBytes, dataToWriteBytes );
ReallyUnlock( dataToWriteBytes );
free( m_pSysmemBuffer );
_aligned_free( m_pSysmemBuffer );
m_pSysmemBuffer = NULL;
}

View file

@ -57,7 +57,7 @@ extern ConVar mat_debugalttab;
//#define DRAW_SELECTION 1
static bool g_bDrawSelection = true; // only used in DRAW_SELECTION
static unsigned short g_nScratchIndexBuffer[6]; // large enough for a fast quad; used when device is not active
alignas(16) static unsigned short g_nScratchIndexBuffer[6]; // large enough for a fast quad; used when device is not active
#ifdef _DEBUG
int CVertexBuffer::s_BufferCount = 0;
int CIndexBuffer::s_BufferCount = 0;
@ -627,7 +627,7 @@ private:
CDynamicMeshDX8 *GetDynamicMesh();
CUtlVector< unsigned char, CUtlMemoryAligned< unsigned char, 32 > > m_VertexData;
CUtlVector< unsigned short > m_IndexData;
CUtlVector< unsigned short, CUtlMemoryAligned< unsigned short, 16 > > m_IndexData;
unsigned short m_VertexSize;
MaterialPrimitiveType_t m_Type;

View file

@ -540,6 +540,38 @@ typedef void * HINSTANCE;
#error
#endif
// Portable stand-in for C++20 std::assume_aligned: tells the optimizer that
// 'ptr' is aligned to an N-byte boundary so it may emit aligned loads/stores
// and vectorize. Passing a pointer that is NOT N-aligned is undefined
// behavior on compilers that honor the hint.
//
// @tparam N   alignment in bytes; must be a non-zero power of two.
// @tparam T   pointee type (deduced).
// @param ptr  pointer assumed to be N-byte aligned (not checked at runtime
//             in release builds).
// @return     ptr, carrying the alignment assumption where supported.
template <std::size_t N, typename T>
#if defined(__clang__) || defined(__GNUC__)
__attribute__((always_inline))
#elif defined(_MSC_VER)
__forceinline
#endif
[[nodiscard]] constexpr T* assume_aligned(T* ptr) noexcept
{
    // The (N - 1) low-bit mask below and __builtin_assume_aligned both
    // require a power-of-two alignment; reject bad N at compile time.
    static_assert(N != 0 && (N & (N - 1)) == 0,
                  "assume_aligned: N must be a non-zero power of two");
#if defined(__clang__) || (defined(__GNUC__) && !defined(__ICC))
    return reinterpret_cast<T*>(__builtin_assume_aligned(ptr, N));
#elif defined(_MSC_VER)
    // BUGFIX: N is the alignment in BYTES, so the misalignment mask is
    // (N - 1). The previous ((1 << N) - 1) treated N as an exponent and
    // e.g. for N == 16 tested 65536-byte alignment, making __assume(0)
    // (undefined behavior) reachable for correctly 16-aligned pointers.
    if ((reinterpret_cast<std::uintptr_t>(ptr) & (N - 1)) == 0)
        return ptr;
    else
        __assume(0); // unreachable by contract — misaligned ptr is UB
#elif defined(__ICC)
    switch (N) {
    case 2:   __assume_aligned(ptr, 2);   break;
    case 4:   __assume_aligned(ptr, 4);   break;
    case 8:   __assume_aligned(ptr, 8);   break;
    case 16:  __assume_aligned(ptr, 16);  break;
    case 32:  __assume_aligned(ptr, 32);  break;
    case 64:  __assume_aligned(ptr, 64);  break;
    case 128: __assume_aligned(ptr, 128); break;
    }
    return ptr;
#else
    // Unknown compiler — the hint is a no-op; behavior is still correct.
    return ptr;
#endif
}
// !!! NOTE: if you get a compile error here, you are using VALIGNOF on an abstract type :NOTE !!!
#define VALIGNOF_PORTABLE( type ) ( sizeof( AlignOf_t<type> ) - sizeof( type ) )