Inline CommandAllocator/Iterator
Inlining these hot functions decreases CPU time in perf tests for
DrawCallPerf.Run/Vulkan by roughly 12% (55 to 47ns) and increases
binary size by about 0.16% (~4kB).
Bug: dawn:304
Change-Id: I84e5d011defe88d6f1492dcb54e421c3d1bf099f
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/14000
Commit-Queue: Austin Eng <enga@chromium.org>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
diff --git a/src/common/Compiler.h b/src/common/Compiler.h
index 3bbcee6..8e425c9 100644
--- a/src/common/Compiler.h
+++ b/src/common/Compiler.h
@@ -61,6 +61,9 @@
# endif
# define DAWN_DECLARE_UNUSED __attribute__((unused))
+# if defined(NDEBUG)
+# define DAWN_FORCE_INLINE inline __attribute__((always_inline))
+# endif
// MSVC
#elif defined(_MSC_VER)
@@ -77,6 +80,9 @@
# endif
# define DAWN_DECLARE_UNUSED
+# if defined(NDEBUG)
+# define DAWN_FORCE_INLINE __forceinline
+# endif
#else
# error "Unsupported compiler"
@@ -97,5 +103,8 @@
#if !defined(DAWN_NO_DISCARD)
# define DAWN_NO_DISCARD
#endif
+#if !defined(DAWN_FORCE_INLINE)
+# define DAWN_FORCE_INLINE inline
+#endif
#endif // COMMON_COMPILER_H_
diff --git a/src/common/Math.cpp b/src/common/Math.cpp
index a8823e5..2ae977b 100644
--- a/src/common/Math.cpp
+++ b/src/common/Math.cpp
@@ -85,13 +85,6 @@
return (reinterpret_cast<size_t>(ptr) & (alignment - 1)) == 0;
}
-void* AlignVoidPtr(void* ptr, size_t alignment) {
- ASSERT(IsPowerOfTwo(alignment));
- ASSERT(alignment != 0);
- return reinterpret_cast<void*>((reinterpret_cast<size_t>(ptr) + (alignment - 1)) &
- ~(alignment - 1));
-}
-
bool IsAligned(uint32_t value, size_t alignment) {
ASSERT(alignment <= UINT32_MAX);
ASSERT(IsPowerOfTwo(alignment));
diff --git a/src/common/Math.h b/src/common/Math.h
index ac40dd9..5ee915e 100644
--- a/src/common/Math.h
+++ b/src/common/Math.h
@@ -15,6 +15,8 @@
#ifndef COMMON_MATH_H_
#define COMMON_MATH_H_
+#include "common/Assert.h"
+
#include <cstddef>
#include <cstdint>
#include <cstring>
@@ -35,13 +37,19 @@
uint32_t Align(uint32_t value, size_t alignment);
template <typename T>
-T* AlignPtr(T* ptr, size_t alignment) {
- return static_cast<T*>(AlignVoidPtr(ptr, alignment));
+DAWN_FORCE_INLINE T* AlignPtr(T* ptr, size_t alignment) {
+ ASSERT(IsPowerOfTwo(alignment));
+ ASSERT(alignment != 0);
+ return reinterpret_cast<T*>((reinterpret_cast<size_t>(ptr) + (alignment - 1)) &
+ ~(alignment - 1));
}
template <typename T>
-const T* AlignPtr(const T* ptr, size_t alignment) {
- return static_cast<const T*>(AlignVoidPtr(const_cast<T*>(ptr), alignment));
+DAWN_FORCE_INLINE const T* AlignPtr(const T* ptr, size_t alignment) {
+ ASSERT(IsPowerOfTwo(alignment));
+ ASSERT(alignment != 0);
+ return reinterpret_cast<const T*>((reinterpret_cast<size_t>(ptr) + (alignment - 1)) &
+ ~(alignment - 1));
}
template <typename destType, typename sourceType>
diff --git a/src/dawn_native/CommandAllocator.cpp b/src/dawn_native/CommandAllocator.cpp
index 990c1c5..553f889 100644
--- a/src/dawn_native/CommandAllocator.cpp
+++ b/src/dawn_native/CommandAllocator.cpp
@@ -23,12 +23,9 @@
namespace dawn_native {
- constexpr uint32_t EndOfBlock = UINT_MAX; // std::numeric_limits<uint32_t>::max();
- constexpr uint32_t AdditionalData = UINT_MAX - 1; // std::numeric_limits<uint32_t>::max() - 1;
-
// TODO(cwallez@chromium.org): figure out a way to have more type safety for the iterator
- CommandIterator::CommandIterator() : mEndOfBlock(EndOfBlock) {
+ CommandIterator::CommandIterator() {
Reset();
}
@@ -42,7 +39,7 @@
}
}
- CommandIterator::CommandIterator(CommandIterator&& other) : mEndOfBlock(EndOfBlock) {
+ CommandIterator::CommandIterator(CommandIterator&& other) {
if (!other.IsEmpty()) {
mBlocks = std::move(other.mBlocks);
other.Reset();
@@ -64,7 +61,7 @@
}
CommandIterator::CommandIterator(CommandAllocator&& allocator)
- : mBlocks(allocator.AcquireBlocks()), mEndOfBlock(EndOfBlock) {
+ : mBlocks(allocator.AcquireBlocks()) {
Reset();
}
@@ -74,6 +71,17 @@
return *this;
}
+ bool CommandIterator::NextCommandIdInNewBlock(uint32_t* commandId) {
+ mCurrentBlock++;
+ if (mCurrentBlock >= mBlocks.size()) {
+ Reset();
+ *commandId = detail::kEndOfBlock;
+ return false;
+ }
+ mCurrentPtr = AlignPtr(mBlocks[mCurrentBlock].block, alignof(uint32_t));
+ return NextCommandId(commandId);
+ }
+
void CommandIterator::Reset() {
mCurrentBlock = 0;
@@ -97,47 +105,6 @@
return mBlocks[0].block == reinterpret_cast<const uint8_t*>(&mEndOfBlock);
}
- bool CommandIterator::NextCommandId(uint32_t* commandId) {
- uint8_t* idPtr = AlignPtr(mCurrentPtr, alignof(uint32_t));
- ASSERT(idPtr + sizeof(uint32_t) <=
- mBlocks[mCurrentBlock].block + mBlocks[mCurrentBlock].size);
-
- uint32_t id = *reinterpret_cast<uint32_t*>(idPtr);
-
- if (id == EndOfBlock) {
- mCurrentBlock++;
- if (mCurrentBlock >= mBlocks.size()) {
- Reset();
- *commandId = EndOfBlock;
- return false;
- }
- mCurrentPtr = AlignPtr(mBlocks[mCurrentBlock].block, alignof(uint32_t));
- return NextCommandId(commandId);
- }
-
- mCurrentPtr = idPtr + sizeof(uint32_t);
- *commandId = id;
- return true;
- }
-
- void* CommandIterator::NextCommand(size_t commandSize, size_t commandAlignment) {
- uint8_t* commandPtr = AlignPtr(mCurrentPtr, commandAlignment);
- ASSERT(commandPtr + sizeof(commandSize) <=
- mBlocks[mCurrentBlock].block + mBlocks[mCurrentBlock].size);
-
- mCurrentPtr = commandPtr + commandSize;
- return commandPtr;
- }
-
- void* CommandIterator::NextData(size_t dataSize, size_t dataAlignment) {
- uint32_t id;
- bool hasId = NextCommandId(&id);
- ASSERT(hasId);
- ASSERT(id == AdditionalData);
-
- return NextCommand(dataSize, dataAlignment);
- }
-
// Potential TODO(cwallez@chromium.org):
// - Host the size and pointer to next block in the block itself to avoid having an allocation
// in the vector
@@ -161,60 +128,23 @@
ASSERT(mCurrentPtr != nullptr && mEndPtr != nullptr);
ASSERT(IsPtrAligned(mCurrentPtr, alignof(uint32_t)));
ASSERT(mCurrentPtr + sizeof(uint32_t) <= mEndPtr);
- *reinterpret_cast<uint32_t*>(mCurrentPtr) = EndOfBlock;
+ *reinterpret_cast<uint32_t*>(mCurrentPtr) = detail::kEndOfBlock;
mCurrentPtr = nullptr;
mEndPtr = nullptr;
return std::move(mBlocks);
}
- uint8_t* CommandAllocator::Allocate(uint32_t commandId,
- size_t commandSize,
- size_t commandAlignment) {
- ASSERT(mCurrentPtr != nullptr);
- ASSERT(mEndPtr != nullptr);
- ASSERT(commandId != EndOfBlock);
-
- // It should always be possible to allocate one id, for EndOfBlock tagging,
- ASSERT(IsPtrAligned(mCurrentPtr, alignof(uint32_t)));
- ASSERT(mEndPtr >= mCurrentPtr);
- ASSERT(static_cast<size_t>(mEndPtr - mCurrentPtr) >= sizeof(uint32_t));
-
- // The memory after the ID will contain the following:
- // - the current ID
- // - padding to align the command, maximum kMaxSupportedAlignment
- // - the command of size commandSize
- // - padding to align the next ID, maximum alignof(uint32_t)
- // - the next ID of size sizeof(uint32_t)
- //
- // To avoid checking for overflows at every step of the computations we compute an upper
- // bound of the space that will be needed in addition to the command data.
- static constexpr size_t kWorstCaseAdditionalSize =
- sizeof(uint32_t) + kMaxSupportedAlignment + alignof(uint32_t) + sizeof(uint32_t);
-
- // This can't overflow because by construction mCurrentPtr always has space for the next ID.
- size_t remainingSize = static_cast<size_t>(mEndPtr - mCurrentPtr);
-
- // The good case were we have enough space for the command data and upper bound of the
- // extra required space.
- if ((remainingSize >= kWorstCaseAdditionalSize) &&
- (remainingSize - kWorstCaseAdditionalSize >= commandSize)) {
- uint32_t* idAlloc = reinterpret_cast<uint32_t*>(mCurrentPtr);
- *idAlloc = commandId;
-
- uint8_t* commandAlloc = AlignPtr(mCurrentPtr + sizeof(uint32_t), commandAlignment);
- mCurrentPtr = AlignPtr(commandAlloc + commandSize, alignof(uint32_t));
-
- return commandAlloc;
- }
-
- // When there is not enough space, we signal the EndOfBlock, so that the iterator knows to
- // move to the next one. EndOfBlock on the last block means the end of the commands.
+ uint8_t* CommandAllocator::AllocateInNewBlock(uint32_t commandId,
+ size_t commandSize,
+ size_t commandAlignment) {
+ // When there is not enough space, we signal the kEndOfBlock, so that the iterator knows
+ // to move to the next one. kEndOfBlock on the last block means the end of the commands.
uint32_t* idAlloc = reinterpret_cast<uint32_t*>(mCurrentPtr);
- *idAlloc = EndOfBlock;
+ *idAlloc = detail::kEndOfBlock;
// We'll request a block that can contain at least the command ID, the command and an
- // additional ID to contain the EndOfBlock tag.
+ // additional ID to contain the kEndOfBlock tag.
size_t requestedBlockSize = commandSize + kWorstCaseAdditionalSize;
// The computation of the request could overflow.
@@ -228,10 +158,6 @@
return Allocate(commandId, commandSize, commandAlignment);
}
- uint8_t* CommandAllocator::AllocateData(size_t commandSize, size_t commandAlignment) {
- return Allocate(AdditionalData, commandSize, commandAlignment);
- }
-
bool CommandAllocator::GetNewBlock(size_t minimumSize) {
// Allocate blocks doubling sizes each time, to a maximum of 16k (or at least minimumSize).
mLastAllocationSize =
diff --git a/src/dawn_native/CommandAllocator.h b/src/dawn_native/CommandAllocator.h
index 504ba7a..82de05c 100644
--- a/src/dawn_native/CommandAllocator.h
+++ b/src/dawn_native/CommandAllocator.h
@@ -15,6 +15,10 @@
 #ifndef DAWNNATIVE_COMMAND_ALLOCATOR_H_
 #define DAWNNATIVE_COMMAND_ALLOCATOR_H_
+#include "common/Assert.h"
+#include "common/Math.h"
+
 #include <cstddef>
 #include <cstdint>
+#include <limits>
 #include <vector>
@@ -56,6 +59,11 @@
};
using CommandBlocks = std::vector<BlockDef>;
+ namespace detail {
+ constexpr uint32_t kEndOfBlock = std::numeric_limits<uint32_t>::max();
+ constexpr uint32_t kAdditionalData = std::numeric_limits<uint32_t>::max() - 1;
+ } // namespace detail
+
class CommandAllocator;
// TODO(cwallez@chromium.org): prevent copy for both iterator and allocator
@@ -91,15 +99,46 @@
private:
bool IsEmpty() const;
- bool NextCommandId(uint32_t* commandId);
- void* NextCommand(size_t commandSize, size_t commandAlignment);
- void* NextData(size_t dataSize, size_t dataAlignment);
+ DAWN_FORCE_INLINE bool NextCommandId(uint32_t* commandId) {
+ uint8_t* idPtr = AlignPtr(mCurrentPtr, alignof(uint32_t));
+ ASSERT(idPtr + sizeof(uint32_t) <=
+ mBlocks[mCurrentBlock].block + mBlocks[mCurrentBlock].size);
+
+ uint32_t id = *reinterpret_cast<uint32_t*>(idPtr);
+
+ if (id != detail::kEndOfBlock) {
+ mCurrentPtr = idPtr + sizeof(uint32_t);
+ *commandId = id;
+ return true;
+ }
+ return NextCommandIdInNewBlock(commandId);
+ }
+
+ bool NextCommandIdInNewBlock(uint32_t* commandId);
+
+ DAWN_FORCE_INLINE void* NextCommand(size_t commandSize, size_t commandAlignment) {
+ uint8_t* commandPtr = AlignPtr(mCurrentPtr, commandAlignment);
+            ASSERT(commandPtr + commandSize <=
+                   mBlocks[mCurrentBlock].block + mBlocks[mCurrentBlock].size);
+
+ mCurrentPtr = commandPtr + commandSize;
+ return commandPtr;
+ }
+
+ DAWN_FORCE_INLINE void* NextData(size_t dataSize, size_t dataAlignment) {
+ uint32_t id;
+ bool hasId = NextCommandId(&id);
+ ASSERT(hasId);
+ ASSERT(id == detail::kAdditionalData);
+
+ return NextCommand(dataSize, dataAlignment);
+ }
CommandBlocks mBlocks;
uint8_t* mCurrentPtr = nullptr;
size_t mCurrentBlock = 0;
// Used to avoid a special case for empty iterators.
- uint32_t mEndOfBlock;
+ uint32_t mEndOfBlock = detail::kEndOfBlock;
bool mDataWasDestroyed = false;
};
@@ -140,18 +179,67 @@
// using the CommandAllocator passes the static_asserts.
static constexpr size_t kMaxSupportedAlignment = 8;
+ // To avoid checking for overflows at every step of the computations we compute an upper
+ // bound of the space that will be needed in addition to the command data.
+ static constexpr size_t kWorstCaseAdditionalSize =
+ sizeof(uint32_t) + kMaxSupportedAlignment + alignof(uint32_t) + sizeof(uint32_t);
+
friend CommandIterator;
CommandBlocks&& AcquireBlocks();
- uint8_t* Allocate(uint32_t commandId, size_t commandSize, size_t commandAlignment);
- uint8_t* AllocateData(size_t dataSize, size_t dataAlignment);
+ DAWN_FORCE_INLINE uint8_t* Allocate(uint32_t commandId,
+ size_t commandSize,
+ size_t commandAlignment) {
+ ASSERT(mCurrentPtr != nullptr);
+ ASSERT(mEndPtr != nullptr);
+ ASSERT(commandId != detail::kEndOfBlock);
+
+ // It should always be possible to allocate one id, for kEndOfBlock tagging,
+ ASSERT(IsPtrAligned(mCurrentPtr, alignof(uint32_t)));
+ ASSERT(mEndPtr >= mCurrentPtr);
+ ASSERT(static_cast<size_t>(mEndPtr - mCurrentPtr) >= sizeof(uint32_t));
+
+ // The memory after the ID will contain the following:
+ // - the current ID
+ // - padding to align the command, maximum kMaxSupportedAlignment
+ // - the command of size commandSize
+ // - padding to align the next ID, maximum alignof(uint32_t)
+ // - the next ID of size sizeof(uint32_t)
+
+ // This can't overflow because by construction mCurrentPtr always has space for the next
+ // ID.
+ size_t remainingSize = static_cast<size_t>(mEndPtr - mCurrentPtr);
+
+            // The good case where we have enough space for the command data and upper bound of the
+ // extra required space.
+ if ((remainingSize >= kWorstCaseAdditionalSize) &&
+ (remainingSize - kWorstCaseAdditionalSize >= commandSize)) {
+ uint32_t* idAlloc = reinterpret_cast<uint32_t*>(mCurrentPtr);
+ *idAlloc = commandId;
+
+ uint8_t* commandAlloc = AlignPtr(mCurrentPtr + sizeof(uint32_t), commandAlignment);
+ mCurrentPtr = AlignPtr(commandAlloc + commandSize, alignof(uint32_t));
+
+ return commandAlloc;
+ }
+ return AllocateInNewBlock(commandId, commandSize, commandAlignment);
+ }
+
+ uint8_t* AllocateInNewBlock(uint32_t commandId,
+ size_t commandSize,
+ size_t commandAlignment);
+
+ DAWN_FORCE_INLINE uint8_t* AllocateData(size_t commandSize, size_t commandAlignment) {
+ return Allocate(detail::kAdditionalData, commandSize, commandAlignment);
+ }
+
bool GetNewBlock(size_t minimumSize);
CommandBlocks mBlocks;
size_t mLastAllocationSize = 2048;
// Pointers to the current range of allocation in the block. Guaranteed to allow for at
- // least one uint32_t if not nullptr, so that the special EndOfBlock command id can always
+ // least one uint32_t if not nullptr, so that the special kEndOfBlock command id can always
// be written. Nullptr iff the blocks were moved out.
uint8_t* mCurrentPtr = nullptr;
uint8_t* mEndPtr = nullptr;