Inline CommandAllocator/Iterator

Inlining these hot functions decreases CPU time in perf tests for
DrawCallPerf.Run/Vulkan by roughly 12% (55 to 47ns) and increases
binary size by about 0.16% (~4kB).

Bug: dawn:304
Change-Id: I84e5d011defe88d6f1492dcb54e421c3d1bf099f
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/14000
Commit-Queue: Austin Eng <enga@chromium.org>
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
diff --git a/src/common/Compiler.h b/src/common/Compiler.h
index 3bbcee6..8e425c9 100644
--- a/src/common/Compiler.h
+++ b/src/common/Compiler.h
@@ -61,6 +61,9 @@
 #    endif
 
 #    define DAWN_DECLARE_UNUSED __attribute__((unused))
+#    if defined(NDEBUG)
+#        define DAWN_FORCE_INLINE inline __attribute__((always_inline))
+#    endif
 
 // MSVC
 #elif defined(_MSC_VER)
@@ -77,6 +80,9 @@
 #    endif
 
 #    define DAWN_DECLARE_UNUSED
+#    if defined(NDEBUG)
+#        define DAWN_FORCE_INLINE __forceinline
+#    endif
 
 #else
 #    error "Unsupported compiler"
@@ -97,5 +103,8 @@
 #if !defined(DAWN_NO_DISCARD)
 #    define DAWN_NO_DISCARD
 #endif
+#if !defined(DAWN_FORCE_INLINE)
+#    define DAWN_FORCE_INLINE inline
+#endif
 
 #endif  // COMMON_COMPILER_H_
diff --git a/src/common/Math.cpp b/src/common/Math.cpp
index a8823e5..2ae977b 100644
--- a/src/common/Math.cpp
+++ b/src/common/Math.cpp
@@ -85,13 +85,6 @@
     return (reinterpret_cast<size_t>(ptr) & (alignment - 1)) == 0;
 }
 
-void* AlignVoidPtr(void* ptr, size_t alignment) {
-    ASSERT(IsPowerOfTwo(alignment));
-    ASSERT(alignment != 0);
-    return reinterpret_cast<void*>((reinterpret_cast<size_t>(ptr) + (alignment - 1)) &
-                                   ~(alignment - 1));
-}
-
 bool IsAligned(uint32_t value, size_t alignment) {
     ASSERT(alignment <= UINT32_MAX);
     ASSERT(IsPowerOfTwo(alignment));
diff --git a/src/common/Math.h b/src/common/Math.h
index ac40dd9..5ee915e 100644
--- a/src/common/Math.h
+++ b/src/common/Math.h
@@ -15,6 +15,8 @@
 #ifndef COMMON_MATH_H_
 #define COMMON_MATH_H_
 
+#include "common/Assert.h"
+
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
@@ -35,13 +37,19 @@
 uint32_t Align(uint32_t value, size_t alignment);
 
 template <typename T>
-T* AlignPtr(T* ptr, size_t alignment) {
-    return static_cast<T*>(AlignVoidPtr(ptr, alignment));
+DAWN_FORCE_INLINE T* AlignPtr(T* ptr, size_t alignment) {
+    ASSERT(IsPowerOfTwo(alignment));
+    ASSERT(alignment != 0);
+    return reinterpret_cast<T*>((reinterpret_cast<size_t>(ptr) + (alignment - 1)) &
+                                ~(alignment - 1));
 }
 
 template <typename T>
-const T* AlignPtr(const T* ptr, size_t alignment) {
-    return static_cast<const T*>(AlignVoidPtr(const_cast<T*>(ptr), alignment));
+DAWN_FORCE_INLINE const T* AlignPtr(const T* ptr, size_t alignment) {
+    ASSERT(IsPowerOfTwo(alignment));
+    ASSERT(alignment != 0);
+    return reinterpret_cast<const T*>((reinterpret_cast<size_t>(ptr) + (alignment - 1)) &
+                                      ~(alignment - 1));
 }
 
 template <typename destType, typename sourceType>
diff --git a/src/dawn_native/CommandAllocator.cpp b/src/dawn_native/CommandAllocator.cpp
index 990c1c5..553f889 100644
--- a/src/dawn_native/CommandAllocator.cpp
+++ b/src/dawn_native/CommandAllocator.cpp
@@ -23,12 +23,9 @@
 
 namespace dawn_native {
 
-    constexpr uint32_t EndOfBlock = UINT_MAX;          // std::numeric_limits<uint32_t>::max();
-    constexpr uint32_t AdditionalData = UINT_MAX - 1;  // std::numeric_limits<uint32_t>::max() - 1;
-
     // TODO(cwallez@chromium.org): figure out a way to have more type safety for the iterator
 
-    CommandIterator::CommandIterator() : mEndOfBlock(EndOfBlock) {
+    CommandIterator::CommandIterator() {
         Reset();
     }
 
@@ -42,7 +39,7 @@
         }
     }
 
-    CommandIterator::CommandIterator(CommandIterator&& other) : mEndOfBlock(EndOfBlock) {
+    CommandIterator::CommandIterator(CommandIterator&& other) {
         if (!other.IsEmpty()) {
             mBlocks = std::move(other.mBlocks);
             other.Reset();
@@ -64,7 +61,7 @@
     }
 
     CommandIterator::CommandIterator(CommandAllocator&& allocator)
-        : mBlocks(allocator.AcquireBlocks()), mEndOfBlock(EndOfBlock) {
+        : mBlocks(allocator.AcquireBlocks()) {
         Reset();
     }
 
@@ -74,6 +71,17 @@
         return *this;
     }
 
+    bool CommandIterator::NextCommandIdInNewBlock(uint32_t* commandId) {
+        mCurrentBlock++;
+        if (mCurrentBlock >= mBlocks.size()) {
+            Reset();
+            *commandId = detail::kEndOfBlock;
+            return false;
+        }
+        mCurrentPtr = AlignPtr(mBlocks[mCurrentBlock].block, alignof(uint32_t));
+        return NextCommandId(commandId);
+    }
+
     void CommandIterator::Reset() {
         mCurrentBlock = 0;
 
@@ -97,47 +105,6 @@
         return mBlocks[0].block == reinterpret_cast<const uint8_t*>(&mEndOfBlock);
     }
 
-    bool CommandIterator::NextCommandId(uint32_t* commandId) {
-        uint8_t* idPtr = AlignPtr(mCurrentPtr, alignof(uint32_t));
-        ASSERT(idPtr + sizeof(uint32_t) <=
-               mBlocks[mCurrentBlock].block + mBlocks[mCurrentBlock].size);
-
-        uint32_t id = *reinterpret_cast<uint32_t*>(idPtr);
-
-        if (id == EndOfBlock) {
-            mCurrentBlock++;
-            if (mCurrentBlock >= mBlocks.size()) {
-                Reset();
-                *commandId = EndOfBlock;
-                return false;
-            }
-            mCurrentPtr = AlignPtr(mBlocks[mCurrentBlock].block, alignof(uint32_t));
-            return NextCommandId(commandId);
-        }
-
-        mCurrentPtr = idPtr + sizeof(uint32_t);
-        *commandId = id;
-        return true;
-    }
-
-    void* CommandIterator::NextCommand(size_t commandSize, size_t commandAlignment) {
-        uint8_t* commandPtr = AlignPtr(mCurrentPtr, commandAlignment);
-        ASSERT(commandPtr + sizeof(commandSize) <=
-               mBlocks[mCurrentBlock].block + mBlocks[mCurrentBlock].size);
-
-        mCurrentPtr = commandPtr + commandSize;
-        return commandPtr;
-    }
-
-    void* CommandIterator::NextData(size_t dataSize, size_t dataAlignment) {
-        uint32_t id;
-        bool hasId = NextCommandId(&id);
-        ASSERT(hasId);
-        ASSERT(id == AdditionalData);
-
-        return NextCommand(dataSize, dataAlignment);
-    }
-
     // Potential TODO(cwallez@chromium.org):
     //  - Host the size and pointer to next block in the block itself to avoid having an allocation
     //    in the vector
@@ -161,60 +128,23 @@
         ASSERT(mCurrentPtr != nullptr && mEndPtr != nullptr);
         ASSERT(IsPtrAligned(mCurrentPtr, alignof(uint32_t)));
         ASSERT(mCurrentPtr + sizeof(uint32_t) <= mEndPtr);
-        *reinterpret_cast<uint32_t*>(mCurrentPtr) = EndOfBlock;
+        *reinterpret_cast<uint32_t*>(mCurrentPtr) = detail::kEndOfBlock;
 
         mCurrentPtr = nullptr;
         mEndPtr = nullptr;
         return std::move(mBlocks);
     }
 
-    uint8_t* CommandAllocator::Allocate(uint32_t commandId,
-                                        size_t commandSize,
-                                        size_t commandAlignment) {
-        ASSERT(mCurrentPtr != nullptr);
-        ASSERT(mEndPtr != nullptr);
-        ASSERT(commandId != EndOfBlock);
-
-        // It should always be possible to allocate one id, for EndOfBlock tagging,
-        ASSERT(IsPtrAligned(mCurrentPtr, alignof(uint32_t)));
-        ASSERT(mEndPtr >= mCurrentPtr);
-        ASSERT(static_cast<size_t>(mEndPtr - mCurrentPtr) >= sizeof(uint32_t));
-
-        // The memory after the ID will contain the following:
-        //   - the current ID
-        //   - padding to align the command, maximum kMaxSupportedAlignment
-        //   - the command of size commandSize
-        //   - padding to align the next ID, maximum alignof(uint32_t)
-        //   - the next ID of size sizeof(uint32_t)
-        //
-        // To avoid checking for overflows at every step of the computations we compute an upper
-        // bound of the space that will be needed in addition to the command data.
-        static constexpr size_t kWorstCaseAdditionalSize =
-            sizeof(uint32_t) + kMaxSupportedAlignment + alignof(uint32_t) + sizeof(uint32_t);
-
-        // This can't overflow because by construction mCurrentPtr always has space for the next ID.
-        size_t remainingSize = static_cast<size_t>(mEndPtr - mCurrentPtr);
-
-        // The good case were we have enough space for the command data and upper bound of the
-        // extra required space.
-        if ((remainingSize >= kWorstCaseAdditionalSize) &&
-            (remainingSize - kWorstCaseAdditionalSize >= commandSize)) {
-            uint32_t* idAlloc = reinterpret_cast<uint32_t*>(mCurrentPtr);
-            *idAlloc = commandId;
-
-            uint8_t* commandAlloc = AlignPtr(mCurrentPtr + sizeof(uint32_t), commandAlignment);
-            mCurrentPtr = AlignPtr(commandAlloc + commandSize, alignof(uint32_t));
-
-            return commandAlloc;
-        }
-
-        // When there is not enough space, we signal the EndOfBlock, so that the iterator knows to
-        // move to the next one. EndOfBlock on the last block means the end of the commands.
+    uint8_t* CommandAllocator::AllocateInNewBlock(uint32_t commandId,
+                                                  size_t commandSize,
+                                                  size_t commandAlignment) {
+        // When there is not enough space, we signal the kEndOfBlock, so that the iterator knows
+        // to move to the next one. kEndOfBlock on the last block means the end of the commands.
         uint32_t* idAlloc = reinterpret_cast<uint32_t*>(mCurrentPtr);
-        *idAlloc = EndOfBlock;
+        *idAlloc = detail::kEndOfBlock;
 
         // We'll request a block that can contain at least the command ID, the command and an
-        // additional ID to contain the EndOfBlock tag.
+        // additional ID to contain the kEndOfBlock tag.
         size_t requestedBlockSize = commandSize + kWorstCaseAdditionalSize;
 
         // The computation of the request could overflow.
@@ -228,10 +158,6 @@
         return Allocate(commandId, commandSize, commandAlignment);
     }
 
-    uint8_t* CommandAllocator::AllocateData(size_t commandSize, size_t commandAlignment) {
-        return Allocate(AdditionalData, commandSize, commandAlignment);
-    }
-
     bool CommandAllocator::GetNewBlock(size_t minimumSize) {
         // Allocate blocks doubling sizes each time, to a maximum of 16k (or at least minimumSize).
         mLastAllocationSize =
diff --git a/src/dawn_native/CommandAllocator.h b/src/dawn_native/CommandAllocator.h
index 504ba7a..82de05c 100644
--- a/src/dawn_native/CommandAllocator.h
+++ b/src/dawn_native/CommandAllocator.h
@@ -15,6 +15,9 @@
 #ifndef DAWNNATIVE_COMMAND_ALLOCATOR_H_
 #define DAWNNATIVE_COMMAND_ALLOCATOR_H_
 
+#include "common/Assert.h"
+#include "common/Math.h"
+
 #include <cstddef>
 #include <cstdint>
 #include <vector>
@@ -56,6 +59,11 @@
     };
     using CommandBlocks = std::vector<BlockDef>;
 
+    namespace detail {
+        constexpr uint32_t kEndOfBlock = std::numeric_limits<uint32_t>::max();
+        constexpr uint32_t kAdditionalData = std::numeric_limits<uint32_t>::max() - 1;
+    }  // namespace detail
+
     class CommandAllocator;
 
     // TODO(cwallez@chromium.org): prevent copy for both iterator and allocator
@@ -91,15 +99,46 @@
       private:
         bool IsEmpty() const;
 
-        bool NextCommandId(uint32_t* commandId);
-        void* NextCommand(size_t commandSize, size_t commandAlignment);
-        void* NextData(size_t dataSize, size_t dataAlignment);
+        DAWN_FORCE_INLINE bool NextCommandId(uint32_t* commandId) {
+            uint8_t* idPtr = AlignPtr(mCurrentPtr, alignof(uint32_t));
+            ASSERT(idPtr + sizeof(uint32_t) <=
+                   mBlocks[mCurrentBlock].block + mBlocks[mCurrentBlock].size);
+
+            uint32_t id = *reinterpret_cast<uint32_t*>(idPtr);
+
+            if (id != detail::kEndOfBlock) {
+                mCurrentPtr = idPtr + sizeof(uint32_t);
+                *commandId = id;
+                return true;
+            }
+            return NextCommandIdInNewBlock(commandId);
+        }
+
+        bool NextCommandIdInNewBlock(uint32_t* commandId);
+
+        DAWN_FORCE_INLINE void* NextCommand(size_t commandSize, size_t commandAlignment) {
+            uint8_t* commandPtr = AlignPtr(mCurrentPtr, commandAlignment);
+            ASSERT(commandPtr + sizeof(commandSize) <=
+                   mBlocks[mCurrentBlock].block + mBlocks[mCurrentBlock].size);
+
+            mCurrentPtr = commandPtr + commandSize;
+            return commandPtr;
+        }
+
+        DAWN_FORCE_INLINE void* NextData(size_t dataSize, size_t dataAlignment) {
+            uint32_t id;
+            bool hasId = NextCommandId(&id);
+            ASSERT(hasId);
+            ASSERT(id == detail::kAdditionalData);
+
+            return NextCommand(dataSize, dataAlignment);
+        }
 
         CommandBlocks mBlocks;
         uint8_t* mCurrentPtr = nullptr;
         size_t mCurrentBlock = 0;
         // Used to avoid a special case for empty iterators.
-        uint32_t mEndOfBlock;
+        uint32_t mEndOfBlock = detail::kEndOfBlock;
         bool mDataWasDestroyed = false;
     };
 
@@ -140,18 +179,67 @@
         // using the CommandAllocator passes the static_asserts.
         static constexpr size_t kMaxSupportedAlignment = 8;
 
+        // To avoid checking for overflows at every step of the computations we compute an upper
+        // bound of the space that will be needed in addition to the command data.
+        static constexpr size_t kWorstCaseAdditionalSize =
+            sizeof(uint32_t) + kMaxSupportedAlignment + alignof(uint32_t) + sizeof(uint32_t);
+
         friend CommandIterator;
         CommandBlocks&& AcquireBlocks();
 
-        uint8_t* Allocate(uint32_t commandId, size_t commandSize, size_t commandAlignment);
-        uint8_t* AllocateData(size_t dataSize, size_t dataAlignment);
+        DAWN_FORCE_INLINE uint8_t* Allocate(uint32_t commandId,
+                                            size_t commandSize,
+                                            size_t commandAlignment) {
+            ASSERT(mCurrentPtr != nullptr);
+            ASSERT(mEndPtr != nullptr);
+            ASSERT(commandId != detail::kEndOfBlock);
+
+            // It should always be possible to allocate one id, for kEndOfBlock tagging,
+            ASSERT(IsPtrAligned(mCurrentPtr, alignof(uint32_t)));
+            ASSERT(mEndPtr >= mCurrentPtr);
+            ASSERT(static_cast<size_t>(mEndPtr - mCurrentPtr) >= sizeof(uint32_t));
+
+            // The memory after the ID will contain the following:
+            //   - the current ID
+            //   - padding to align the command, maximum kMaxSupportedAlignment
+            //   - the command of size commandSize
+            //   - padding to align the next ID, maximum alignof(uint32_t)
+            //   - the next ID of size sizeof(uint32_t)
+
+            // This can't overflow because by construction mCurrentPtr always has space for the next
+            // ID.
+            size_t remainingSize = static_cast<size_t>(mEndPtr - mCurrentPtr);
+
+            // The good case where we have enough space for the command data and upper bound of the
+            // extra required space.
+            if ((remainingSize >= kWorstCaseAdditionalSize) &&
+                (remainingSize - kWorstCaseAdditionalSize >= commandSize)) {
+                uint32_t* idAlloc = reinterpret_cast<uint32_t*>(mCurrentPtr);
+                *idAlloc = commandId;
+
+                uint8_t* commandAlloc = AlignPtr(mCurrentPtr + sizeof(uint32_t), commandAlignment);
+                mCurrentPtr = AlignPtr(commandAlloc + commandSize, alignof(uint32_t));
+
+                return commandAlloc;
+            }
+            return AllocateInNewBlock(commandId, commandSize, commandAlignment);
+        }
+
+        uint8_t* AllocateInNewBlock(uint32_t commandId,
+                                    size_t commandSize,
+                                    size_t commandAlignment);
+
+        DAWN_FORCE_INLINE uint8_t* AllocateData(size_t commandSize, size_t commandAlignment) {
+            return Allocate(detail::kAdditionalData, commandSize, commandAlignment);
+        }
+
         bool GetNewBlock(size_t minimumSize);
 
         CommandBlocks mBlocks;
         size_t mLastAllocationSize = 2048;
 
         // Pointers to the current range of allocation in the block. Guaranteed to allow for at
-        // least one uint32_t if not nullptr, so that the special EndOfBlock command id can always
+        // least one uint32_t if not nullptr, so that the special kEndOfBlock command id can always
         // be written. Nullptr iff the blocks were moved out.
         uint8_t* mCurrentPtr = nullptr;
         uint8_t* mEndPtr = nullptr;