Add method to reduce memory usage by dropping scratch buffers

Add dawn::native::ReduceMemoryUsage() for Chromium to call when going
idle or under memory pressure. Currently, this only drops internal
scratch allocations, e.g. the DynamicUploader ring buffers, the
InternalPipelineStore scratch buffers, and the temporary uniform
buffer.
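
A minimal sketch of an embedder-side hook (illustrative only; the
OnMemoryPressure callback below is an assumption, not part of this
change):

  #include "dawn/native/DawnNative.h"

  // Called when the embedder goes idle or receives a memory pressure
  // signal. ReduceMemoryUsage() takes the device lock internally via
  // GetScopedLock(), so no extra synchronization is needed here.
  void OnMemoryPressure(WGPUDevice device) {
      dawn::native::ReduceMemoryUsage(device);
  }

The dropped allocations are recreated on demand the next time they are
needed (e.g. via GetOrCreateTemporaryUniformBuffer).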

Bug: 357139493
Change-Id: Ida06b851f19eb95982980f1649c118ec69fea43b
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/202277
Reviewed-by: Loko Kung <lokokung@google.com>
Auto-Submit: Sunny Sachanandani <sunnyps@chromium.org>
Reviewed-by: Austin Eng <enga@chromium.org>
Commit-Queue: Austin Eng <enga@chromium.org>
diff --git a/include/dawn/native/DawnNative.h b/include/dawn/native/DawnNative.h
index 594c387..50e4ccf 100644
--- a/include/dawn/native/DawnNative.h
+++ b/include/dawn/native/DawnNative.h
@@ -302,6 +302,9 @@
 // total estimated memory usage, and is intended for background tracing for UMA.
 DAWN_NATIVE_EXPORT uint64_t ComputeEstimatedMemoryUsage(WGPUDevice device);
 
+// Free unused internal GPU memory, e.g. staging buffers and cached resources.
+DAWN_NATIVE_EXPORT void ReduceMemoryUsage(WGPUDevice device);
+
 }  // namespace dawn::native
 
 #endif  // INCLUDE_DAWN_NATIVE_DAWNNATIVE_H_
diff --git a/src/dawn/native/DawnNative.cpp b/src/dawn/native/DawnNative.cpp
index 885e62c..3b84d98 100644
--- a/src/dawn/native/DawnNative.cpp
+++ b/src/dawn/native/DawnNative.cpp
@@ -306,11 +306,18 @@
 }
 
 void DumpMemoryStatistics(WGPUDevice device, MemoryDump* dump) {
+    auto deviceLock(FromAPI(device)->GetScopedLock());
     FromAPI(device)->DumpMemoryStatistics(dump);
 }
 
 uint64_t ComputeEstimatedMemoryUsage(WGPUDevice device) {
+    auto deviceLock(FromAPI(device)->GetScopedLock());
     return FromAPI(device)->ComputeEstimatedMemoryUsage();
 }
 
+void ReduceMemoryUsage(WGPUDevice device) {
+    auto deviceLock(FromAPI(device)->GetScopedLock());
+    FromAPI(device)->ReduceMemoryUsage();
+}
+
 }  // namespace dawn::native
diff --git a/src/dawn/native/Device.cpp b/src/dawn/native/Device.cpp
index e58623f..7a9238a 100644
--- a/src/dawn/native/Device.cpp
+++ b/src/dawn/native/Device.cpp
@@ -2588,6 +2588,7 @@
 }
 
 void DeviceBase::DumpMemoryStatistics(dawn::native::MemoryDump* dump) const {
+    DAWN_ASSERT(IsLockedByCurrentThreadIfNeeded());
     std::string prefix = absl::StrFormat("device_%p", static_cast<const void*>(this));
     GetObjectTrackingList(ObjectType::Texture)->ForEach([&](const ApiObjectBase* texture) {
         static_cast<const TextureBase*>(texture)->DumpMemoryStatistics(dump, prefix.c_str());
@@ -2598,6 +2599,7 @@
 }
 
 uint64_t DeviceBase::ComputeEstimatedMemoryUsage() const {
+    DAWN_ASSERT(IsLockedByCurrentThreadIfNeeded());
     uint64_t size = 0;
     GetObjectTrackingList(ObjectType::Texture)->ForEach([&](const ApiObjectBase* texture) {
         size += static_cast<const TextureBase*>(texture)->ComputeEstimatedByteSize();
@@ -2608,6 +2610,16 @@
     return size;
 }
 
+void DeviceBase::ReduceMemoryUsage() {
+    DAWN_ASSERT(IsLockedByCurrentThreadIfNeeded());
+    if (ConsumedError(GetQueue()->CheckPassedSerials())) {
+        return;
+    }
+    GetDynamicUploader()->Deallocate(GetQueue()->GetCompletedCommandSerial(), /*freeAll=*/true);
+    mInternalPipelineStore->ResetScratchBuffers();
+    mTemporaryUniformBuffer = nullptr;
+}
+
 ResultOrError<Ref<BufferBase>> DeviceBase::GetOrCreateTemporaryUniformBuffer(size_t size) {
     if (!mTemporaryUniformBuffer || mTemporaryUniformBuffer->GetSize() != size) {
         BufferDescriptor desc;
diff --git a/src/dawn/native/Device.h b/src/dawn/native/Device.h
index 3dc04ec..a731f60 100644
--- a/src/dawn/native/Device.h
+++ b/src/dawn/native/Device.h
@@ -456,6 +456,7 @@
 
     void DumpMemoryStatistics(dawn::native::MemoryDump* dump) const;
     uint64_t ComputeEstimatedMemoryUsage() const;
+    void ReduceMemoryUsage();
 
     ResultOrError<Ref<BufferBase>> GetOrCreateTemporaryUniformBuffer(size_t size);
 
diff --git a/src/dawn/native/DynamicUploader.cpp b/src/dawn/native/DynamicUploader.cpp
index 0922df2..77930ac 100644
--- a/src/dawn/native/DynamicUploader.cpp
+++ b/src/dawn/native/DynamicUploader.cpp
@@ -122,7 +122,7 @@
     return uploadHandle;
 }
 
-void DynamicUploader::Deallocate(ExecutionSerial lastCompletedSerial) {
+void DynamicUploader::Deallocate(ExecutionSerial lastCompletedSerial, bool freeAll) {
     // Reclaim memory within the ring buffers by ticking (or removing requests no longer
     // in-flight).
     size_t i = 0;
@@ -130,8 +130,9 @@
         mRingBuffers[i]->mAllocator.Deallocate(lastCompletedSerial);
 
         // Never erase the last buffer as to prevent re-creating smaller buffers
-        // again. The last buffer is the largest.
-        if (mRingBuffers[i]->mAllocator.Empty() && i < mRingBuffers.size() - 1) {
+        // again unless explicitly asked to do so. The last buffer is the largest.
+        const bool shouldFree = (i < mRingBuffers.size() - 1) || freeAll;
+        if (mRingBuffers[i]->mAllocator.Empty() && shouldFree) {
             mRingBuffers.erase(mRingBuffers.begin() + i);
         } else {
             i++;
diff --git a/src/dawn/native/DynamicUploader.h b/src/dawn/native/DynamicUploader.h
index 88bedfa..c3adbe3 100644
--- a/src/dawn/native/DynamicUploader.h
+++ b/src/dawn/native/DynamicUploader.h
@@ -64,7 +64,7 @@
     ResultOrError<UploadHandle> Allocate(uint64_t allocationSize,
                                          ExecutionSerial serial,
                                          uint64_t offsetAlignment);
-    void Deallocate(ExecutionSerial lastCompletedSerial);
+    void Deallocate(ExecutionSerial lastCompletedSerial, bool freeAll = false);
 
     bool ShouldFlush();
 
diff --git a/src/dawn/native/InternalPipelineStore.cpp b/src/dawn/native/InternalPipelineStore.cpp
index 52d8ee4..58d1d4d 100644
--- a/src/dawn/native/InternalPipelineStore.cpp
+++ b/src/dawn/native/InternalPipelineStore.cpp
@@ -45,4 +45,9 @@
 
 InternalPipelineStore::~InternalPipelineStore() = default;
 
+void InternalPipelineStore::ResetScratchBuffers() {
+    scratchStorage.Reset();
+    scratchIndirectStorage.Reset();
+}
+
 }  // namespace dawn::native
diff --git a/src/dawn/native/InternalPipelineStore.h b/src/dawn/native/InternalPipelineStore.h
index 294fafa..527cc35 100644
--- a/src/dawn/native/InternalPipelineStore.h
+++ b/src/dawn/native/InternalPipelineStore.h
@@ -64,6 +64,8 @@
 
     Ref<ShaderModuleBase> placeholderFragmentShader;
 
+    void ResetScratchBuffers();
+
     // A scratch buffer suitable for use as a copy destination and storage binding.
     ScratchBuffer scratchStorage;
 
diff --git a/src/dawn/native/Queue.h b/src/dawn/native/Queue.h
index 9f6c3ef..d0a53bf 100644
--- a/src/dawn/native/Queue.h
+++ b/src/dawn/native/Queue.h
@@ -116,6 +116,17 @@
 
     void DestroyImpl() override;
 
+    virtual MaybeError SubmitImpl(uint32_t commandCount, CommandBufferBase* const* commands) = 0;
+    virtual MaybeError WriteBufferImpl(BufferBase* buffer,
+                                       uint64_t bufferOffset,
+                                       const void* data,
+                                       size_t size);
+    virtual MaybeError WriteTextureImpl(const ImageCopyTexture& destination,
+                                        const void* data,
+                                        size_t dataSize,
+                                        const TextureDataLayout& dataLayout,
+                                        const Extent3D& writeSize);
+
   private:
     MaybeError WriteTextureInternal(const ImageCopyTexture* destination,
                                     const void* data,
@@ -131,17 +142,6 @@
                                                      const Extent3D* copySize,
                                                      const CopyTextureForBrowserOptions* options);
 
-    virtual MaybeError SubmitImpl(uint32_t commandCount, CommandBufferBase* const* commands) = 0;
-    virtual MaybeError WriteBufferImpl(BufferBase* buffer,
-                                       uint64_t bufferOffset,
-                                       const void* data,
-                                       size_t size);
-    virtual MaybeError WriteTextureImpl(const ImageCopyTexture& destination,
-                                        const void* data,
-                                        size_t dataSize,
-                                        const TextureDataLayout& dataLayout,
-                                        const Extent3D& writeSize);
-
     MaybeError ValidateSubmit(uint32_t commandCount, CommandBufferBase* const* commands) const;
     MaybeError ValidateOnSubmittedWorkDone(wgpu::QueueWorkDoneStatus* status) const;
     MaybeError ValidateWriteTexture(const ImageCopyTexture* destination,
diff --git a/src/dawn/tests/unittests/native/MemoryInstrumentationTests.cpp b/src/dawn/tests/unittests/native/MemoryInstrumentationTests.cpp
index ceb9a5e..ca1dfee 100644
--- a/src/dawn/tests/unittests/native/MemoryInstrumentationTests.cpp
+++ b/src/dawn/tests/unittests/native/MemoryInstrumentationTests.cpp
@@ -25,7 +25,10 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+#include <array>
+#include <chrono>
 #include <string>
+#include <thread>
 #include <utility>
 
 #include "dawn/native/DawnNative.h"
@@ -207,5 +209,35 @@
         kBufferAllocatedSize + kMipmappedTextureSize + kMultisampleTextureSize + kETC2TextureSize);
 }
 
+TEST_F(MemoryInstrumentationTest, ReduceMemoryUsage) {
+    constexpr uint64_t kBufferSize = 32;
+    constexpr wgpu::BufferDescriptor kBufferDesc = {
+        .usage = wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst,
+        .size = kBufferSize,
+    };
+    wgpu::Buffer uniformBuffer = device.CreateBuffer(&kBufferDesc);
+    EXPECT_TRUE(uniformBuffer);
+
+    std::array<uint8_t, kBufferSize> zeroes = {};
+    device.GetQueue().WriteBuffer(uniformBuffer, 0, zeroes.data(), zeroes.size());
+    device.GetQueue().Submit(0, nullptr);
+
+    uniformBuffer.Destroy();
+
+    wgpu::Future completionFuture = device.GetQueue().OnSubmittedWorkDone(
+        wgpu::CallbackMode::WaitAnyOnly, [](wgpu::QueueWorkDoneStatus status) {});
+
+    wgpu::WaitStatus waitStatus = wgpu::WaitStatus::TimedOut;
+    while (waitStatus != wgpu::WaitStatus::Success) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+        waitStatus = wgpu::Instance(ToAPI(mDeviceMock->GetInstance())).WaitAny(completionFuture, 0);
+    }
+
+    // DynamicUploader buffers will still be alive.
+    EXPECT_GT(ComputeEstimatedMemoryUsage(device.Get()), uint64_t(0));
+    ReduceMemoryUsage(device.Get());
+    EXPECT_EQ(ComputeEstimatedMemoryUsage(device.Get()), uint64_t(0));
+}
+
 }  // namespace
 }  // namespace dawn::native
diff --git a/src/dawn/tests/unittests/native/mocks/QueueMock.cpp b/src/dawn/tests/unittests/native/mocks/QueueMock.cpp
index b66e6c6..a58050c 100644
--- a/src/dawn/tests/unittests/native/mocks/QueueMock.cpp
+++ b/src/dawn/tests/unittests/native/mocks/QueueMock.cpp
@@ -29,11 +29,27 @@
 
 #include "dawn/tests/unittests/native/mocks/DeviceMock.h"
 
+using testing::WithArgs;
+
 namespace dawn::native {
 
 QueueMock::QueueMock(DeviceMock* device, const QueueDescriptor* descriptor)
     : QueueBase(device, descriptor) {
     ON_CALL(*this, DestroyImpl).WillByDefault([this] { this->QueueBase::DestroyImpl(); });
+    ON_CALL(*this, SubmitImpl)
+        .WillByDefault([this](uint32_t, CommandBufferBase* const*) -> MaybeError {
+            this->QueueBase::IncrementLastSubmittedCommandSerial();
+            return {};
+        });
+    ON_CALL(*this, CheckAndUpdateCompletedSerials)
+        .WillByDefault([this]() -> ResultOrError<ExecutionSerial> {
+            return this->QueueBase::GetLastSubmittedCommandSerial();
+        });
+    ON_CALL(*this, WriteBufferImpl)
+        .WillByDefault(WithArgs<0, 1, 2, 3>([this](BufferBase* buffer, uint64_t bufferOffset,
+                                                   const void* data, size_t size) -> MaybeError {
+            return this->QueueBase::WriteBufferImpl(buffer, bufferOffset, data, size);
+        }));
 }
 
 QueueMock::~QueueMock() = default;