// Copyright 2018 The Dawn Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dawn_native/metal/DeviceMTL.h"
#include "common/GPUInfo.h"
#include "common/Platform.h"
#include "dawn_native/BackendConnection.h"
#include "dawn_native/BindGroupLayout.h"
#include "dawn_native/Commands.h"
#include "dawn_native/ErrorData.h"
#include "dawn_native/metal/BindGroupLayoutMTL.h"
#include "dawn_native/metal/BindGroupMTL.h"
#include "dawn_native/metal/BufferMTL.h"
#include "dawn_native/metal/CommandBufferMTL.h"
#include "dawn_native/metal/ComputePipelineMTL.h"
#include "dawn_native/metal/PipelineLayoutMTL.h"
#include "dawn_native/metal/QuerySetMTL.h"
#include "dawn_native/metal/QueueMTL.h"
#include "dawn_native/metal/RenderPipelineMTL.h"
#include "dawn_native/metal/SamplerMTL.h"
#include "dawn_native/metal/ShaderModuleMTL.h"
#include "dawn_native/metal/StagingBufferMTL.h"
#include "dawn_native/metal/SwapChainMTL.h"
#include "dawn_native/metal/TextureMTL.h"
#include "dawn_native/metal/UtilsMetal.h"
#include "dawn_platform/DawnPlatform.h"
#include "dawn_platform/tracing/TraceEvent.h"
#include <type_traits>
namespace dawn_native { namespace metal {
namespace {
// The time interval for each round of the Kalman filter. Note: the value is expressed
// in nanoseconds (100ms) to match the units of the CPU timestamps it is compared to.
static constexpr uint64_t kFilterIntervalInMs = static_cast<uint64_t>(NSEC_PER_SEC / 10);
struct KalmanInfo {
float filterValue;  // The current estimate
float kalmanGain;   // The Kalman gain
float R;            // The covariance of the observation noise
float P;            // The a posteriori estimate covariance
};
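// For reference, KalmanFilter() below implements the standard scalar Kalman update
// (a sketch of the math, not additional behavior):
//   K_k = P_{k-1} / (P_{k-1} + R)           // gain
//   x_k = K_k * z_k + (1 - K_k) * x_{k-1}   // corrected estimate
//   P_k = (1 - K_k) * P_{k-1}               // updated covariance
// where z_k is the measured timestamp period for round k.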
// A simplified Kalman filter for estimating the timestamp period from measured values
float KalmanFilter(KalmanInfo* info, float measuredValue) {
// Compute the Kalman gain
info->kalmanGain = info->P / (info->P + info->R);
// Correct the filter value using the new measurement
info->filterValue =
info->kalmanGain * measuredValue + (1.0f - info->kalmanGain) * info->filterValue;
// Update the estimate covariance
info->P = (1.0f - info->kalmanGain) * info->P;
return info->filterValue;
}
void API_AVAILABLE(macos(10.15), ios(14))
UpdateTimestampPeriod(id<MTLDevice> device,
KalmanInfo* info,
MTLTimestamp* cpuTimestampStart,
MTLTimestamp* gpuTimestampStart,
float* timestampPeriod) {
// The filter value has converged to an optimal value when the Kalman gain is less than
// 0.01. At that point the weight of the measured value is too small to change the next
// filter value, so sampling and calculation no longer need to continue.
if (info->kalmanGain < 0.01f) {
return;
}
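// -[MTLDevice sampleTimestamps:gpuTimestamp:] samples the CPU and GPU clocks as a
// correlated pair; sampling them together is what lets the elapsed CPU time be
// divided by the elapsed GPU ticks below.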
MTLTimestamp cpuTimestampEnd = 0, gpuTimestampEnd = 0;
[device sampleTimestamps:&cpuTimestampEnd gpuTimestamp:&gpuTimestampEnd];
// Update the timestamp start values when a timestamp reset happens
if (cpuTimestampEnd < *cpuTimestampStart || gpuTimestampEnd < *gpuTimestampStart) {
*cpuTimestampStart = cpuTimestampEnd;
*gpuTimestampStart = gpuTimestampEnd;
return;
}
if (cpuTimestampEnd - *cpuTimestampStart >= kFilterIntervalInMs) {
// The measured timestamp period
float measurement = (cpuTimestampEnd - *cpuTimestampStart) /
static_cast<float>(gpuTimestampEnd - *gpuTimestampStart);
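// Illustrative numbers only: if 200'000'000ns of CPU time elapsed while the GPU
// timestamp advanced by 160'000'000 ticks, the measured period is
// 2e8 / 1.6e8 = 1.25ns per GPU tick.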
// Measurement update
*timestampPeriod = KalmanFilter(info, measurement);
*cpuTimestampStart = cpuTimestampEnd;
*gpuTimestampStart = gpuTimestampEnd;
}
}
} // namespace
// static
ResultOrError<Device*> Device::Create(AdapterBase* adapter,
NSPRef<id<MTLDevice>> mtlDevice,
const DawnDeviceDescriptor* descriptor) {
Ref<Device> device = AcquireRef(new Device(adapter, std::move(mtlDevice), descriptor));
DAWN_TRY(device->Initialize());
return device.Detach();
}
Device::Device(AdapterBase* adapter,
NSPRef<id<MTLDevice>> mtlDevice,
const DawnDeviceDescriptor* descriptor)
: DeviceBase(adapter, descriptor), mMtlDevice(std::move(mtlDevice)), mCompletedSerial(0) {
}
Device::~Device() {
Destroy();
}
MaybeError Device::Initialize() {
InitTogglesFromDriver();
mCommandQueue.Acquire([*mMtlDevice newCommandQueue]);
if (mCommandQueue == nil) {
return DAWN_INTERNAL_ERROR("Failed to allocate MTLCommandQueue.");
}
DAWN_TRY(mCommandContext.PrepareNextCommandBuffer(*mCommandQueue));
if (IsFeatureEnabled(Feature::TimestampQuery)) {
// Make a best guess of the timestamp period based on device vendor info, and converge
// it to an accurate value via the Kalman filter in UpdateTimestampPeriod().
mTimestampPeriod =
gpu_info::IsIntel(GetAdapter()->GetPCIInfo().vendorId) ? 83.333f : 1.0f;
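// A guess, not a spec guarantee: 83.333ns corresponds to a 12 MHz timestamp counter
// (1 / 12 MHz ≈ 83.333ns), a common Intel GPU timestamp frequency; other GPUs are
// assumed to tick in nanoseconds until the filter converges on the real period.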
// Initialize Kalman filter parameters
mKalmanInfo = std::make_unique<KalmanInfo>();
mKalmanInfo->filterValue = 0.0f;
mKalmanInfo->kalmanGain = 0.5f;
// The smaller R is, the lower the assumed measurement noise and the more the filter
// trusts the measured values.
mKalmanInfo->R = 0.0001f;
mKalmanInfo->P = 1.0f;
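// With these parameters the covariance follows 1/P_k = 1/P_0 + k/R, so P_k ≈ R/k and
// the gain K_k ≈ 1/k; the 0.01 convergence threshold in UpdateTimestampPeriod() is
// therefore reached after roughly 100 sampling rounds (about 10 seconds at the 100ms
// interval). A rough estimate, not a guarantee.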
if (@available(macOS 10.15, iOS 14.0, *)) {
// Sample the CPU and GPU timestamps for the first time at device creation
[*mMtlDevice sampleTimestamps:&mCpuTimestamp gpuTimestamp:&mGpuTimestamp];
}
}
return DeviceBase::Initialize(new Queue(this));
}
void Device::InitTogglesFromDriver() {
{
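// MTLStoreActionStoreAndMultisampleResolve lets a render pass store the MSAA
// attachment and resolve it in the same pass; it is gated on the feature sets checked
// below, and Dawn emulates it where unavailable.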
bool haveStoreAndMSAAResolve = false;
#if defined(DAWN_PLATFORM_MACOS)
if (@available(macOS 10.12, *)) {
haveStoreAndMSAAResolve =
[*mMtlDevice supportsFeatureSet:MTLFeatureSet_macOS_GPUFamily1_v2];
}
#elif defined(DAWN_PLATFORM_IOS)
haveStoreAndMSAAResolve =
[*mMtlDevice supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily3_v2];
#endif
// On tvOS, we would need MTLFeatureSet_tvOS_GPUFamily2_v1.
SetToggle(Toggle::EmulateStoreAndMSAAResolve, !haveStoreAndMSAAResolve);
bool haveSamplerCompare = true;
#if defined(DAWN_PLATFORM_IOS)
haveSamplerCompare = [*mMtlDevice supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily3_v1];
#endif
// TODO(crbug.com/dawn/342): Investigate emulation -- possibly expensive.
SetToggle(Toggle::MetalDisableSamplerCompare, !haveSamplerCompare);
bool haveBaseVertexBaseInstance = true;
#if defined(DAWN_PLATFORM_IOS)
haveBaseVertexBaseInstance =
[*mMtlDevice supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily3_v1];
#endif
// TODO(crbug.com/dawn/343): Investigate emulation.
SetToggle(Toggle::DisableBaseVertex, !haveBaseVertexBaseInstance);
SetToggle(Toggle::DisableBaseInstance, !haveBaseVertexBaseInstance);
}
// Vertex buffer robustness is implemented by using programmable vertex pulling. Enable
// that code path if it isn't explicitly disabled.
if (IsRobustnessEnabled()) {
SetToggle(Toggle::MetalEnableVertexPulling, true);
}
// TODO(crbug.com/dawn/846): tighten this workaround when the driver bug is fixed.
SetToggle(Toggle::AlwaysResolveIntoZeroLevelAndLayer, true);
const PCIInfo& pciInfo = GetAdapter()->GetPCIInfo();
// TODO(crbug.com/dawn/847): Use MTLStorageModeShared instead of MTLStorageModePrivate when
// creating MTLCounterSampleBuffer in QuerySet on Intel platforms, otherwise it fails to
// create the buffer. Change to use MTLStorageModePrivate when the bug is fixed.
if (@available(macOS 10.15, iOS 14.0, *)) {
bool useSharedMode = gpu_info::IsIntel(pciInfo.vendorId);
SetToggle(Toggle::MetalUseSharedModeForCounterSampleBuffer, useSharedMode);
}
// TODO(crbug.com/dawn/1071): r8unorm and rg8unorm textures with multiple mip levels don't
// clear properly on Intel Macs.
if (gpu_info::IsIntel(pciInfo.vendorId)) {
SetToggle(Toggle::DisableR8RG8Mipmaps, true);
}
// On some Intel GPUs, vertex-only render pipelines produce wrong depth results if no
// fragment shader is provided. Create a dummy fragment shader module to work around
// this issue.
if (gpu_info::IsIntel(pciInfo.vendorId)) {
bool useDummyFragmentShader = true;
if (gpu_info::IsSkylake(pciInfo.deviceId)) {
useDummyFragmentShader = false;
}
SetToggle(Toggle::UseDummyFragmentInVertexOnlyPipeline, useDummyFragmentShader);
}
}
ResultOrError<Ref<BindGroupBase>> Device::CreateBindGroupImpl(
const BindGroupDescriptor* descriptor) {
return BindGroup::Create(this, descriptor);
}
ResultOrError<Ref<BindGroupLayoutBase>> Device::CreateBindGroupLayoutImpl(
const BindGroupLayoutDescriptor* descriptor,
PipelineCompatibilityToken pipelineCompatibilityToken) {
return BindGroupLayout::Create(this, descriptor, pipelineCompatibilityToken);
}
ResultOrError<Ref<BufferBase>> Device::CreateBufferImpl(const BufferDescriptor* descriptor) {
return Buffer::Create(this, descriptor);
}
ResultOrError<Ref<CommandBufferBase>> Device::CreateCommandBuffer(
CommandEncoder* encoder,
const CommandBufferDescriptor* descriptor) {
return CommandBuffer::Create(encoder, descriptor);
}
Ref<ComputePipelineBase> Device::CreateUninitializedComputePipelineImpl(
const ComputePipelineDescriptor* descriptor) {
return ComputePipeline::CreateUninitialized(this, descriptor);
}
ResultOrError<Ref<PipelineLayoutBase>> Device::CreatePipelineLayoutImpl(
const PipelineLayoutDescriptor* descriptor) {
return PipelineLayout::Create(this, descriptor);
}
ResultOrError<Ref<QuerySetBase>> Device::CreateQuerySetImpl(
const QuerySetDescriptor* descriptor) {
return QuerySet::Create(this, descriptor);
}
Ref<RenderPipelineBase> Device::CreateUninitializedRenderPipelineImpl(
const RenderPipelineDescriptor* descriptor) {
return RenderPipeline::CreateUninitialized(this, descriptor);
}
ResultOrError<Ref<SamplerBase>> Device::CreateSamplerImpl(const SamplerDescriptor* descriptor) {
return Sampler::Create(this, descriptor);
}
ResultOrError<Ref<ShaderModuleBase>> Device::CreateShaderModuleImpl(
const ShaderModuleDescriptor* descriptor,
ShaderModuleParseResult* parseResult) {
return ShaderModule::Create(this, descriptor, parseResult);
}
ResultOrError<Ref<SwapChainBase>> Device::CreateSwapChainImpl(
const SwapChainDescriptor* descriptor) {
return OldSwapChain::Create(this, descriptor);
}
ResultOrError<Ref<NewSwapChainBase>> Device::CreateSwapChainImpl(
Surface* surface,
NewSwapChainBase* previousSwapChain,
const SwapChainDescriptor* descriptor) {
return SwapChain::Create(this, surface, previousSwapChain, descriptor);
}
ResultOrError<Ref<TextureBase>> Device::CreateTextureImpl(const TextureDescriptor* descriptor) {
return Texture::Create(this, descriptor);
}
ResultOrError<Ref<TextureViewBase>> Device::CreateTextureViewImpl(
TextureBase* texture,
const TextureViewDescriptor* descriptor) {
return TextureView::Create(texture, descriptor);
}
void Device::InitializeComputePipelineAsyncImpl(Ref<ComputePipelineBase> computePipeline,
WGPUCreateComputePipelineAsyncCallback callback,
void* userdata) {
ComputePipeline::InitializeAsync(std::move(computePipeline), callback, userdata);
}
void Device::InitializeRenderPipelineAsyncImpl(Ref<RenderPipelineBase> renderPipeline,
WGPUCreateRenderPipelineAsyncCallback callback,
void* userdata) {
RenderPipeline::InitializeAsync(std::move(renderPipeline), callback, userdata);
}
ResultOrError<ExecutionSerial> Device::CheckAndUpdateCompletedSerials() {
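// Note: mCompletedSerial is also advanced from the Metal completed handler in
// SubmitPendingCommandBuffer(), which runs on a different thread; that is why it is
// atomic (see the load() below).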
uint64_t frontendCompletedSerial{GetCompletedCommandSerial()};
if (frontendCompletedSerial > mCompletedSerial) {
// The frontend (DeviceBase) sometimes artificially increases the serials, in which
// case its completed serial will surpass the completed serial tracked here in the
// Metal backend, so ours must be updated whenever the frontend's has increased.
mCompletedSerial = frontendCompletedSerial;
}
return ExecutionSerial(mCompletedSerial.load());
}
MaybeError Device::TickImpl() {
DAWN_TRY(SubmitPendingCommandBuffer());
// Only run the timestamp period calculation when the timestamp query feature is
// enabled.
if (IsFeatureEnabled(Feature::TimestampQuery)) {
if (@available(macOS 10.15, iOS 14.0, *)) {
UpdateTimestampPeriod(GetMTLDevice(), mKalmanInfo.get(), &mCpuTimestamp,
&mGpuTimestamp, &mTimestampPeriod);
}
}
return {};
}
id<MTLDevice> Device::GetMTLDevice() {
return mMtlDevice.Get();
}
id<MTLCommandQueue> Device::GetMTLQueue() {
return mCommandQueue.Get();
}
CommandRecordingContext* Device::GetPendingCommandContext() {
mCommandContext.MarkUsed();
return &mCommandContext;
}
MaybeError Device::SubmitPendingCommandBuffer() {
if (!mCommandContext.WasUsed()) {
return {};
}
IncrementLastSubmittedCommandSerial();
// Acquire the pending command buffer, which is retained. It must be released later.
NSPRef<id<MTLCommandBuffer>> pendingCommands = mCommandContext.AcquireCommands();
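// mLastSubmittedCommands is tracked so that WaitForCommandsToBeScheduled() can wait
// on the most recent submission even after this function has returned.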
// Replace mLastSubmittedCommands with the mutex held so we avoid races between the
// schedule handler and this code.
{
std::lock_guard<std::mutex> lock(mLastSubmittedCommandsMutex);
mLastSubmittedCommands = pendingCommands;
}
// Make a local copy of the pointer to the commands because it's not clear how ObjC
// blocks handle types with copy / move constructors being referenced in the block.
id<MTLCommandBuffer> pendingCommandsPointer = pendingCommands.Get();
[*pendingCommands addScheduledHandler:^(id<MTLCommandBuffer>) {
// This is data-race free (DRF) because we hold the mutex for mLastSubmittedCommands
// and pendingCommands is a local value (and not the member itself).
std::lock_guard<std::mutex> lock(mLastSubmittedCommandsMutex);
if (this->mLastSubmittedCommands.Get() == pendingCommandsPointer) {
this->mLastSubmittedCommands = nullptr;
}
}];
// Update the completed serial once the completed handler is fired. Make a local copy of
// mLastSubmittedSerial so it is captured by value.
ExecutionSerial pendingSerial = GetLastSubmittedCommandSerial();
// This ObjC block runs on a different thread.
[*pendingCommands addCompletedHandler:^(id<MTLCommandBuffer>) {
TRACE_EVENT_ASYNC_END0(GetPlatform(), GPUWork, "DeviceMTL::SubmitPendingCommandBuffer",
uint64_t(pendingSerial));
ASSERT(uint64_t(pendingSerial) > mCompletedSerial.load());
this->mCompletedSerial = uint64_t(pendingSerial);
}];
TRACE_EVENT_ASYNC_BEGIN0(GetPlatform(), GPUWork, "DeviceMTL::SubmitPendingCommandBuffer",
uint64_t(pendingSerial));
[*pendingCommands commit];
return mCommandContext.PrepareNextCommandBuffer(*mCommandQueue);
}
ResultOrError<std::unique_ptr<StagingBufferBase>> Device::CreateStagingBuffer(size_t size) {
std::unique_ptr<StagingBufferBase> stagingBuffer =
std::make_unique<StagingBuffer>(size, this);
DAWN_TRY(stagingBuffer->Initialize());
return std::move(stagingBuffer);
}
MaybeError Device::CopyFromStagingToBuffer(StagingBufferBase* source,
uint64_t sourceOffset,
BufferBase* destination,
uint64_t destinationOffset,
uint64_t size) {
// Metal validation layers forbid 0-sized copies; assert that such copies are skipped
// before calling this function.
ASSERT(size != 0);
ToBackend(destination)
->EnsureDataInitializedAsDestination(GetPendingCommandContext(), destinationOffset,
size);
id<MTLBuffer> uploadBuffer = ToBackend(source)->GetBufferHandle();
id<MTLBuffer> buffer = ToBackend(destination)->GetMTLBuffer();
[GetPendingCommandContext()->EnsureBlit() copyFromBuffer:uploadBuffer
sourceOffset:sourceOffset
toBuffer:buffer
destinationOffset:destinationOffset
size:size];
return {};
}
// In Metal we cannot write from the CPU directly to the texture (which could
// otherwise be done with replaceRegion), because that function requires a non-private
// storage mode and Dawn sets the private storage mode by default for all textures
// except IOSurfaces on macOS.
MaybeError Device::CopyFromStagingToTexture(const StagingBufferBase* source,
const TextureDataLayout& dataLayout,
TextureCopy* dst,
const Extent3D& copySizePixels) {
Texture* texture = ToBackend(dst->texture.Get());
EnsureDestinationTextureInitialized(GetPendingCommandContext(), texture, *dst,
copySizePixels);
RecordCopyBufferToTexture(GetPendingCommandContext(), ToBackend(source)->GetBufferHandle(),
source->GetSize(), dataLayout.offset, dataLayout.bytesPerRow,
dataLayout.rowsPerImage, texture, dst->mipLevel, dst->origin,
dst->aspect, copySizePixels);
return {};
}
Ref<Texture> Device::CreateTextureWrappingIOSurface(const ExternalImageDescriptor* descriptor,
IOSurfaceRef ioSurface,
uint32_t plane) {
const TextureDescriptor* textureDescriptor =
reinterpret_cast<const TextureDescriptor*>(descriptor->cTextureDescriptor);
if (ConsumedError(ValidateTextureDescriptor(this, textureDescriptor))) {
return nullptr;
}
if (ConsumedError(
ValidateIOSurfaceCanBeWrapped(this, textureDescriptor, ioSurface, plane))) {
return nullptr;
}
Ref<Texture> result;
if (ConsumedError(Texture::CreateFromIOSurface(this, descriptor, ioSurface, plane),
&result)) {
return nullptr;
}
return result;
}
void Device::WaitForCommandsToBeScheduled() {
if (ConsumedError(SubmitPendingCommandBuffer())) {
return;
}
// Only lock the object while we take a reference to it, otherwise we could block further
// progress if the driver calls the scheduled handler (which also acquires the lock) before
// finishing the waitUntilScheduled.
NSPRef<id<MTLCommandBuffer>> lastSubmittedCommands;
{
std::lock_guard<std::mutex> lock(mLastSubmittedCommandsMutex);
lastSubmittedCommands = mLastSubmittedCommands;
}
[*lastSubmittedCommands waitUntilScheduled];
}
MaybeError Device::WaitForIdleForDestruction() {
// Forget all pending commands.
mCommandContext.AcquireCommands();
DAWN_TRY(CheckPassedSerials());
// Wait for all commands to be finished so we can free resources.
while (GetCompletedCommandSerial() != GetLastSubmittedCommandSerial()) {
usleep(100);
DAWN_TRY(CheckPassedSerials());
}
return {};
}
void Device::DestroyImpl() {
ASSERT(GetState() == State::Disconnected);
// Forget all pending commands.
mCommandContext.AcquireCommands();
mCommandQueue = nullptr;
mMtlDevice = nullptr;
}
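// The "optimal" alignments below are hints for callers laying out staging data; the
// Metal backend imposes no extra alignment, so the minimum (1) is reported for both.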
uint32_t Device::GetOptimalBytesPerRowAlignment() const {
return 1;
}
uint64_t Device::GetOptimalBufferToTextureCopyOffsetAlignment() const {
return 1;
}
float Device::GetTimestampPeriodInNS() const {
return mTimestampPeriod;
}
}} // namespace dawn_native::metal