diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index e26caa5..d185b21 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -1,7 +1,17 @@
 {
   "permissions": {
     "allow": [
-      "Bash(cat:*)"
+      "Bash(cat:*)",
+      "Bash(git checkout:*)",
+      "Bash(git add:*)",
+      "Bash(git commit:*)",
+      "Bash(git push:*)",
+      "Bash(gh pr create:*)",
+      "Bash(find:*)",
+      "Bash(ls:*)",
+      "Bash(grep:*)",
+      "Bash(mkdir:*)",
+      "Bash(mv:*)"
     ],
     "deny": []
   }
diff --git a/DepthCamera.xcodeproj/project.pbxproj b/DepthCamera.xcodeproj/project.pbxproj
index c60060e..ca45958 100644
--- a/DepthCamera.xcodeproj/project.pbxproj
+++ b/DepthCamera.xcodeproj/project.pbxproj
@@ -27,6 +27,10 @@
 		89B6173A2B14740B00280111 /* DepthCameraUITests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 89B617392B14740B00280111 /* DepthCameraUITests.swift */; };
 		89B6173C2B14740B00280111 /* DepthCameraUITestsLaunchTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 89B6173B2B14740B00280111 /* DepthCameraUITestsLaunchTests.swift */; };
 		89DE42E22CB3DECF00D43C0A /* color 2.glb in Resources */ = {isa = PBXBuildFile; fileRef = 89DE42E12CB3DECF00D43C0A /* color 2.glb */; };
+		89E4CE382E0268FF003F4665 /* model_fp16.onnx in Resources */ = {isa = PBXBuildFile; fileRef = 89E4CE372E0268FF003F4665 /* model_fp16.onnx */; };
+		89E4CE3A2E026908003F4665 /* PromptDADepthEstimator.swift in Sources */ = {isa = PBXBuildFile; fileRef = 89E4CE392E026908003F4665 /* PromptDADepthEstimator.swift */; };
+		89E4CE3D2E026963003F4665 /* onnxruntime in Frameworks */ = {isa = PBXBuildFile; productRef = 89E4CE3C2E026963003F4665 /* onnxruntime */; };
+		89E4CE3F2E026963003F4665 /* onnxruntime_extensions in Frameworks */ = {isa = PBXBuildFile; productRef = 89E4CE3E2E026963003F4665 /* onnxruntime_extensions */; };
 		AF6D6D435FA0400B15234CB6 /* Pods_DepthCamera_DepthCameraUITests.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8F9F679CF8B7EE44C3DAACAC /* Pods_DepthCamera_DepthCameraUITests.framework */; };
 		DB0AE426B16D9FA02B997FED /* Pods_DepthCamera.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = E227FAB191BC5550BAC2DB4A /* Pods_DepthCamera.framework */; };
 /* End PBXBuildFile section */
@@ -77,6 +81,8 @@
 		89B6174A2B14802900280111 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist; path = Info.plist; sourceTree = "<group>"; };
 		89B8A9712BB2D33D00FD7106 /* Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "Bridging-Header.h"; sourceTree = "<group>"; };
 		89DE42E12CB3DECF00D43C0A /* color 2.glb */ = {isa = PBXFileReference; lastKnownFileType = file; path = "color 2.glb"; sourceTree = "<group>"; };
+		89E4CE372E0268FF003F4665 /* model_fp16.onnx */ = {isa = PBXFileReference; lastKnownFileType = file; path = model_fp16.onnx; sourceTree = "<group>"; };
+		89E4CE392E026908003F4665 /* PromptDADepthEstimator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PromptDADepthEstimator.swift; sourceTree = "<group>"; };
 		8F9F679CF8B7EE44C3DAACAC /* Pods_DepthCamera_DepthCameraUITests.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_DepthCamera_DepthCameraUITests.framework; sourceTree = BUILT_PRODUCTS_DIR; };
 		AC19CA33E5E5796BB47E68BF /* Pods-DepthCameraTests.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-DepthCameraTests.release.xcconfig"; path = "Target Support Files/Pods-DepthCameraTests/Pods-DepthCameraTests.release.xcconfig"; sourceTree = "<group>"; };
 		B7D605FFB229CC868C730AE8 /* Pods-DepthCamera-DepthCameraUITests.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-DepthCamera-DepthCameraUITests.release.xcconfig"; path = "Target Support Files/Pods-DepthCamera-DepthCameraUITests/Pods-DepthCamera-DepthCameraUITests.release.xcconfig"; sourceTree = "<group>"; };
@@ -90,7 +96,9 @@
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				89E4CE3D2E026963003F4665 /* onnxruntime in Frameworks */,
 				DB0AE426B16D9FA02B997FED /* Pods_DepthCamera.framework in Frameworks */,
+				89E4CE3F2E026963003F4665 /* onnxruntime_extensions in Frameworks */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -147,6 +155,8 @@
 				8943625A2DF38CA900EC952D /* CaptureFileManager.swift */,
 				899DD70E2CF71376004FE108 /* CaptureButton.swift */,
 				8943625E2DF3919500EC952D /* DepthMapDetailView.swift */,
+				89E4CE372E0268FF003F4665 /* model_fp16.onnx */,
+				89E4CE392E026908003F4665 /* PromptDADepthEstimator.swift */,
 				899DD70C2CF7135D004FE108 /* DocumentPicker.swift */,
 				899DD70A2CF71336004FE108 /* ThumbnailView.swift */,
 				8943625C2DF38CBA00EC952D /* CaptureListView.swift */,
@@ -302,6 +312,9 @@
 				Base,
 			);
 			mainGroup = 89B617112B14740A00280111;
+			packageReferences = (
+				89E4CE3B2E026963003F4665 /* XCRemoteSwiftPackageReference "onnxruntime-swift-package-manager" */,
+			);
 			productRefGroup = 89B6171B2B14740A00280111 /* Products */;
 			projectDirPath = "";
 			projectRoot = "";
@@ -319,6 +332,7 @@
 			buildActionMask = 2147483647;
 			files = (
 				89DE42E22CB3DECF00D43C0A /* color 2.glb in Resources */,
+				89E4CE382E0268FF003F4665 /* model_fp16.onnx in Resources */,
 				89B617262B14740B00280111 /* Preview Assets.xcassets in Resources */,
 				89B617222B14740B00280111 /* Assets.xcassets in Resources */,
 			);
@@ -459,6 +473,7 @@
 				899DD70B2CF71336004FE108 /* ThumbnailView.swift in Sources */,
 				899DD70F2CF71376004FE108 /* CaptureButton.swift in Sources */,
 				89B6171E2B14740A00280111 /* DepthCameraApp.swift in Sources */,
+				89E4CE3A2E026908003F4665 /* PromptDADepthEstimator.swift in Sources */,
 				8943625D2DF38CBA00EC952D /* CaptureListView.swift in Sources */,
 				8943625F2DF3919500EC952D /* DepthMapDetailView.swift in Sources */,
 			);
@@ -843,6 +858,30 @@
 		defaultConfigurationName = Release;
 	};
 /* End XCConfigurationList section */
+
+/* Begin XCRemoteSwiftPackageReference section */
+		89E4CE3B2E026963003F4665 /* XCRemoteSwiftPackageReference "onnxruntime-swift-package-manager" */ = {
+			isa = XCRemoteSwiftPackageReference;
+			repositoryURL = "https://github.com/microsoft/onnxruntime-swift-package-manager";
+			requirement = {
+				kind = upToNextMajorVersion;
+				minimumVersion = 1.20.0;
+			};
+		};
+/* End XCRemoteSwiftPackageReference section */
+
+/* Begin XCSwiftPackageProductDependency section */
+		89E4CE3C2E026963003F4665 /* onnxruntime */ = {
+			isa = XCSwiftPackageProductDependency;
+			package = 89E4CE3B2E026963003F4665 /* XCRemoteSwiftPackageReference "onnxruntime-swift-package-manager" */;
+			productName = onnxruntime;
+		};
+		89E4CE3E2E026963003F4665 /* onnxruntime_extensions */ = {
+			isa = XCSwiftPackageProductDependency;
+			package = 89E4CE3B2E026963003F4665 /* XCRemoteSwiftPackageReference "onnxruntime-swift-package-manager" */;
+			productName = onnxruntime_extensions;
+		};
+/* End XCSwiftPackageProductDependency section */
 	};
 	rootObject = 89B617122B14740A00280111 /* Project object */;
 }
diff --git a/DepthCamera.xcodeproj/xcshareddata/xcschemes/DepthCamera.xcscheme b/DepthCamera.xcodeproj/xcshareddata/xcschemes/DepthCamera.xcscheme
new file mode 100644
index 0000000..a867f95
--- /dev/null
+++ b/DepthCamera.xcodeproj/xcshareddata/xcschemes/DepthCamera.xcscheme
@@ -0,0 +1,102 @@
diff --git a/DepthCamera.xcodeproj/xcuserdata/iori.xcuserdatad/xcschemes/xcschememanagement.plist b/DepthCamera.xcodeproj/xcuserdata/iori.xcuserdatad/xcschemes/xcschememanagement.plist
index 27f9fc5..47099d1 100644
--- a/DepthCamera.xcodeproj/xcuserdata/iori.xcuserdatad/xcschemes/xcschememanagement.plist
+++ b/DepthCamera.xcodeproj/xcuserdata/iori.xcuserdatad/xcschemes/xcschememanagement.plist
@@ -10,5 +10,23 @@
 			<integer>4</integer>
 		</dict>
 	</dict>
+	<key>SuppressBuildableAutocreation</key>
+	<dict>
+		<key>89B617192B14740A00280111</key>
+		<dict>
+			<key>primary</key>
+			<true/>
+		</dict>
+		<key>89B6172A2B14740B00280111</key>
+		<dict>
+			<key>primary</key>
+			<true/>
+		</dict>
+		<key>89B617342B14740B00280111</key>
+		<dict>
+			<key>primary</key>
+			<true/>
+		</dict>
+	</dict>
 </dict>
 </plist>
diff --git a/DepthCamera.xcworkspace/xcshareddata/swiftpm/Package.resolved b/DepthCamera.xcworkspace/xcshareddata/swiftpm/Package.resolved
new file mode 100644
index 0000000..c35c292
--- /dev/null
+++ b/DepthCamera.xcworkspace/xcshareddata/swiftpm/Package.resolved
@@ -0,0 +1,15 @@
+{
+  "originHash" : "54a530a8a66c1ee0cc1c6dd34159a96243f09d436e10ae5178973082bff1ef28",
+  "pins" : [
+    {
+      "identity" : "onnxruntime-swift-package-manager",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/microsoft/onnxruntime-swift-package-manager",
+      "state" : {
+        "revision" : "12ce7374c86944e1f68f3a866d10105d8357f074",
+        "version" : "1.20.0"
+      }
+    }
+  ],
+  "version" : 3
+}
diff --git a/DepthCamera/ARViewModel.swift b/DepthCamera/ARViewModel.swift
index 2a8078c..8c50e5d 100644
--- a/DepthCamera/ARViewModel.swift
+++ b/DepthCamera/ARViewModel.swift
@@ -23,35 +23,89 @@ class ARViewModel: NSObject, ARSessionDelegate, ObservableObject {
     }
     @Published var lastCaptureURL: URL?
 
+    // PromptDA depth estimation
+    private var depthEstimator: PromptDADepthEstimator?
+    @Published var usePromptDA: Bool = true
+
     private var lastDepthUpdate: TimeInterval = 0
-    private let depthUpdateInterval: TimeInterval = 0.1 // 10 fps (1/10 s)
+    private let depthUpdateInterval: TimeInterval = 0.5 // 2 fps for PromptDA preview
+
+    override init() {
+        super.init()
+        // Initialize PromptDA
+        depthEstimator = PromptDADepthEstimator()
+        print("ARViewModel: PromptDA depth estimator initialized")
+    }
 
     func session(_ session: ARSession, didUpdate frame: ARFrame) {
-        latestDepthMap = frame.sceneDepth?.depthMap
         latestImage = frame.capturedImage
 
         let currentTime = CACurrentMediaTime()
        if currentTime - lastDepthUpdate >= depthUpdateInterval {
            lastDepthUpdate = currentTime // update the timer
 
-            // Process and display the depth map
-            if showDepthMap, let depthMap = frame.sceneDepth?.depthMap {
-                processDepthMap(depthMap)
-            }
-
-            // Process and display the confidence map
-            if showConfidenceMap, let confidenceMap = frame.sceneDepth?.confidenceMap {
-                processConfidenceMap(confidenceMap)
+            // Use PromptDA for depth estimation if enabled
+            if usePromptDA, let estimator = depthEstimator {
+                if estimator.shouldProcessFrame(timestamp: frame.timestamp) {
+                    print("ARViewModel: Using PromptDA for depth estimation")
+
+                    // For preview, don't resize to save performance
+                    if let promptDADepth = estimator.estimateDepth(from: frame.capturedImage, lidarDepth: frame.sceneDepth?.depthMap, resizeToOriginal: false) {
+                        latestDepthMap = promptDADepth
+                        print("ARViewModel: PromptDA depth estimation successful")
+                        processDepthMap(promptDADepth)
+
+                        // Also create confidence map from PromptDA
+                        if showConfidenceMap, let confMap = estimator.createConfidenceMap(from: promptDADepth) {
+                            processConfidenceMap(confMap)
+                        }
+                    } else {
+                        print("ARViewModel: PromptDA depth estimation failed, falling back to ARKit depth")
+                        // Fall back to ARKit depth
+                        latestDepthMap = frame.sceneDepth?.depthMap
+                        if showDepthMap, let depthMap = frame.sceneDepth?.depthMap {
+                            processDepthMap(depthMap)
+                        }
+                        if showConfidenceMap, let confidenceMap = frame.sceneDepth?.confidenceMap {
+                            processConfidenceMap(confidenceMap)
+                        }
+                    }
+                } else {
+                    // Not time to process with PromptDA, use ARKit depth
+                    latestDepthMap = frame.sceneDepth?.depthMap
+                    if showDepthMap, let depthMap = frame.sceneDepth?.depthMap {
+                        processDepthMap(depthMap)
+                    }
+                    if showConfidenceMap, let confidenceMap = frame.sceneDepth?.confidenceMap {
+                        processConfidenceMap(confidenceMap)
+                    }
+                }
+            } else {
+                // PromptDA disabled or not initialized, use ARKit depth
+                latestDepthMap = frame.sceneDepth?.depthMap
+                if showDepthMap, let depthMap = frame.sceneDepth?.depthMap {
+                    processDepthMap(depthMap)
+                }
+                if showConfidenceMap, let confidenceMap = frame.sceneDepth?.confidenceMap {
+                    processConfidenceMap(confidenceMap)
+                }
             }
         }
-    }
 
     func saveDepthMap() {
+        // Simply use the current latestDepthMap that was used for preview
         guard let depthMap = latestDepthMap, let image = latestImage else {
             print("Depth map or image is not available.")
             return
         }
+        // Debug: log the size of the depth map being saved
+        let saveWidth = CVPixelBufferGetWidth(depthMap)
+        let saveHeight = CVPixelBufferGetHeight(depthMap)
+        let saveBytesPerRow = CVPixelBufferGetBytesPerRow(depthMap)
+        print("Saving depth map: \(saveWidth)x\(saveHeight), bytesPerRow=\(saveBytesPerRow)")
+
         let documentsDir = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask).first!
         let dateFormatter = DateFormatter()
         dateFormatter.dateFormat = "yyyyMMdd"
@@ -66,10 +120,13 @@
         }
 
         let timestamp = Date().timeIntervalSince1970
-        let depthFileURL = dateDirURL.appendingPathComponent("\(timestamp)_depth.tiff")
+        let depthFileURL = dateDirURL.appendingPathComponent("\(timestamp)_depth.png")
+        let depthTiffURL = dateDirURL.appendingPathComponent("\(timestamp)_depth.tiff")
         let imageFileURL = dateDirURL.appendingPathComponent("\(timestamp)_image.jpg")
 
-        writeDepthMapToTIFFWithLibTIFF(depthMap: depthMap, url: depthFileURL)
+        // Save as both PNG and TIFF for comparison
+        writeDepthMapTo16BitPNG(depthMap: depthMap, url: depthFileURL)
+        writeDepthMapToTIFFWithLibTIFF(depthMap: depthMap, url: depthTiffURL)
 
         saveImage(image: image, url: imageFileURL)
@@ -93,6 +150,7 @@
 
         print("Depth map saved to \(depthFileURL)")
+        print("Depth TIFF saved to \(depthTiffURL)")
         print("Image saved to \(imageFileURL)")
     }
 }
@@ -142,15 +200,20 @@
         let width = CVPixelBufferGetWidth(depthMap)
         let height = CVPixelBufferGetHeight(depthMap)
+        let bytesPerRow = CVPixelBufferGetBytesPerRow(depthMap)
 
         var normalizedData = [UInt8](repeating: 0, count: width * height * 4)
-        let buffer = CVPixelBufferGetBaseAddress(depthMap)?.assumingMemoryBound(to: Float32.self)
+        let buffer = CVPixelBufferGetBaseAddress(depthMap)
+        let floatsPerRow = bytesPerRow / MemoryLayout<Float32>.size
 
         for y in 0..
+
+func writeDepthMapTo16BitPNG(depthMap: CVPixelBuffer, url: URL) -> Bool {
+    let width = CVPixelBufferGetWidth(depthMap)
+    let height = CVPixelBufferGetHeight(depthMap)
+
+    CVPixelBufferLockBaseAddress(depthMap, .readOnly)
+    defer { CVPixelBufferUnlockBaseAddress(depthMap, .readOnly) }
+
+    guard let baseAddress = CVPixelBufferGetBaseAddress(depthMap) else {
+        return false
+    }
+
+    let bytesPerRow = CVPixelBufferGetBytesPerRow(depthMap)
+
+    // Create 16-bit grayscale image
+    var pixelData = [UInt16](repeating: 0, count: width * height)
+
+    // Find min/max for normalization
+    var minDepth: Float = Float.greatestFiniteMagnitude
+    var maxDepth: Float = -Float.greatestFiniteMagnitude
+
+    for y in 0..<height {
+        let rowPtr = baseAddress.advanced(by: y * bytesPerRow).assumingMemoryBound(to: Float32.self)
+        for x in 0..<width {
+            let depth = rowPtr[x]
+            if !depth.isNaN && !depth.isInfinite && depth > 0 {
+                minDepth = min(minDepth, depth)
+                maxDepth = max(maxDepth, depth)
+            }
+        }
+    }
+
+    // Fallback if no valid depths found
+    if minDepth == Float.greatestFiniteMagnitude {
+        minDepth = 0
+        maxDepth = 5
+    }
+
+    print("Depth range: \(minDepth) - \(maxDepth) meters")
+
+    // Convert to 16-bit - create a simple gradient for testing
+    let useTestPattern = false // Set to true for debugging
+
+    for y in 0..
 
 func writeDepthMapToTIFFWithLibTIFF(depthMap: CVPixelBuffer, url: URL) -> Bool {
     let width = CVPixelBufferGetWidth(depthMap)
     let height = CVPixelBufferGetHeight(depthMap)
+    let pixelFormat = CVPixelBufferGetPixelFormatType(depthMap)
 
     CVPixelBufferLockBaseAddress(depthMap, CVPixelBufferLockFlags(rawValue: 0))
     guard let baseAddress = CVPixelBufferGetBaseAddress(depthMap) else {
@@ -213,10 +353,10 @@
     }
 
     for y in 0..(start: pixelBytes.assumingMemoryBound(to: Float.self), count: width)
+        let rowPtr = baseAddress.advanced(by: y * bytesPerRow).assumingMemoryBound(to: Float32.self)
         for x in 0..
diff --git a/DepthCamera/PromptDADepthEstimator.swift b/DepthCamera/PromptDADepthEstimator.swift
new file mode 100644
+    func estimateDepth(from rgbImage: CVPixelBuffer, lidarDepth: CVPixelBuffer?, resizeToOriginal: Bool) -> CVPixelBuffer? {
+        // Initialize session on background queue if needed
+        if !sessionInitialized {
+            do {
+                try sessionQueue.sync {
+                    try initializeSessionIfNeeded()
+                }
+            } catch {
+                print("PromptDADepthEstimator: Failed to initialize session: \(error)")
+                return nil
+            }
+        }
+
+        guard let session = ortSession else {
+            print("PromptDADepthEstimator: Session not initialized")
+            return nil
+        }
+
+        do {
+            // Prepare RGB input
+            let rgbTensor = try prepareRGBInput(rgbImage)
+
+            // Prepare LiDAR prompt
+            let lidarTensor = try prepareLiDARInput(lidarDepth)
+
+            // Run inference with correct input names
+            let inputs: [String: ORTValue] = [
+                "pixel_values": rgbTensor,
+                "prompt_depth": lidarTensor
+            ]
+
+            // First, get output names from the session
+            let outputNames = try session.outputNames()
+            print("PromptDADepthEstimator: Available output names: \(outputNames)")
+
+            // Run inference - the API returns outputs automatically
+            let outputs = try session.run(withInputs: inputs,
+                                          outputNames: Set(outputNames),
+                                          runOptions: nil)
+
+            // Debug: Print output names
+            print("PromptDADepthEstimator: Output names: \(outputs.keys)")
+
+            // Get the first output (usually the depth map)
+            guard let depthOutput = outputs.values.first else {
+                print("PromptDADepthEstimator: No output from model")
+                return nil
+            }
+
+            // Convert output to CVPixelBuffer
+            let depthBuffer = try convertToPixelBuffer(depthOutput)
+
+            // Resize to original image size if requested
+            if resizeToOriginal {
+                let originalSize = CGSize(width: CVPixelBufferGetWidth(rgbImage),
+                                          height: CVPixelBufferGetHeight(rgbImage))
+                let outputSize = CGSize(width: CVPixelBufferGetWidth(depthBuffer),
+                                        height: CVPixelBufferGetHeight(depthBuffer))
+
+                if originalSize != outputSize {
+                    print("PromptDADepthEstimator: Resizing output from \(outputSize) to \(originalSize)")
+                    return resizeDepthBuffer(depthBuffer, to: originalSize)
+                }
+            }
+
+            return depthBuffer
+
+        } catch {
+            print("PromptDADepthEstimator: Error during inference: \(error)")
+            return nil
+        }
+    }
+
+    private func prepareRGBInput(_ pixelBuffer: CVPixelBuffer) throws -> ORTValue {
+        // Resize to model input size
+        let resizedBuffer = resizePixelBuffer(pixelBuffer, to: modelSize)
+
+        // Convert to normalized float array with shape [1, 3, 192, 256]
+        let width = Int(modelSize.width)
+        let height = Int(modelSize.height)
+        var floatData = [Float](repeating: 0, count: 1 * 3 * height * width)
+
+        CVPixelBufferLockBaseAddress(resizedBuffer, .readOnly)
+        defer { CVPixelBufferUnlockBaseAddress(resizedBuffer, .readOnly) }
+
+        let baseAddress = CVPixelBufferGetBaseAddress(resizedBuffer)!
+        let bytesPerRow = CVPixelBufferGetBytesPerRow(resizedBuffer)
+
+        // ImageNet normalization constants
+        let mean: [Float] = [0.485, 0.456, 0.406]
+        let std: [Float] = [0.229, 0.224, 0.225]
+
+        // Convert BGRA to normalized RGB
+        for y in 0...size)
+        return try ORTValue(tensorData: tensorData,
+                            elementType: .float,
+                            shape: [NSNumber(value: 1), NSNumber(value: 3), NSNumber(value: height), NSNumber(value: width)])
+    }
+
+    private func prepareLiDARInput(_ lidarBuffer: CVPixelBuffer?) throws -> ORTValue {
+        let width = Int(modelSize.width)
+        let height = Int(modelSize.height)
+        var floatData = [Float](repeating: 0, count: 1 * 1 * height * width)
+
+        if let lidar = lidarBuffer {
+            // Resize LiDAR depth to model size
+            let resizedLidar = resizeDepthBuffer(lidar, to: modelSize)
+
+            CVPixelBufferLockBaseAddress(resizedLidar, .readOnly)
+            defer { CVPixelBufferUnlockBaseAddress(resizedLidar, .readOnly) }
+
+            let baseAddress = CVPixelBufferGetBaseAddress(resizedLidar)!
+            let lidarWidth = CVPixelBufferGetWidth(resizedLidar)
+            let lidarHeight = CVPixelBufferGetHeight(resizedLidar)
+            let bytesPerRow = CVPixelBufferGetBytesPerRow(resizedLidar)
+
+            // Copy depth values (assuming Float32 format)
+            for y in 0...size)
+        return try ORTValue(tensorData: tensorData,
+                            elementType: .float,
+                            shape: [NSNumber(value: 1), NSNumber(value: 1), NSNumber(value: height), NSNumber(value: width)])
+    }
+
+    private func convertToPixelBuffer(_ ortValue: ORTValue) throws -> CVPixelBuffer {
+        let tensorData = try ortValue.tensorData() as NSData
+        let tensorInfo = try ortValue.tensorTypeAndShapeInfo()
+        let shape = tensorInfo.shape
+        let shapeCount = shape.count
+        let shape0 = shape.count > 0 ? Int(shape[0]) : 0
+        let shape1 = shape.count > 1 ? Int(shape[1]) : 0
+        let shape2 = shape.count > 2 ? Int(shape[2]) : 0
+
+        print("PromptDADepthEstimator: Output tensor shape: \(shape)")
+
+        // Handle different output shapes
+        let height: Int
+        let width: Int
+        let needsTranspose: Bool
+
+        if shape.count == 4 {
+            // Shape is [batch, channels, height, width]
+            height = Int(shape[2])
+            width = Int(shape[3])
+            needsTranspose = false
+        } else if shape.count == 3 {
+            // Shape is [batch, height, width] or [channels, height, width]
+            if Int(shape[0]) == 1 {
+                // [1, height, width] - PromptDA outputs [1, 182, 252]
+                height = Int(shape[1]) // 182
+                width = Int(shape[2]) // 252
+                needsTranspose = false
+            } else {
+                // Assume [height, width, channels]
+                height = Int(shape[0])
+                width = Int(shape[1])
+                needsTranspose = false
+            }
+        } else if shape.count == 2 {
+            // Shape is [height, width]
+            height = Int(shape[0])
+            width = Int(shape[1])
+            needsTranspose = false
+        } else {
+            throw NSError(domain: "PromptDADepthEstimator", code: -3,
+                          userInfo: [NSLocalizedDescriptionKey: "Unexpected output shape: \(shape)"])
+        }
+
+        print("PromptDADepthEstimator: Creating CVPixelBuffer with width=\(width), height=\(height)")
+
+        // Create output pixel buffer
+        var pixelBuffer: CVPixelBuffer?
+        let attributes: [String: Any] = [
+            kCVPixelBufferIOSurfacePropertiesKey as String: [:] as CFDictionary,
+            kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_DepthFloat32
+        ]
+
+        let status = CVPixelBufferCreate(kCFAllocatorDefault,
+                                         width, height,
+                                         kCVPixelFormatType_DepthFloat32,
+                                         attributes as CFDictionary,
+                                         &pixelBuffer)
+
+        guard status == kCVReturnSuccess, let buffer = pixelBuffer else {
+            throw NSError(domain: "PromptDADepthEstimator", code: -2,
+                          userInfo: [NSLocalizedDescriptionKey: "Failed to create output pixel buffer"])
+        }
+
+        CVPixelBufferLockBaseAddress(buffer, [])
+        defer { CVPixelBufferUnlockBaseAddress(buffer, []) }
+
+        let destBaseAddress = CVPixelBufferGetBaseAddress(buffer)!
+        let destBytesPerRow = CVPixelBufferGetBytesPerRow(buffer)
+        let srcPtr = tensorData.bytes.assumingMemoryBound(to: Float32.self)
+
+
+        // Copy data row by row to handle potential padding
+        for y in 0...size)
+        }
+
+        return buffer
+    }
+
+    private func resizePixelBuffer(_ pixelBuffer: CVPixelBuffer, to size: CGSize) -> CVPixelBuffer {
+        let ciImage = CIImage(cvPixelBuffer: pixelBuffer)
+        let scaleX = size.width / CGFloat(CVPixelBufferGetWidth(pixelBuffer))
+        let scaleY = size.height / CGFloat(CVPixelBufferGetHeight(pixelBuffer))
+        let scaledImage = ciImage.transformed(by: CGAffineTransform(scaleX: scaleX, y: scaleY))
+
+        var resizedBuffer: CVPixelBuffer?
+        CVPixelBufferCreate(kCFAllocatorDefault,
+                            Int(size.width), Int(size.height),
+                            CVPixelBufferGetPixelFormatType(pixelBuffer),
+                            nil,
+                            &resizedBuffer)
+
+        if let buffer = resizedBuffer {
+            context.render(scaledImage, to: buffer)
+        }
+
+        return resizedBuffer!
+    }
+
+    private func resizeDepthBuffer(_ depthBuffer: CVPixelBuffer, to size: CGSize) -> CVPixelBuffer {
+        let srcWidth = CVPixelBufferGetWidth(depthBuffer)
+        let srcHeight = CVPixelBufferGetHeight(depthBuffer)
+        let dstWidth = Int(size.width)
+        let dstHeight = Int(size.height)
+
+        // Create destination buffer
+        var dstBuffer: CVPixelBuffer?
+        let attributes: [String: Any] = [
+            kCVPixelBufferIOSurfacePropertiesKey as String: [:] as CFDictionary,
+            kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_DepthFloat32
+        ]
+
+        CVPixelBufferCreate(kCFAllocatorDefault,
+                            dstWidth, dstHeight,
+                            kCVPixelFormatType_DepthFloat32,
+                            attributes as CFDictionary,
+                            &dstBuffer)
+
+        guard let destBuffer = dstBuffer else { return depthBuffer }
+
+        CVPixelBufferLockBaseAddress(depthBuffer, .readOnly)
+        CVPixelBufferLockBaseAddress(destBuffer, [])
+        defer {
+            CVPixelBufferUnlockBaseAddress(depthBuffer, .readOnly)
+            CVPixelBufferUnlockBaseAddress(destBuffer, [])
+        }
+
+        let srcPtr = CVPixelBufferGetBaseAddress(depthBuffer)!.assumingMemoryBound(to: Float32.self)
+        let dstPtr = CVPixelBufferGetBaseAddress(destBuffer)!.assumingMemoryBound(to: Float32.self)
+
+        // Simple nearest neighbor resize
+        for y in 0..
+    func createConfidenceMap(from depthMap: CVPixelBuffer) -> CVPixelBuffer? {
+        let width = CVPixelBufferGetWidth(depthMap)
+        let height = CVPixelBufferGetHeight(depthMap)
+
+        var confidenceMap: CVPixelBuffer?
+        let attributes: [String: Any] = [
+            kCVPixelBufferCGImageCompatibilityKey as String: kCFBooleanTrue!,
+            kCVPixelBufferCGBitmapContextCompatibilityKey as String: kCFBooleanTrue!
+        ]
+
+        let status = CVPixelBufferCreate(kCFAllocatorDefault,
+                                         width, height,
+                                         kCVPixelFormatType_OneComponent8,
+                                         attributes as CFDictionary,
+                                         &confidenceMap)
+
+        guard status == kCVReturnSuccess, let buffer = confidenceMap else {
+            return nil
+        }
+
+        CVPixelBufferLockBaseAddress(buffer, [])
+        defer { CVPixelBufferUnlockBaseAddress(buffer, []) }
+
+        let baseAddress = CVPixelBufferGetBaseAddress(buffer)!
+        let bytesPerRow = CVPixelBufferGetBytesPerRow(buffer)
+
+        // Set all pixels to maximum confidence (255)
+        for y in 0..
+    func shouldProcessFrame(timestamp: TimeInterval) -> Bool {
+        // Always process first frame
+        if lastProcessedTimestamp < 0 {
+            lastProcessedTimestamp = timestamp
+            print("PromptDADepthEstimator: Processing first frame!")
+            return true
+        }
+
+        let timeDiff = timestamp - lastProcessedTimestamp
+        print("PromptDADepthEstimator: timestamp=\(timestamp), lastProcessed=\(lastProcessedTimestamp), diff=\(timeDiff), interval=\(processingInterval)")
+        if timeDiff >= processingInterval {
+            lastProcessedTimestamp = timestamp
+            print("PromptDADepthEstimator: Processing frame!")
+            return true
+        }
+        print("PromptDADepthEstimator: Skipping frame (too soon)")
+        return false
+    }
+}
diff --git a/DepthCamera/model_fp16.onnx b/DepthCamera/model_fp16.onnx
new file mode 100644
index 0000000..1a3f56a
Binary files /dev/null and b/DepthCamera/model_fp16.onnx differ
diff --git a/README_PromptDA.md b/README_PromptDA.md
new file mode 100644
index 0000000..e734830
--- /dev/null
+++ b/README_PromptDA.md
@@ -0,0 +1,58 @@
+# PromptDA Integration for DepthCamera
+
+This document describes how to integrate PromptDA depth estimation into the DepthCamera project.
+
+## Setup Instructions
+
+### 1. Add ONNX Runtime via Swift Package Manager
+
+1. Open `DepthCamera.xcworkspace` in Xcode
+2. Go to File → Add Package Dependencies
+3. Add the ONNX Runtime package:
+   - URL: `https://github.com/microsoft/onnxruntime-swift-package-manager`
+   - Version: 1.20.0 or latest
+
+### 2. Add Files to Xcode Project
+
+The following files have been added to the project:
+- `PromptDADepthEstimator.swift` - The depth estimation class
+- `model_fp16.onnx` - The PromptDA model file
+
+Make sure to:
+1. Add both files to the Xcode project
+2. Ensure `model_fp16.onnx` is included in the app bundle (check Target Membership)
+
+### 3. Build and Run
+
+1. Select a real device with LiDAR (iPhone 12 Pro or later, or an iPad Pro with LiDAR)
+2. Build and run the project
+3. Use the "PromptDA" toggle button to switch between ARKit depth and PromptDA depth estimation
+
+## Usage
+
+The app now has three toggle buttons at the top:
+- **Depth**: Show/hide depth visualization
+- **PromptDA**: Enable/disable PromptDA depth estimation (purple button)
+- **Confidence**: Show/hide confidence map
+
+When PromptDA is enabled:
+- The app will use the PromptDA model to estimate depth from RGB images
+- LiDAR data is used as a sparse depth prompt to improve accuracy
+- Processing is limited to 2 FPS to maintain performance
+
+## Debugging
+
+Look for these log messages:
+- `ARViewModel: PromptDA depth estimator initialized`
+- `ARViewModel: Using PromptDA for depth estimation`
+- `ARViewModel: PromptDA depth estimation successful`
+- `PromptDADepthEstimator: Model input names: [...]`
+- `PromptDADepthEstimator: Processing first frame!`
+
+## Troubleshooting
+
+If depth estimation isn't working:
+1. Check that the ONNX model file is included in the bundle
+2. Look for error messages about invalid input names
+3. Verify that the device has enough memory to run the model
+4. Try reducing the frame rate by modifying `processingInterval` in PromptDADepthEstimator.swift
\ No newline at end of file
diff --git a/docs/depth-map-processing-issue-report.md b/docs/depth-map-processing-issue-report.md
new file mode 100644
index 0000000..ab235dc
--- /dev/null
+++ b/docs/depth-map-processing-issue-report.md
@@ -0,0 +1,156 @@
+# Depth Map Processing Issue Report
+
+## Overview
+
+This report documents the investigation and resolution of an issue with depth map processing and saving in the DepthCamera application.
+
+**Date**: June 23, 2025
+**Scope**: Image saving when PromptDA depth estimation is used
+**Status**: Resolved
+
+## Symptoms
+
+### Initial problems
+1. **Crash during Float to UInt8 conversion**
+   - When the subject was too close, the app crashed with `Thread 1: Fatal error: Float value cannot be converted to UInt8 because it is either infinite or NaN`
+   - NaN and infinite depth values were not handled
+
+2. **Corrupted saved images**
+   - Diagonal streaks appeared in the TIFF file
+   - The PNG image was completely garbled
+   - The preview displayed correctly; only saving was affected
+
+### Details found during the investigation
+- PromptDA output: 252×182 pixels (landscape)
+- ARKit depth map: 192×256 pixels (portrait)
+- bytesPerRow includes padding (e.g. 1024 bytes vs. the expected 1008 bytes)
+
+## Investigation
+
+### Phase 1: Handling NaN/infinite values
+The first problem addressed was the crash when converting depth values.
+
+```swift
+// Before
+let pixel = UInt8(normalizedDepth * 255.0)
+
+// After
+let validDepth = depth.isNaN || depth.isInfinite ? 0.0 : depth
+let normalizedDepth = min(max(validDepth / 5.0, 0.0), 1.0)
+let pixel = UInt8(normalizedDepth * 255.0)
+```
+
+### Phase 2: Investigating the corrupted saved images
+
+1. **First hypothesis**: resolution mismatch
+   - PromptDA and ARKit produce different sizes
+   - Added a resize step → did not solve the problem
+
+2. **Second hypothesis**: incorrect bytesPerRow handling
+   - Changed the code to account for row padding
+   - The preview already handled this correctly, but the save functions did not
+
+3. **Third hypothesis**: misinterpreted tensor shape
+   - Interpreted [1, 182, 252] as [batch, width, height]
+   - Added a transpose step → made things worse
+
+## Root cause
+
+The root cause was that **the preview and the save path used different depth maps**.
+
+### Problematic code structure
+
+```swift
+// Preview path
+if let promptDADepth = estimator.estimateDepth(...) {
+    latestDepthMap = promptDADepth  // cache it
+    processDepthMap(promptDADepth)  // display it
+}
+
+// Save path (problematic implementation)
+func saveDepthMap() {
+    if let promptDADepth = estimator.estimateDepth(...) {  // generated again!
+        depthMapToSave = promptDADepth
+    }
+}
+```
+
+### Details
+1. The depth map produced by PromptDA for the preview was stored in `latestDepthMap`
+2. On save, PromptDA was run **again** to produce a new depth map
+3. Because the two maps were generated at different times and under different conditions, the data was inconsistent
+
+## Solution
+
+### Final fix
+
+```swift
+func saveDepthMap() {
+    // Simply use the current latestDepthMap that was used for preview
+    guard let depthMap = latestDepthMap, let image = latestImage else {
+        print("Depth map or image is not available.")
+        return
+    }
+    // ...saving continues below
+}
+```
+
+Consistency is ensured by saving exactly the depth map that was used for the preview.
+
+### Other improvements
+
+1. **Added a 16-bit PNG save option**
+   - In addition to TIFF, saving in the more widely supported PNG format was implemented
+   - Includes automatic normalization of the depth range
+
+2. **Unified bytesPerRow handling**
+   ```swift
+   let floatsPerRow = bytesPerRow / MemoryLayout<Float32>.size
+   let depth = floatBuffer?[y * floatsPerRow + x]
+   ```
+
+3. **Added debug logging**
+   - Detailed log output to make future investigations easier
+
+## Technical lessons
+
+### 1. CVPixelBuffer memory layout
+- bytesPerRow can be larger than the image width implies (because of alignment)
+- Correct index calculation is essential
+
+### 2. Asynchronous processing and data consistency
+- The preview and the save path must use the same data source
+- Cached data needs to be managed carefully
+
+### 3. The value of debugging
+- Narrow the problem down step by step
+- Iterate between hypothesis and verification
+
+## Future improvements
+
+1. **Stronger error handling**
+   - More thorough validation of depth values
+   - A retry mechanism when saving fails
+
+2. **Performance optimization**
+   - Avoid running depth estimation unnecessarily
+   - Optimize memory usage
+
+3. **Better user experience**
+   - UI for choosing the save format
+   - Detailed feedback on save success or failure
+
+4. **Code cleanup**
+   - Move depth map processing into a dedicated class
+   - Add tests
+
+## Summary
+
+Although this issue looked complex at first, the root cause was simple: the preview and the save path used different data. The main lessons:
+
+1. Always keep **data consistency** in mind
+2. **Narrow down** the problem step by step
+3. **Debug logs** are a powerful problem-solving tool
+4. Consider the **simplest solution** first
+
+With this fix, depth estimation with PromptDA now works reliably.
\ No newline at end of file
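
As a supplement to the report above, here is a minimal sketch of the row-stride-aware read pattern it recommends. It assumes a `kCVPixelFormatType_DepthFloat32` buffer; the function name `copyDepthValues` and its return type are illustrative only and are not part of the project.

```swift
import CoreVideo

/// Minimal sketch: read every Float32 depth value from a CVPixelBuffer while
/// respecting bytesPerRow (row padding), as described in the report above.
func copyDepthValues(from depthMap: CVPixelBuffer) -> [Float32] {
    CVPixelBufferLockBaseAddress(depthMap, .readOnly)
    defer { CVPixelBufferUnlockBaseAddress(depthMap, .readOnly) }

    guard let baseAddress = CVPixelBufferGetBaseAddress(depthMap) else { return [] }

    let width = CVPixelBufferGetWidth(depthMap)
    let height = CVPixelBufferGetHeight(depthMap)
    let bytesPerRow = CVPixelBufferGetBytesPerRow(depthMap) // may be larger than width * 4

    var values = [Float32](repeating: 0, count: width * height)
    for y in 0..<height {
        // Advance by bytesPerRow, not by width * MemoryLayout<Float32>.size,
        // so any per-row padding is skipped.
        let rowPtr = baseAddress.advanced(by: y * bytesPerRow)
                                .assumingMemoryBound(to: Float32.self)
        for x in 0..<width {
            let depth = rowPtr[x]
            // Guard against NaN/infinite values before any UInt8/UInt16 conversion.
            values[y * width + x] = depth.isFinite ? depth : 0
        }
    }
    return values
}
```

The same advance-by-`bytesPerRow` pattern is what the fixed TIFF and PNG writers in the diff rely on.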