# Shell command (not Python): install or upgrade the `transformers` package.
# -q keeps pip's output quiet; -U upgrades an already-installed copy.
pip install -q -U transformers

from transformers import pipeline
import torch

# Checkpoint used by the high-level depth-estimation pipeline.
checkpoint = "depth-anything/Depth-Anything-V2-base-hf"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="depth-estimation",
    model=checkpoint,
    device=device,
)

from PIL import Image
import requests

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"

# Stream the download so PIL can read straight from the response body.
# The timeout keeps a stalled connection from hanging the script forever, and
# raise_for_status() surfaces HTTP errors (404, 5xx, ...) here instead of
# letting PIL fail later with an unrelated "cannot identify image" error.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Bare expression: displays the image when run as a notebook cell.
image

# Run the depth-estimation pipeline on the image loaded above.
predictions = pipe(image)

# Look up the "depth" entry of the result; as a bare expression this only
# displays something when run in a notebook/REPL.
# NOTE(review): per the transformers docs this entry is a PIL image of the
# depth map - confirm for the installed version.
predictions["depth"]

from transformers import AutoImageProcessor, AutoModelForDepthEstimation

# Manual inference: load the ZoeDepth checkpoint ourselves instead of going
# through the pipeline helper.
checkpoint = "Intel/zoedepth-nyu-kitti"

# Load the weights and move them to the selected device, then load the
# matching image processor for the same checkpoint.
model = AutoModelForDepthEstimation.from_pretrained(checkpoint)
model = model.to(device)
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

# Preprocess the image into a PyTorch tensor on the same device as the model.
pixel_values = image_processor(image, return_tensors="pt")["pixel_values"].to(device)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    outputs = model(pixel_values=pixel_values)

# ZoeDepth pads the input image dynamically, so the original image size is
# passed to `post_process_depth_estimation` to strip the padding and resize
# the prediction back to the original dimensions.
post_processed_output = image_processor.post_process_depth_estimation(
    outputs, source_sizes=[(image.height, image.width)]
)
predicted_depth = post_processed_output[0]["predicted_depth"]

# Min-max normalise the prediction to [0, 1], scale it to the 0-255 range,
# and wrap it in an 8-bit PIL image for visualisation.
depth = predicted_depth - predicted_depth.min()
depth = depth / (predicted_depth.max() - predicted_depth.min())
depth = (depth.detach().cpu().numpy() * 255).astype("uint8")
depth = Image.fromarray(depth)

# Flip-augmented inference: run the model on the image and on a mirrored copy,
# then let the image processor combine both predictions.
with torch.no_grad():
    outputs = model(pixel_values)
    # Flip the already-preprocessed tensor along its last axis (dim 3).
    # The original referenced an undefined `inputs` object here, which raised
    # NameError; the tensor produced by the image processor is `pixel_values`.
    # NOTE(review): dim 3 is the width axis assuming the usual
    # (batch, channels, height, width) layout - confirm.
    outputs_flipped = model(pixel_values=torch.flip(pixel_values, dims=[3]))

# Passing the original size strips ZoeDepth's padding and resizes the result;
# `outputs_flipped` hands the mirrored prediction to the post-processor.
post_processed_output = image_processor.post_process_depth_estimation(
    outputs,
    source_sizes=[(image.height, image.width)],
    outputs_flipped=outputs_flipped,
)

特征	单目深度估计	双目或多目深度估计
输入	单张图像	多张图像（通常从不同视角拍摄）
挑战	- 尺度模糊性：无法直接提供绝对深度信息 - 光照影响：光照变化影响估计准确性 - 遮挡和反射：遮挡物和反射表面干扰深度信息提取	- 设备复杂性：需要多个摄像头 - 计算复杂性：处理多张图像需要更多的计算资源
优势	- 设备简单：只需一个摄像头 - 适用范围广：适合资源受限的设备，如智能手机、无人机等	- 直接计算深度：通过视差直接计算深度，准确性高
工作原理	- 利用纹理、颜色、阴影、几何结构和上下文等视觉线索 - 通过学习和模型训练来估计深度	- 通过视差（不同视角下的位置差异）直接计算深度
应用	- 3D 重建 - 增强现实 - 自动驾驶辅助系统 - 机器人导航（资源受限环境）	- 自动驾驶汽车 - 机器人导航（高精度要求） - 工业检测和测量
典型模型	- Depth Anything V2 - ZoeDepth	- Stereo Matching - Structure from Motion (SfM)

单目与多目深度估计的区别

单目深度估计

深度估计管道

手动进行深度估计推断