图像检索:从 CBIR 到 CLIP
图像检索从 CBIR 到 CLIP1. 技术分析1.1 图像检索技术演进图像检索经历了从基于内容到深度学习的演进图像检索技术路线 CBIR: 基于内容的图像检索 CNN: 深度学习特征 CLIP: 跨模态检索1.2 检索方法对比方法类型准确率速度特点CBIR手工特征中快简单CNN深度学习高中特征学习CLIP跨模态很高中文本-图像1.3 图像检索流程图像检索流程 索引阶段: 提取特征 → 构建索引 查询阶段: 提取特征 → 相似度匹配 → 返回结果2. 核心功能实现2.1 CBIR 检索import cv2 import numpy as np from sklearn.metrics.pairwise import cosine_similarity class CBIRSystem: def __init__(self): self.features [] self.images [] def extract_features(self, image): gray cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) sift cv2.SIFT_create() _, descriptors sift.detectAndCompute(gray, None) if descriptors is not None: return descriptors.mean(axis0) else: return np.zeros(128) def index(self, images): self.images images self.features [self.extract_features(img) for img in images] def query(self, query_image, top_k5): query_features self.extract_features(query_image) similarities [] for i, features in enumerate(self.features): similarity cosine_similarity([query_features], [features])[0][0] similarities.append((i, similarity)) similarities.sort(keylambda x: x[1], reverseTrue) return [(self.images[i], similarity) for i, similarity in similarities[:top_k]]2.2 CNN 检索import torch import torch.nn as nn from torchvision import models class CNNRetrieval: def __init__(self, model_nameresnet50): self.model getattr(models, model_name)(pretrainedTrue) self.model nn.Sequential(*list(self.model.children())[:-1]) self.model.eval() self.features [] self.images [] def extract_features(self, image): image torch.tensor(image).permute(2, 0, 1).unsqueeze(0).float() with torch.no_grad(): features self.model(image) return features.squeeze().numpy() def index(self, images): self.images images self.features [self.extract_features(img) for img in images] def query(self, query_image, top_k5): query_features self.extract_features(query_image) similarities [] for i, features in enumerate(self.features): similarity np.dot(query_features, features) / (np.linalg.norm(query_features) * np.linalg.norm(features)) similarities.append((i, similarity)) similarities.sort(keylambda x: x[1], reverseTrue) return [(self.images[i], similarity) for i, similarity in similarities[:top_k]]2.3 CLIP 检索class CLIPRetrieval: def __init__(self): import clip self.device cuda if torch.cuda.is_available() else cpu self.model, self.preprocess clip.load(ViT-B/32, deviceself.device) self.image_features [] self.images [] def index(self, images): self.images images for image in images: image_tensor self.preprocess(image).unsqueeze(0).to(self.device) with torch.no_grad(): features self.model.encode_image(image_tensor) self.image_features.append(features.squeeze().cpu().numpy()) def query_image(self, query_image, top_k5): query_tensor self.preprocess(query_image).unsqueeze(0).to(self.device) with torch.no_grad(): query_features self.model.encode_image(query_tensor).cpu().numpy() similarities [] for i, features in enumerate(self.image_features): similarity np.dot(query_features[0], features) / (np.linalg.norm(query_features[0]) * np.linalg.norm(features)) similarities.append((i, similarity)) similarities.sort(keylambda x: x[1], reverseTrue) return [(self.images[i], similarity) for i, similarity in similarities[:top_k]] def query_text(self, text, top_k5): text_tokens clip.tokenize([text]).to(self.device) with torch.no_grad(): text_features self.model.encode_text(text_tokens).cpu().numpy() similarities [] for i, features in enumerate(self.image_features): similarity np.dot(text_features[0], features) / (np.linalg.norm(text_features[0]) * np.linalg.norm(features)) similarities.append((i, similarity)) similarities.sort(keylambda x: x[1], reverseTrue) return [(self.images[i], similarity) for i, similarity in similarities[:top_k]]3. 性能对比3.1 检索方法对比方法准确率(Top-1)索引时间(s)查询时间(ms)CBIR (SIFT)60%10100CNN (ResNet)85%6050CLIP95%120303.2 不同数据集表现数据集CBIRCNNCLIPOxford Flowers65%88%94%CIFAR-1070%92%96%ImageNet55%85%92%3.3 索引大小对比方法特征维度索引大小(GB)CBIR1280.1CNN20482.0CLIP5120.54. 最佳实践4.1 检索系统选择def select_retrieval_system(dataset_size, constraints): if dataset_size 1000: return CBIRSystem() elif constraints.get(text_query, False): return CLIPRetrieval() else: return CNNRetrieval() class RetrievalFactory: staticmethod def create(config): if config[type] cbir: return CBIRSystem() elif config[type] cnn: return CNNRetrieval(model_nameconfig.get(model_name, resnet50)) elif config[type] clip: return CLIPRetrieval()4.2 检索流程class ImageRetrievalPipeline: def __init__(self, retriever, indexerNone): self.retriever retriever self.indexer indexer def build_index(self, images): if self.indexer: self.indexer.build(images) self.retriever.index(images) def search(self, query, top_k5): if isinstance(query, str): return self.retriever.query_text(query, top_k) else: return self.retriever.query_image(query, top_k)5. 总结图像检索技术不断进步CBIR传统方法适合小规模数据集CNN深度学习方法效果好CLIP跨模态检索支持文本查询选择建议根据数据集大小和需求选择对比数据如下CLIP 在检索任务上表现最好CNN 是平衡效果和速度的好选择CBIR 适合快速原型开发推荐使用 CLIP 进行跨模态检索