基于python的图片去重清洗方案，专门针对相机连续采集的大量重复 / 相似图片

张

张建站

2026/7/15 6:01:36

10分钟阅读

基于python的图片去重清洗方案，专门针对相机连续采集的大量重复 / 相似图片

"""Image de-duplication and renaming tools for continuously captured camera frames.

Contents
    Part 1 - De-duplicate the near-identical images a camera produces when it
             captures continuously.
    Part 2 - Rename the images to zero-padded sequential names.

Part 1 overview
    * Reads every image in one folder.
    * Sorts by file name (suits sequentially captured frames).
    * De-duplicates adjacent frames and keeps one "representative frame".
    * Writes the kept images to a new folder, the duplicates to another
      folder (optional) and a CSV log.

Tuning the thresholds
    Case A - too aggressive (frames you wanted are removed, e.g. the target
    moved slightly and the frame was dropped): make the rules stricter, e.g.
    ``PHASH_THRESH = 6`` and ``SSIM_THRESH = 0.98``. A stricter pHash and a
    stricter SSIM make it harder for a frame to be judged a duplicate.

    Case B - not aggressive enough (many near-identical frames survive):
    loosen the rules, e.g. ``PHASH_THRESH = 10`` and ``SSIM_THRESH = 0.94``,
    so frames are judged duplicates more easily.

NOTE(review): the two parts each carry their own ``__main__`` guard, exactly as
in the original text; they look like two independent scripts - split the file
at the "Part 2" marker if they should be run separately.
"""

import os
import re
import shutil
import uuid
from pathlib import Path

# Third-party packages (cv2, imagehash, numpy, pandas, PIL, skimage) are
# imported inside the functions that need them, so the stdlib-only renaming
# tool in Part 2 stays usable when the imaging stack is not installed.


# =========================================================
# Part 1 - De-duplication
# =========================================================

# ---------------------------------------------------------
# 1. Configuration
# ---------------------------------------------------------
INPUT_DIR = r"D:\your_images"             # folder holding the original images
OUTPUT_KEEP_DIR = r"D:\your_images_keep"  # output folder for kept images
OUTPUT_DUP_DIR = r"D:\your_images_dup"    # output folder for duplicates (optional)

PHASH_THRESH = 8                 # pHash Hamming-distance threshold; smaller = stricter
SSIM_THRESH = 0.96               # SSIM threshold; larger = stricter
RESIZE_FOR_COMPARE = (256, 256)  # common size used for comparison (faster)
SAVE_DUPLICATES = True           # whether duplicates are copied to OUTPUT_DUP_DIR
LOG_CSV = "dedup_result.csv"     # de-duplication log


# ---------------------------------------------------------
# 2. Collect the image files
# ---------------------------------------------------------
def get_image_files(folder):
    """Return the image paths directly inside *folder*, sorted by file name.

    Only regular files whose suffix (case-insensitive) is one of
    .jpg/.jpeg/.png/.bmp/.tif/.tiff are returned.

    The sort is plain lexicographic on the name, so ``img10`` sorts before
    ``img2``; zero-pad the names (see Part 2) when frame order matters.
    """
    exts = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}
    files = [
        p for p in Path(folder).iterdir()
        # is_file() keeps directories that merely look like images out of the list.
        if p.is_file() and p.suffix.lower() in exts
    ]
    return sorted(files, key=lambda p: p.name)


# ---------------------------------------------------------
# 3. Compute the pHash
# ---------------------------------------------------------
def compute_phash(image_path):
    """Return the perceptual hash (pHash) of the image at *image_path*."""
    import imagehash
    from PIL import Image

    # The context manager closes the file handle once the hash is computed.
    with Image.open(image_path) as img:
        return imagehash.phash(img.convert("RGB"))


# ---------------------------------------------------------
# 4. Load as grayscale and resize
# ---------------------------------------------------------
def load_gray_resized(image_path, size=(256, 256)):
    """Load an image as grayscale and resize it to *size* for SSIM comparison.

    Returns:
        The resized grayscale array, or ``None`` when the file cannot be read
        or decoded.
    """
    import cv2
    import numpy as np

    # Read the bytes ourselves and decode from memory: cv2.imread is known to
    # fail on paths containing non-ASCII characters on Windows.
    try:
        raw = np.fromfile(str(image_path), dtype=np.uint8)
    except OSError:
        return None
    if raw.size == 0:
        return None

    img = cv2.imdecode(raw, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return None
    return cv2.resize(img, size, interpolation=cv2.INTER_AREA)


# ---------------------------------------------------------
# 5. SSIM of two images
# ---------------------------------------------------------
def compute_ssim(img1, img2):
    """Return the SSIM similarity of two grayscale images of equal size."""
    from skimage.metrics import structural_similarity

    return structural_similarity(img1, img2)


# ---------------------------------------------------------
# 6. Decide whether two images are duplicates
# ---------------------------------------------------------
def is_duplicate(curr_path, ref_hash, ref_img, phash_thresh=8, ssim_thresh=0.96):
    """Decide whether the image at *curr_path* duplicates the reference image.

    Rules:
        1. Compare the pHash values first.
        2. Only when the pHash values are close, compare SSIM.
        3. The image is a duplicate when both checks pass.

    Args:
        curr_path: path of the image being examined.
        ref_hash: pHash of the reference (last kept) image.
        ref_img: resized grayscale reference image, or ``None`` if unreadable.
        phash_thresh: maximum pHash distance still considered "close".
        ssim_thresh: minimum SSIM score considered a duplicate.

    Returns:
        Tuple ``(is_dup, hash_dist, ssim_score, curr_hash, curr_img)``.
        ``ssim_score`` is ``None`` whenever SSIM was not computed.
    """
    curr_hash = compute_phash(curr_path)
    hash_dist = ref_hash - curr_hash
    curr_img = load_gray_resized(curr_path, RESIZE_FOR_COMPARE)

    # SSIM needs both grayscale images; if either is missing the frame cannot
    # be confirmed as a duplicate, so it is kept.
    if curr_img is None or ref_img is None:
        return False, hash_dist, None, curr_hash, curr_img

    # Stage 1: coarse pHash filter.
    if hash_dist > phash_thresh:
        return False, hash_dist, None, curr_hash, curr_img

    # Stage 2: fine SSIM filter.
    ssim_score = compute_ssim(ref_img, curr_img)
    is_dup = bool(ssim_score >= ssim_thresh)
    return is_dup, hash_dist, ssim_score, curr_hash, curr_img


# ---------------------------------------------------------
# 7. Copy a file
# ---------------------------------------------------------
def copy_file(src, dst_folder):
    """Copy *src* (a ``Path``) into *dst_folder*, creating the folder if needed."""
    os.makedirs(dst_folder, exist_ok=True)
    shutil.copy2(str(src), str(Path(dst_folder) / src.name))


# ---------------------------------------------------------
# 8. Main de-duplication flow
# ---------------------------------------------------------
def deduplicate_images():
    """Run the de-duplication over ``INPUT_DIR``.

    Flow:
        1. Collect the image list.
        2. Keep the first processable image unconditionally.
        3. Compare every later image only with the last kept image.
        4. Drop duplicates (or copy them to ``OUTPUT_DUP_DIR``).
        5. Write the log to ``LOG_CSV``.

    A file that raises while being processed is logged with an ``error:``
    status and skipped; it never becomes the reference image.
    """
    import pandas as pd

    image_files = get_image_files(INPUT_DIR)
    if not image_files:
        print("❌ 输入文件夹中没有图片")
        return

    os.makedirs(OUTPUT_KEEP_DIR, exist_ok=True)
    if SAVE_DUPLICATES:
        os.makedirs(OUTPUT_DUP_DIR, exist_ok=True)

    logs = []
    keep_count = 0
    dup_count = 0
    ref_hash = None  # pHash of the last kept image; None until one is kept
    ref_img = None   # grayscale copy of the last kept image

    for curr_path in image_files:
        try:
            # -------------------------------------------------
            # No reference yet: keep this image as the first one
            # -------------------------------------------------
            if ref_hash is None:
                first_hash = compute_phash(curr_path)
                first_img = load_gray_resized(curr_path, RESIZE_FOR_COMPARE)
                copy_file(curr_path, OUTPUT_KEEP_DIR)
                # Only commit the reference after every step succeeded.
                ref_hash, ref_img = first_hash, first_img
                keep_count += 1
                logs.append({
                    "file": curr_path.name,
                    "status": "keep_first",
                    "phash_dist": 0,
                    "ssim": 1.0,
                })
                print(f"[KEEP] {curr_path.name} (首张保留)")
                continue

            # -------------------------------------------------
            # Compare with the last kept image
            # -------------------------------------------------
            is_dup, hash_dist, ssim_score, curr_hash, curr_img = is_duplicate(
                curr_path,
                ref_hash,
                ref_img,
                phash_thresh=PHASH_THRESH,
                ssim_thresh=SSIM_THRESH,
            )

            if is_dup:
                dup_count += 1
                if SAVE_DUPLICATES:
                    copy_file(curr_path, OUTPUT_DUP_DIR)
                logs.append({
                    "file": curr_path.name,
                    "status": "duplicate",
                    "phash_dist": hash_dist,
                    "ssim": ssim_score,
                })
                print(f"[DUP ] {curr_path.name} | pHash={hash_dist} | SSIM={ssim_score:.4f}")
            else:
                keep_count += 1
                copy_file(curr_path, OUTPUT_KEEP_DIR)
                logs.append({
                    "file": curr_path.name,
                    "status": "keep",
                    "phash_dist": hash_dist,
                    # -1 marks "SSIM not computed" in the CSV.
                    "ssim": ssim_score if ssim_score is not None else -1,
                })
                print(f"[KEEP] {curr_path.name} | pHash={hash_dist} | SSIM={ssim_score}")

                # The newly kept image becomes the reference.
                ref_hash = curr_hash
                ref_img = curr_img

        except Exception as e:
            # Best-effort batch job: record the failure and carry on.
            logs.append({
                "file": curr_path.name,
                "status": f"error: {e}",
                "phash_dist": -1,
                "ssim": -1,
            })
            print(f"[ERR ] {curr_path.name} -> {e}")

    # -----------------------------------------------------
    # Save the log
    # -----------------------------------------------------
    df = pd.DataFrame(logs)
    df.to_csv(LOG_CSV, index=False, encoding="utf-8-sig")

    print("\n")
    print(f"总图片数: {len(image_files)}")
    print(f"保留数量: {keep_count}")
    print(f"重复数量: {dup_count}")
    print(f"保留目录: {OUTPUT_KEEP_DIR}")
    if SAVE_DUPLICATES:
        print(f"重复目录: {OUTPUT_DUP_DIR}")
    print(f"日志文件: {LOG_CSV}")
    print()


# ---------------------------------------------------------
# 9. Entry point (Part 1)
# ---------------------------------------------------------
if __name__ == "__main__":
    deduplicate_images()


# =========================================================
# Part 2 - Rename and order the images
# =========================================================

# ---------------------------------------------------------
# 1. Configuration
# ---------------------------------------------------------
rootdir = r"D:\Cybory_Data\Drinks\0401\imges_clean_1"  # image folder
start_index = 0        # first number to assign
digits = 4             # zero-padded width: 4 -> 0000, 5 -> 00000
suffix = ".jpg"        # suffix given to the renamed files
sort_mode = "natural"  # either "natural" or "time"


# ---------------------------------------------------------
# 2. Natural sorting
# ---------------------------------------------------------
def natural_key(s):
    """Return a sort key that orders embedded numbers numerically.

    The string is split into alternating text / digit-run chunks; digit runs
    become ``int`` and text chunks are lower-cased, so ``img2`` sorts before
    ``img10``.
    """
    # isdecimal() matches exactly what the ``\d`` split isolates; isdigit()
    # would also accept characters such as "²" that int() cannot parse.
    return [
        int(text) if text.isdecimal() else text.lower()
        for text in re.split(r"(\d+)", s)
    ]


def get_sorted_files(folder, sort_mode="natural"):
    """Return the names of all regular files in *folder* in the requested order.

    Args:
        folder: folder path.
        sort_mode: ``"natural"`` sorts by file name in natural order;
            ``"time"`` sorts by file modification time.

    Raises:
        ValueError: if *sort_mode* is neither ``"natural"`` nor ``"time"``.
    """
    files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]

    if sort_mode == "natural":
        files.sort(key=natural_key)
    elif sort_mode == "time":
        files.sort(key=lambda name: os.path.getmtime(os.path.join(folder, name)))
    else:
        raise ValueError("sort_mode 只能是 'natural' 或 'time'")

    return files


# ---------------------------------------------------------
# 3. Renaming
# ---------------------------------------------------------
def rename_files_with_padding(folder, start_index=0, digits=4, suffix=".jpg", sort_mode="natural"):
    """Rename every file in *folder* to a fixed-width, zero-padded number.

    Args:
        folder: folder whose files are renamed in place.
        start_index: number given to the first file.
        digits: zero-padded width of the number.
        suffix: extension for the new names; pass ``None`` to keep each
            file's own extension. The file content is never converted.
        sort_mode: ordering passed to ``get_sorted_files``.
    """
    folder = Path(folder)
    files = get_sorted_files(folder, sort_mode=sort_mode)

    if not files:
        print("❌ 文件夹内没有文件")
        return

    print(f"原始文件顺序（排序方式: {sort_mode}）:")
    for i, f in enumerate(files):
        print(f"{i:04d}: {f}")

    # Step 1: move everything to temporary names so that a final name can
    # never collide with a file that has not been renamed yet. The random tag
    # keeps the temporary names from clobbering files already in the folder.
    temp_tag = f"__temp__{uuid.uuid4().hex}__"
    temp_files = []
    for i, old_name in enumerate(files):
        old_path = folder / old_name
        temp_name = f"{temp_tag}{i}{old_path.suffix}"
        os.rename(old_path, folder / temp_name)
        temp_files.append(temp_name)

    # Step 2: assign the final names.
    print("\n重命名结果")
    for i, temp_name in enumerate(temp_files):
        temp_path = folder / temp_name
        new_index = start_index + i
        new_suffix = temp_path.suffix if suffix is None else suffix
        new_name = f"{str(new_index).zfill(digits)}{new_suffix}"
        os.rename(temp_path, folder / new_name)
        print(f"{temp_name} --> {new_name}")

    print("\n✅ 重命名完成")


# ---------------------------------------------------------
# 4. Entry point (Part 2)
# ---------------------------------------------------------
if __name__ == "__main__":
    rename_files_with_padding(
        folder=rootdir,
        start_index=start_index,
        digits=digits,
        suffix=suffix,
        sort_mode=sort_mode,
    )

AssetStudio：Unity游戏资源全流程提取工具专业指南

AssetStudio：Unity游戏资源全流程提取工具专业指南【免费下载链接】AssetStudio AssetStudio - Based on the archived Perfare's AssetStudio, I continue Perfare's work to keep AssetStudio up-to-date, with support for new Unity versions and additional imp…...

2026/7/11 7:52:12 阅读更多 →

Cursor AI Pro免费解锁完整指南：如何简单快速绕过限制实现终身高级功能

Cursor AI Pro免费解锁完整指南：如何简单快速绕过限制实现终身高级功能【免费下载链接】cursor-free-vip [Support 0.45]（Multi Language 多语言）自动注册 Cursor Ai ，自动重置机器ID ， 免费升级使用Pro 功能: Youve …...

2026/7/10 6:10:40 阅读更多 →

通义千问轻量模型部署避坑指南：解决只读文件系统等常见问题

通义千问轻量模型部署避坑指南：解决只读文件系统等常见问题 1. 部署前的准备工作在开始部署通义千问1.5-1.8B-Chat-GPTQ-Int4模型前，我们需要做好以下准备工作： 1.1 硬件与系统要求 GPU：至少4GB显存（NVIDIA显卡&a…...

2026/7/8 23:41:11 阅读更多 →

Go 微服务 API 版本管理：URL、Header 和 GraphQL 的演进策略

Go 微服务 API 版本管理：URL、Header 和 GraphQL 的演进策略一、改了 API 格式，App 没升级的用户全部崩溃移动端 App 的升级率是长期问题。API v1 发布半年后，仍有 15% 的用户在用 v1.0.0 版本。如果直接上线 v2 API 并下线 v1。这 15% 的…...

2026/7/14 7:50:03 阅读更多 →

一键解决DLL缺失问题：Visual C++运行库全家桶完整指南

一键解决DLL缺失问题：Visual C++运行库全家桶完整指南【免费下载链接】vcredist AIO Repack for latest Microsoft Visual C++ Redistributable Runtimes 项目地址: https://gitcode.com/gh_mirrors/vc/vcredist 还在为"应用程序无法启动"、"缺少…...

2026/7/14 11:39:15 阅读更多 →