# Sweep crawler output directories and delete every file that is not an
# intact JPEG: files whose header is not recognised as an image, and files
# that lack the JPEG start (FF D8) or end (FF D9) marker.
#
# NOTE(review): the original header described hash-based duplicate removal.
# check_same_image() computes the digest such a pass would need, but nothing
# in this script calls it yet.
import os
import hashlib

try:
    import imghdr  # deprecated since Python 3.11, removed in 3.13
except ImportError:
    imghdr = None

# Running total of files removed by action(), accumulated across calls.
deleted = 0

_JPEG_START = b"\xff\xd8"
_JPEG_END = b"\xff\xd9"


def check_same_image(file_path: str) -> str:
    """Return the SHA-256 hex digest of the file at *file_path*.

    Despite the name, this does not compare anything; it only produces the
    digest, reading the file in 4 KiB chunks so large files are not loaded
    into memory at once.
    """
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()


def _image_type(file_path: str):
    """Return an image type name for *file_path*, or None if unrecognised.

    Uses ``imghdr.what`` when the module is available. Otherwise falls back
    to the JPEG header test only, which is enough here because action()
    deletes every non-JPEG file regardless of what this returns.
    """
    if imghdr is not None:
        return imghdr.what(file_path)
    with open(file_path, "rb") as f:
        header = f.read(32)
    # Same header patterns imghdr accepted as JPEG (JFIF/Exif tag, raw JPEG).
    if header[6:10] in (b"JFIF", b"Exif") or header[:4] == b"\xff\xd8\xff\xdb":
        return "jpeg"
    return None


def _has_jpeg_markers(file_path: str) -> bool:
    """Return True if the file starts with FF D8 and ends with FF D9."""
    with open(file_path, "rb") as f:
        if f.read(2) != _JPEG_START:
            return False
        # Two bytes were just read, so the file is at least two bytes long
        # and seeking two bytes back from the end cannot fail.
        f.seek(-2, os.SEEK_END)
        return f.read(2) == _JPEG_END


def action(in_path: str) -> int:
    """Delete every file under *in_path* that is not an intact JPEG.

    Walks *in_path* recursively. A file is removed when its header is not
    recognised as an image (its path is printed with "is not image"), or
    when it lacks the JPEG start/end markers (removed silently).

    Each removal also increments the module-level ``deleted`` counter.

    Returns:
        The number of files removed by this call.

    Raises:
        OSError: if a file cannot be opened or removed.
    """
    global deleted
    removed = 0
    for root, _, files in os.walk(in_path):
        for file in files:
            file_path = os.path.join(root, file)
            if _image_type(file_path) is None:
                print(file_path, "is not image")
            elif _has_jpeg_markers(file_path):
                continue  # intact JPEG: keep it
            os.remove(file_path)
            removed += 1
            deleted += 1
    return removed


if __name__ == "__main__":
    action("/home/logic/_workspace/task-list/servers/logic-NUC8i5BEH/opencv-workdesk/001_monitor/src/003-crawler-mountain/output")
    action("/home/logic/_workspace/task-list/servers/logic-NUC8i5BEH/opencv-workdesk/001_monitor/src/003-crawler-mountain/output_mountain")
    print("deleted:" + str(deleted))