update,
This commit is contained in:
41
vinniesniper-54816/task1/_tools/crawer/check_valid_image.py
Normal file
41
vinniesniper-54816/task1/_tools/crawer/check_valid_image.py
Normal file
@@ -0,0 +1,41 @@
|
||||
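# Walk the crawler output directories (including subdirectories) and delete every file that is
# not a recognised image or not a complete JPEG (start and end markers present).
# `check_same_image` returns a file's SHA-256 digest for hash-based duplicate detection,
# but nothing in this file calls it yet.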
|
||||
import os
|
||||
import hashlib
|
||||
import imghdr
|
||||
|
||||
# Module-level count of deleted files; printed as "deleted:<n>" at the end of the script.
deleted = 0
|
||||
|
||||
|
||||
def check_same_image(file_path):
    """Return the hexadecimal SHA-256 digest of the file at *file_path*.

    The file is opened in binary mode and fed to the hash in 4096-byte
    reads, so it is never held in memory as a whole. Two files with
    byte-identical contents produce the same digest.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The digest as a 64-character lowercase hex string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while True:
            chunk = stream.read(4096)
            if not chunk:
                # An empty read means end of file.
                break
            digest.update(chunk)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def _is_complete_jpeg(file_path):
    """Return True if the file starts with the JPEG SOI marker and ends with EOI."""
    with open(file_path, "rb") as f:
        if f.read(2) != b"\xff\xd8":
            return False
        f.seek(0, os.SEEK_END)
        if f.tell() < 4:
            # Too short to hold separate start and end markers.
            return False
        f.seek(-2, os.SEEK_END)
        return f.read(2) == b"\xff\xd9"


def action(in_path):
    """Delete every file under *in_path* that is not a complete JPEG image.

    The directory tree is walked recursively. A file is removed when:

    * ``imghdr.what`` does not recognise it as an image (its path is
      printed followed by ``is not image``), or
    * it is recognised as an image but does not both start with the bytes
      ``FF D8`` and end with ``FF D9``. This covers truncated JPEGs and
      also images in other formats, which are removed without a message.

    Each removal is added to the module-level ``deleted`` counter so the
    total printed by the script reflects what was actually removed.

    Args:
        in_path: Root directory to clean.

    Returns:
        The number of files removed by this call.

    Raises:
        OSError: If a file cannot be read or removed.
    """
    global deleted

    removed = 0
    for root, _, files in os.walk(in_path):
        for file in files:
            file_path = os.path.join(root, file)

            if imghdr.what(file_path) is None:
                print(file_path, "is not image")
            elif _is_complete_jpeg(file_path):
                # Complete JPEG: keep it.
                continue

            os.remove(file_path)
            removed += 1

    # Tolerate a missing module-level counter so the function also works
    # when it is imported and called outside of this script.
    deleted = globals().get("deleted", 0) + removed
    return removed
|
||||
|
||||
|
||||
# Clean both crawler output directories in turn, then report the value of
# the module-level deletion counter.
for output_dir in (
    "/home/logic/_workspace/task-list/servers/logic-NUC8i5BEH/opencv-workdesk/001_monitor/src/003-crawler-mountain/output",
    "/home/logic/_workspace/task-list/servers/logic-NUC8i5BEH/opencv-workdesk/001_monitor/src/003-crawler-mountain/output_mountain",
):
    action(output_dir)

print(f"deleted:{deleted}")
|
Reference in New Issue
Block a user