louiscklaw
2025-01-31 22:36:21 +08:00
parent f4fc0b8f71
commit c7fb335275
1399 changed files with 5714 additions and 0 deletions

View File

@@ -0,0 +1,41 @@
# Walk the given directories and delete files that are not images or are truncated/invalid JPEGs.
import os
import hashlib
import imghdr  # note: imghdr is deprecated since Python 3.11 and removed in 3.13

deleted = 0


def check_same_image(file_path):
    # SHA-256 digest of a file; kept for the hash-based duplicate cleaner, unused in this script
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()


def action(in_path):
    global deleted
    for root, _, files in os.walk(in_path):
        for file in files:
            file_path = os.path.join(root, file)
            if imghdr.what(file_path) is None:
                print(file_path, "is not image")
                os.remove(file_path)
                deleted += 1
                continue
            with open(file_path, "rb") as f:
                contents = f.read()
            if contents.startswith(b"\xff\xd8") and contents.endswith(b"\xff\xd9"):
                # complete JPEG (SOI and EOI markers present)
                pass
            else:
                # print(file_path, "is not valid jpeg")
                os.remove(file_path)
                deleted += 1
action("/home/logic/_workspace/task-list/servers/logic-NUC8i5BEH/opencv-workdesk/001_monitor/src/003-crawler-mountain/output")
action("/home/logic/_workspace/task-list/servers/logic-NUC8i5BEH/opencv-workdesk/001_monitor/src/003-crawler-mountain/output_mountain")
print("deleted:" + str(deleted))

View File

@@ -0,0 +1,89 @@
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import FlickrImageCrawler
from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import GoogleImageCrawler
from icrawler.builtin import GreedyImageCrawler
google = "google"
bing = "bing"
baidu = "baidu"
greedy = "greedy"
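# NOTE: FlickrImageCrawler and GreedyImageCrawler are imported but never used in this script;
# GreedyImageCrawler crawls whole domains rather than keyword searches, so crawler() below has no branch for it.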
# Search keywords for the "african people" image class.
# To crawl a different subject, edit the entries in the classes list.
classes = [
"african people",
"african people over the world",
"african people in hong kong",
"african people in china",
"african people in england",
"african people in us",
"african people in australia",
"african people in brazil",
"african people in india",
"african people in japan",
"african people in russia",
"african people in south africa",
"african people in argentina",
"african people in mexico",
"african people in italy",
"african people in france",
"african people in spain",
"african people in germany",
"african people in thailand",
"african people in vietnam",
"african people in indonesia",
"african people in philippines",
"african people in malaysia",
"african people in singapore",
"african people in egypt",
"african people in turkey",
"african people in greece",
"african people in portugal",
"african people in netherlands",
"african people in belgium",
"african people in sweden",
"african people in norway",
"african people in denmark",
"african people in finland",
"african people in poland",
"african people in ukraine",
]
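# The per-country phrases are presumably there to broaden the search; duplicate downloads are
# removed later by the hash-based cleanup script included in this commit.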
number = 99999
# Images for each (engine, keyword) pair are saved under p/african/<engine>_<keyword_with_underscores>/.
import random
from multiprocessing import Pool


def crawler(s_c):
    search_engine = s_c[0]
    c = s_c[1]
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/african/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/african/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/african/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)


# build one task per (engine, keyword) combination and crawl them in parallel
process_list = []
for search_engine in [google, bing, baidu]:
    for c in classes:
        process_list.append([search_engine, c])

random.shuffle(process_list)
with Pool() as pool:
    pool.map(crawler, process_list)

View File

@@ -0,0 +1,85 @@
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import FlickrImageCrawler
from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import GoogleImageCrawler
from icrawler.builtin import GreedyImageCrawler
google = "google"
bing = "bing"
baidu = "baidu"
greedy = "greedy"
# Search keywords for the "beach" image class.
# To crawl a different subject, edit the entries in the classes list.
classes = [
"beach",
"beach over the world",
"beach in hong kong",
"beach in china",
"beach in england",
"beach in us",
"beach in australia",
"beach in brazil",
"beach in india",
"beach in japan",
"beach in russia",
"beach in south africa",
"beach in argentina",
"beach in mexico",
"beach in italy",
"beach in france",
"beach in spain",
"beach in germany",
"beach in thailand",
"beach in vietnam",
"beach in indonesia",
"beach in philippines",
"beach in malaysia",
"beach in singapore",
"beach in egypt",
"beach in turkey",
"beach in greece",
"beach in portugal",
"beach in netherlands",
"beach in belgium",
"beach in sweden",
"beach in norway",
"beach in denmark",
"beach in finland",
"beach in poland",
"beach in ukraine",
]
number = 99999
# Images for each (engine, keyword) pair are saved under p/beach/<engine>_<keyword_with_underscores>/.
import random
from multiprocessing import Pool


def crawler(s_c):
    search_engine = s_c[0]
    c = s_c[1]
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/beach/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/beach/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/beach/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)


# build one task per (engine, keyword) combination and crawl them in parallel
process_list = []
for search_engine in [google, bing, baidu]:  # greedy has no branch in crawler(), so it is left out
    for c in classes:
        process_list.append([search_engine, c])

random.shuffle(process_list)
with Pool() as pool:
    pool.map(crawler, process_list)

View File

@@ -0,0 +1,84 @@
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import FlickrImageCrawler
from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import GoogleImageCrawler
from icrawler.builtin import GreedyImageCrawler
google = "google"
bing = "bing"
baidu = "baidu"
greedy = "greedy"
# Search keywords for the "building" image class.
# To crawl a different subject, edit the entries in the classes list.
classes = [
"building",
"building over the world",
"building in hong kong",
"building in china",
"building in england",
"building in us",
"building in australia",
"building in brazil",
"building in india",
"building in japan",
"building in russia",
"building in south africa",
"building in argentina",
"building in mexico",
"building in italy",
"building in france",
"building in spain",
"building in germany",
"building in thailand",
"building in vietnam",
"building in indonesia",
"building in philippines",
"building in malaysia",
"building in singapore",
"building in egypt",
"building in turkey",
"building in greece",
"building in portugal",
"building in netherlands",
"building in belgium",
"building in sweden",
"building in norway",
"building in denmark",
"building in finland",
"building in poland",
"building in ukraine",
]
number = 99999
# Images for each (engine, keyword) pair are saved under p/building/<engine>_<keyword_with_underscores>/.
import random
from multiprocessing import Pool


def crawler(s_c):
    search_engine = s_c[0]
    c = s_c[1]
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/building/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/building/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/building/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)


# build one task per (engine, keyword) combination and crawl them in parallel
process_list = []
for search_engine in [google, bing, baidu]:  # greedy has no branch in crawler(), so it is left out
    for c in classes:
        process_list.append([search_engine, c])

random.shuffle(process_list)
with Pool() as pool:
    pool.map(crawler, process_list)

View File

@@ -0,0 +1,85 @@
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import FlickrImageCrawler
from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import GoogleImageCrawler
from icrawler.builtin import GreedyImageCrawler
google = "google"
bing = "bing"
baidu = "baidu"
greedy = "greedy"
# Search keywords for the "bus" image class.
# To crawl a different subject, edit the entries in the classes list.
classes = [
"bus",
"bus over the world",
"bus in hong kong",
"bus in china",
"bus in england",
"bus in us",
"bus in australia",
"bus in brazil",
"bus in india",
"bus in japan",
"bus in russia",
"bus in south africa",
"bus in argentina",
"bus in mexico",
"bus in italy",
"bus in france",
"bus in spain",
"bus in germany",
"bus in thailand",
"bus in vietnam",
"bus in indonesia",
"bus in philippines",
"bus in malaysia",
"bus in singapore",
"bus in egypt",
"bus in turkey",
"bus in greece",
"bus in portugal",
"bus in netherlands",
"bus in belgium",
"bus in sweden",
"bus in norway",
"bus in denmark",
"bus in finland",
"bus in poland",
"bus in ukraine",
]
number = 99999
# Images for each (engine, keyword) pair are saved under p/bus/<engine>_<keyword_with_underscores>/.
import random
from multiprocessing import Pool


def crawler(s_c):
    search_engine = s_c[0]
    c = s_c[1]
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/bus/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/bus/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/bus/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)


# build one task per (engine, keyword) combination and crawl them in parallel
process_list = []
for search_engine in [google, bing, baidu]:  # greedy has no branch in crawler(), so it is left out
    for c in classes:
        process_list.append([search_engine, c])

random.shuffle(process_list)
with Pool() as pool:
    pool.map(crawler, process_list)

View File

@@ -0,0 +1,85 @@
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import FlickrImageCrawler
from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import GoogleImageCrawler
from icrawler.builtin import GreedyImageCrawler
google = "google"
bing = "bing"
baidu = "baidu"
greedy = "greedy"
# Search keywords for the "dinosaur" image class.
# To crawl a different subject, edit the entries in the classes list.
classes = [
"dinosaur",
"dinosaur over the world",
"dinosaur in hong kong",
"dinosaur in china",
"dinosaur in england",
"dinosaur in us",
"dinosaur in australia",
"dinosaur in brazil",
"dinosaur in india",
"dinosaur in japan",
"dinosaur in russia",
"dinosaur in south africa",
"dinosaur in argentina",
"dinosaur in mexico",
"dinosaur in italy",
"dinosaur in france",
"dinosaur in spain",
"dinosaur in germany",
"dinosaur in thailand",
"dinosaur in vietnam",
"dinosaur in indonesia",
"dinosaur in philippines",
"dinosaur in malaysia",
"dinosaur in singapore",
"dinosaur in egypt",
"dinosaur in turkey",
"dinosaur in greece",
"dinosaur in portugal",
"dinosaur in netherlands",
"dinosaur in belgium",
"dinosaur in sweden",
"dinosaur in norway",
"dinosaur in denmark",
"dinosaur in finland",
"dinosaur in poland",
"dinosaur in ukraine",
]
number = 99999
# Images for each (engine, keyword) pair are saved under p/dinosaur/<engine>_<keyword_with_underscores>/.
import random
from multiprocessing import Pool


def crawler(s_c):
    search_engine = s_c[0]
    c = s_c[1]
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/dinosaur/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/dinosaur/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/dinosaur/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)


# build one task per (engine, keyword) combination and crawl them in parallel
process_list = []
for search_engine in [google, bing, baidu]:  # greedy has no branch in crawler(), so it is left out
    for c in classes:
        process_list.append([search_engine, c])

random.shuffle(process_list)
with Pool() as pool:
    pool.map(crawler, process_list)

View File

@@ -0,0 +1,84 @@
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import FlickrImageCrawler
from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import GoogleImageCrawler
from icrawler.builtin import GreedyImageCrawler
google = "google"
bing = "bing"
baidu = "baidu"
greedy = "greedy"
# Search keywords for the "dish" image class.
# To crawl a different subject, edit the entries in the classes list.
classes = [
"dish",
"dish over the world",
"dish in hong kong",
"dish in china",
"dish in england",
"dish in us",
"dish in australia",
"dish in brazil",
"dish in india",
"dish in japan",
"dish in russia",
"dish in south africa",
"dish in argentina",
"dish in mexico",
"dish in italy",
"dish in france",
"dish in spain",
"dish in germany",
"dish in thailand",
"dish in vietnam",
"dish in indonesia",
"dish in philippines",
"dish in malaysia",
"dish in singapore",
"dish in egypt",
"dish in turkey",
"dish in greece",
"dish in portugal",
"dish in netherlands",
"dish in belgium",
"dish in sweden",
"dish in norway",
"dish in denmark",
"dish in finland",
"dish in poland",
"dish in ukraine",
]
number = 99999
# Images for each (engine, keyword) pair are saved under p/dish/<engine>_<keyword_with_underscores>/.
import random
from multiprocessing import Pool


def crawler(s_c):
    search_engine = s_c[0]
    c = s_c[1]
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/dish/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/dish/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/dish/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)


# build one task per (engine, keyword) combination and crawl them in parallel
process_list = []
for search_engine in [google, bing, baidu]:  # greedy has no branch in crawler(), so it is left out
    for c in classes:
        process_list.append([search_engine, c])

random.shuffle(process_list)
with Pool() as pool:
    pool.map(crawler, process_list)

View File

@@ -0,0 +1,52 @@
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import FlickrImageCrawler
# Search keywords for the "elephant" image class.
# To crawl a different subject, edit the entries in the classes list.
classes = [
"elephant",
"elephant over the world",
"elephant in hong kong",
"elephant in china",
"elephant in england",
"elephant in us",
"elephant in australia",
"elephant in brazil",
"elephant in india",
"elephant in japan",
"elephant in russia",
"elephant in south africa",
"elephant in argentina",
"elephant in mexico",
"elephant in italy",
"elephant in france",
"elephant in spain",
"elephant in germany",
"elephant in thailand",
"elephant in vietnam",
"elephant in indonesia",
"elephant in philippines",
"elephant in malaysia",
"elephant in singapore",
"elephant in egypt",
"elephant in turkey",
"elephant in greece",
"elephant in portugal",
"elephant in netherlands",
"elephant in belgium",
"elephant in sweden",
"elephant in norway",
"elephant in denmark",
"elephant in finland",
"elephant in poland",
"elephant in ukraine",
]
number = 99999
# Each keyword is crawled into p/elephant/<keyword_with_underscores>/ using Bing only.
# The outer loop just re-runs the same crawl 99 times.
for i in range(99):
    for c in classes:
        bing_crawler = BingImageCrawler(storage={"root_dir": f'p/elephant/{c.replace(" ","_")}'})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)

View File

@@ -0,0 +1,65 @@
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import FlickrImageCrawler
from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import GoogleImageCrawler
# Search keywords for the "elephant" image class.
# To crawl a different subject, edit the entries in the classes list.
classes = [
"elephant",
"elephant over the world",
"elephant in hong kong",
"elephant in china",
"elephant in england",
"elephant in us",
"elephant in australia",
"elephant in brazil",
"elephant in india",
"elephant in japan",
"elephant in russia",
"elephant in south africa",
"elephant in argentina",
"elephant in mexico",
"elephant in italy",
"elephant in france",
"elephant in spain",
"elephant in germany",
"elephant in thailand",
"elephant in vietnam",
"elephant in indonesia",
"elephant in philippines",
"elephant in malaysia",
"elephant in singapore",
"elephant in egypt",
"elephant in turkey",
"elephant in greece",
"elephant in portugal",
"elephant in netherlands",
"elephant in belgium",
"elephant in sweden",
"elephant in norway",
"elephant in denmark",
"elephant in finland",
"elephant in poland",
"elephant in ukraine",
]
number = 99999
# Crawl with Baidu first, then Google; `number` above is only referenced by the commented-out Bing crawl.
# All keywords share one root_dir per engine, so files from later keywords may overwrite earlier downloads.
for c in classes:
    # bing_crawler = BingImageCrawler(storage={"root_dir": f'p/elephant/{c.replace(" ","_")}'})
    # bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    # flickr_crawler = FlickrImageCrawler(storage={"root_dir": f"p/elephant/flickr"})
    # flickr_crawler.crawl(keyword=c, max_num=10000, min_size=(200, 200), max_size=None)
    baidu_crawler = BaiduImageCrawler(storage={"root_dir": "p/elephant/baidu"})
    baidu_crawler.crawl(keyword=c, max_num=10000, min_size=(200, 200), max_size=None)

for c in classes:
    google_crawler = GoogleImageCrawler(storage={"root_dir": "p/elephant/google"})
    google_crawler.crawl(keyword=c, max_num=10000, min_size=(200, 200), max_size=None)

View File

@@ -0,0 +1,85 @@
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import FlickrImageCrawler
from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import GoogleImageCrawler
from icrawler.builtin import GreedyImageCrawler
google = "google"
bing = "bing"
baidu = "baidu"
greedy = "greedy"
# Search keywords for the "elephant" image class.
# To crawl a different subject, edit the entries in the classes list.
classes = [
"elephant",
"elephant over the world",
"elephant in hong kong",
"elephant in china",
"elephant in england",
"elephant in us",
"elephant in australia",
"elephant in brazil",
"elephant in india",
"elephant in japan",
"elephant in russia",
"elephant in south africa",
"elephant in argentina",
"elephant in mexico",
"elephant in italy",
"elephant in france",
"elephant in spain",
"elephant in germany",
"elephant in thailand",
"elephant in vietnam",
"elephant in indonesia",
"elephant in philippines",
"elephant in malaysia",
"elephant in singapore",
"elephant in egypt",
"elephant in turkey",
"elephant in greece",
"elephant in portugal",
"elephant in netherlands",
"elephant in belgium",
"elephant in sweden",
"elephant in norway",
"elephant in denmark",
"elephant in finland",
"elephant in poland",
"elephant in ukraine",
]
number = 99999
# Images for each (engine, keyword) pair are saved under p/elephant/<engine>_<keyword_with_underscores>/.
import random
from multiprocessing import Pool


def crawler(s_c):
    search_engine = s_c[0]
    c = s_c[1]
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/elephant/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/elephant/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/elephant/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)


# build one task per (engine, keyword) combination and crawl them in parallel
process_list = []
for search_engine in [google, bing, baidu]:  # greedy has no branch in crawler(), so it is left out
    for c in classes:
        process_list.append([search_engine, c])

random.shuffle(process_list)
with Pool() as pool:
    pool.map(crawler, process_list)

View File

@@ -0,0 +1,85 @@
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import FlickrImageCrawler
from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import GoogleImageCrawler
from icrawler.builtin import GreedyImageCrawler
google = "google"
bing = "bing"
baidu = "baidu"
greedy = "greedy"
# Search keywords for the "flower" image class.
# To crawl a different subject, edit the entries in the classes list.
classes = [
"flower",
"flower over the world",
"flower in hong kong",
"flower in china",
"flower in england",
"flower in us",
"flower in australia",
"flower in brazil",
"flower in india",
"flower in japan",
"flower in russia",
"flower in south africa",
"flower in argentina",
"flower in mexico",
"flower in italy",
"flower in france",
"flower in spain",
"flower in germany",
"flower in thailand",
"flower in vietnam",
"flower in indonesia",
"flower in philippines",
"flower in malaysia",
"flower in singapore",
"flower in egypt",
"flower in turkey",
"flower in greece",
"flower in portugal",
"flower in netherlands",
"flower in belgium",
"flower in sweden",
"flower in norway",
"flower in denmark",
"flower in finland",
"flower in poland",
"flower in ukraine",
]
number = 99999
# Images for each (engine, keyword) pair are saved under p/flower/<engine>_<keyword_with_underscores>/.
import random
from multiprocessing import Pool


def crawler(s_c):
    search_engine = s_c[0]
    c = s_c[1]
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/flower/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/flower/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/flower/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)


# build one task per (engine, keyword) combination and crawl them in parallel
process_list = []
for search_engine in [google, bing, baidu]:  # greedy has no branch in crawler(), so it is left out
    for c in classes:
        process_list.append([search_engine, c])

random.shuffle(process_list)
with Pool() as pool:
    pool.map(crawler, process_list)

View File

@@ -0,0 +1,86 @@
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import FlickrImageCrawler
from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import GoogleImageCrawler
from icrawler.builtin import GreedyImageCrawler
google = "google"
bing = "bing"
baidu = "baidu"
greedy = "greedy"
# Search keywords for the "horse" image class.
# To crawl a different subject, edit the entries in the classes list.
classes = [
"horse",
"horse over the world",
"horse in hong kong",
"horse in china",
"horse in england",
"horse in us",
"horse in australia",
"horse in brazil",
"horse in india",
"horse in japan",
"horse in russia",
"horse in south africa",
"horse in argentina",
"horse in mexico",
"horse in italy",
"horse in france",
"horse in spain",
"horse in germany",
"horse in thailand",
"horse in vietnam",
"horse in indonesia",
"horse in philippines",
"horse in malaysia",
"horse in singapore",
"horse in egypt",
"horse in turkey",
"horse in greece",
"horse in portugal",
"horse in netherlands",
"horse in belgium",
"horse in sweden",
"horse in norway",
"horse in denmark",
"horse in finland",
"horse in poland",
"horse in ukraine",
]
number = 99999
# Images for each (engine, keyword) pair are saved under p/horse/<engine>_<keyword_with_underscores>/.
import random
from multiprocessing import Pool


def crawler(s_c):
    search_engine = s_c[0]
    c = s_c[1]
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/horse/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/horse/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/horse/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)


# build one task per (engine, keyword) combination and crawl them in parallel
process_list = []
for search_engine in [google, bing, baidu]:  # greedy has no branch in crawler(), so it is left out
    for c in classes:
        process_list.append([search_engine, c])

random.shuffle(process_list)
with Pool() as pool:
    pool.map(crawler, process_list)

View File

@@ -0,0 +1,83 @@
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import FlickrImageCrawler
from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import GoogleImageCrawler
from icrawler.builtin import GreedyImageCrawler
google = "google"
bing = "bing"
baidu = "baidu"
greedy = "greedy"
# Search keywords for the "mountain" image class.
# To crawl a different subject, edit the entries in the classes list.
classes = [
"mountain",
"mountain over the world",
"mountain in hong kong",
"mountain in china",
"mountain in england",
"mountain in us",
"mountain in australia",
"mountain in brazil",
"mountain in india",
"mountain in japan",
"mountain in russia",
"mountain in south africa",
"mountain in argentina",
"mountain in mexico",
"mountain in italy",
"mountain in france",
"mountain in spain",
"mountain in germany",
"mountain in thailand",
"mountain in vietnam",
"mountain in indonesia",
"mountain in philippines",
"mountain in malaysia",
"mountain in singapore",
"mountain in egypt",
"mountain in turkey",
"mountain in greece",
"mountain in portugal",
"mountain in netherlands",
"mountain in belgium",
"mountain in sweden",
"mountain in norway",
"mountain in denmark",
"mountain in finland",
]
number = 99999
# Images for each (engine, keyword) pair are saved under p/mountain/<engine>_<keyword_with_underscores>/.
import random
from multiprocessing import Pool


def crawler(s_c):
    search_engine = s_c[0]
    c = s_c[1]
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/mountain/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/mountain/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/mountain/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)


# build one task per (engine, keyword) combination and crawl them in parallel
process_list = []
for search_engine in [google, bing, baidu]:  # greedy has no branch in crawler(), so it is left out
    for c in classes:
        process_list.append([search_engine, c])

random.shuffle(process_list)
with Pool() as pool:
    pool.map(crawler, process_list)

View File

@@ -0,0 +1,9 @@
from icrawler.builtin import BingImageCrawler
classes = ["trees", "roads", "Human faces"]
number = 100
for c in classes:
    # n/ holds the negative (non-target) training images
    bing_crawler = BingImageCrawler(storage={"root_dir": f'n/{c.replace(" ","_")}'})
    bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)

View File

@@ -0,0 +1,54 @@
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import FlickrImageCrawler
from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import GoogleImageCrawler
from icrawler.builtin import GreedyImageCrawler
google = "google"
bing = "bing"
baidu = "baidu"
greedy = "greedy"
# Search keywords for the "patient bed" image class.
# To crawl a different subject, edit the entries in the classes list.
classes = [
"patient bed",
"patient bed in hong kong",
"patient bed in china",
"patient bed in japan",
"patient bed in taiwan",
]
number = 99999
# Images for each (engine, keyword) pair are saved under p/bus/<engine>_<keyword_with_underscores>/.
# NOTE: the root_dir still points at p/bus (apparently copied from the bus crawler); change it
# if the patient-bed images should land in their own class folder.
import random
from multiprocessing import Pool


def crawler(s_c):
    search_engine = s_c[0]
    c = s_c[1]
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/bus/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/bus/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/bus/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)


# build one task per (engine, keyword) combination and crawl them in parallel
process_list = []
for search_engine in [google, bing, baidu]:  # greedy has no branch in crawler(), so it is left out
    for c in classes:
        process_list.append([search_engine, c])

random.shuffle(process_list)
with Pool() as pool:
    pool.map(crawler, process_list)

View File

@@ -0,0 +1,30 @@
# Walk the given directories, hash every file, and delete any file whose hash was already seen.
import os
import hashlib


def check_same_image(file_path):
    # SHA-256 digest of the file contents, read in 8 KiB blocks
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        for byte_block in iter(lambda: f.read(8192), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()


def action(in_path):
    img_hash = {}
    for root, _, files in os.walk(in_path):
        for file in files:
            file_path = os.path.join(root, file)
            digest = check_same_image(file_path)
            if digest in img_hash:
                os.remove(file_path)
                print(file_path, "duplicate removed, first seen at", img_hash[digest])
            else:
                img_hash[digest] = file_path
action("/home/logic/_workspace/task-list/servers/logic-NUC8i5BEH/opencv-workdesk/001_monitor/src/003-crawler-mountain/output")
action("/home/logic/_workspace/task-list/servers/logic-NUC8i5BEH/opencv-workdesk/001_monitor/src/003-crawler-mountain/output_mountain")

View File

@@ -0,0 +1,105 @@
import glob
import os
import shutil
import cv2
def merge_images(input_folder, output_folder):
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)
    count = 0
    for root, _, filenames in os.walk(input_folder):
        for fn in filenames:
            if fn.endswith('.jpg'):
                count += 1
                shutil.copy(os.path.join(root, fn), os.path.join(output_folder, 'c_{:010d}.jpg'.format(count)))


if __name__ == '__main__':
    merge_images('p/beach', 'output')
    for f in glob.glob(os.path.join('/home/logic/test/data/1xx_Beach', 'c_*.jpg')):
        os.remove(f)
    for f in glob.glob('output/*.jpg'):
        img = cv2.imread(f)
        if img is None:
            continue
        shutil.copy(f, '/home/logic/test/data/1xx_Beach')
    for f in glob.glob('output/*.jpg'):
        os.remove(f)
    print('beach done')

    merge_images('p/building', 'output')
    for f in glob.glob('output/*.jpg'):
        img = cv2.imread(f)
        if img is None:
            continue
        shutil.copy(f, '/home/logic/test/data/2xx_Building')
    for f in glob.glob('output/*.jpg'):
        os.remove(f)
    print('building done')

    merge_images('p/bus', 'output')
    for f in glob.glob('output/*.jpg'):
        img = cv2.imread(f)
        if img is None:
            continue
        shutil.copy(f, '/home/logic/test/data/3xx_Bus')
    for f in glob.glob('output/*.jpg'):
        os.remove(f)
    print('bus done')

    merge_images('p/dinosaur', 'output')
    for f in glob.glob('output/*.jpg'):
        img = cv2.imread(f)
        if img is None:
            continue
        shutil.copy(f, '/home/logic/test/data/4xx_Dinosaur')
    for f in glob.glob('output/*.jpg'):
        os.remove(f)
    print('dinosaur done')

    merge_images('p/elephant', 'output')
    for f in glob.glob('output/*.jpg'):
        img = cv2.imread(f)
        if img is None:
            continue
        shutil.copy(f, '/home/logic/test/data/5xx_Elephant')
    for f in glob.glob('output/*.jpg'):
        os.remove(f)
    print('elephant done')

    merge_images('p/horse', 'output')
    for f in glob.glob('output/*.jpg'):
        img = cv2.imread(f)
        if img is None:
            continue
        shutil.copy(f, '/home/logic/test/data/7xx_Horse')
    for f in glob.glob('output/*.jpg'):
        os.remove(f)
    print('horse done')

    merge_images('p/mountain', 'output')
    for f in glob.glob('output/*.jpg'):
        img = cv2.imread(f)
        if img is None:
            continue
        shutil.copy(f, '/home/logic/test/data/8xx_Mountain')
    for f in glob.glob('output/*.jpg'):
        os.remove(f)
    print('mountain done')

    merge_images('p/dish', 'output')
    for f in glob.glob('output/*.jpg'):
        img = cv2.imread(f)
        if img is None:
            continue
        shutil.copy(f, '/home/logic/test/data/9xx_Dish')
    for f in glob.glob('output/*.jpg'):
        os.remove(f)
    print('dish done')

    print('done')

View File

@@ -0,0 +1,114 @@
import glob
import os
import shutil
import cv2
import random
import string
beach_src_path = "p/beach"
beach_target_path = "/home/logic/_wsl_workspace/comission-playlist-2024/vinniesniper-54816/src/data/1xx_Beach"
building_src_path = "p/building"
building_target_path = "/home/logic/_wsl_workspace/comission-playlist-2024/vinniesniper-54816/src/data/2xx_Building"
bus_src_path = "p/bus"
bus_target_path = "/home/logic/_wsl_workspace/comission-playlist-2024/vinniesniper-54816/src/data/3xx_Bus"
dinosaur_src_path = "p/dinosaur"
dinosaur_target_path = "/home/logic/_wsl_workspace/comission-playlist-2024/vinniesniper-54816/src/data/4xx_Dinosaur"
elephant_src_path = "p/elephant"
elephant_target_path = "/home/logic/_wsl_workspace/comission-playlist-2024/vinniesniper-54816/src/data/5xx_Elephant"
flower_src_path = "p/flower"
flower_target_path = "/home/logic/_wsl_workspace/comission-playlist-2024/vinniesniper-54816/src/data/6xx_Flower"
horse_src_path = "p/horse"
horse_target_path = "/home/logic/_wsl_workspace/comission-playlist-2024/vinniesniper-54816/src/data/7xx_Horse"
mountain_src_path = "p/mountain"
mountain_target_path = "/home/logic/_wsl_workspace/comission-playlist-2024/vinniesniper-54816/src/data/8xx_Mountain"
dish_src_path = "p/dish"
dish_target_path = "/home/logic/_wsl_workspace/comission-playlist-2024/vinniesniper-54816/src/data/9xx_Dish"
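# copy_img() below stages each class into its own temp folder and then refreshes the matching
# nxx_* data folder; one ThreadPoolExecutor worker handles each class in parallel.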
def merge_images(input_folder, output_folder):
    # copy up to 10000 readable, non-empty .jpg files into output_folder with sequential names
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)
    count = 0
    for root, _, filenames in os.walk(input_folder):
        for fn in list(sorted(filenames)):
            if count < 10000:
                if os.path.getsize(os.path.join(root, fn)) == 0:
                    os.remove(os.path.join(root, fn))
                    continue
                img = cv2.imread(os.path.join(root, fn))
                if img is None:
                    os.remove(os.path.join(root, fn))
                    continue
                if fn.endswith('.jpg'):
                    count += 1
                    shutil.copy(os.path.join(root, fn), os.path.join(output_folder, 'c_{:010d}.jpg'.format(count)))


def copy_img(src_path, target_path):
    # stage src_path into a private temp dir, clear old c_*.jpg files in the target, then copy over
    random_str = ''.join(random.choices(string.ascii_letters + string.digits, k=5))
    tmp_dir = '_tmp/_tmp_' + random_str
    os.makedirs(tmp_dir)  # also creates the parent _tmp/ directory if it is missing
    for f in glob.glob(os.path.join(target_path, 'c_*.jpg')):
        os.remove(f)
    merge_images(src_path, tmp_dir)
    for f in sorted(glob.glob(tmp_dir + '/*.jpg')):
        shutil.copy(f, target_path)
    shutil.rmtree(tmp_dir)
    print(' img add done')


from concurrent.futures import ThreadPoolExecutor


def add_img(src_path, target_path):
    print('add to ' + target_path, end="")
    copy_img(src_path, target_path)


# one thread per class; the with-block waits for all submitted copies to finish
with ThreadPoolExecutor() as executor:
    executor.submit(add_img, beach_src_path, beach_target_path)
    executor.submit(add_img, building_src_path, building_target_path)
    executor.submit(add_img, bus_src_path, bus_target_path)
    executor.submit(add_img, dinosaur_src_path, dinosaur_target_path)
    executor.submit(add_img, elephant_src_path, elephant_target_path)
    executor.submit(add_img, flower_src_path, flower_target_path)
    executor.submit(add_img, horse_src_path, horse_target_path)
    executor.submit(add_img, mountain_src_path, mountain_target_path)
    executor.submit(add_img, dish_src_path, dish_target_path)
# print('add to bus', end="")
# copy_img(bus_src_path, bus_target_path)
# print('add to dinosaur', end="")
# copy_img(dinosaur_src_path, dinosaur_target_path)
# print('add to elephant', end="")
# copy_img(elephant_src_path, elephant_target_path)
# print('add to flower', end="")
# copy_img(flower_src_path, flower_target_path)
# print('add to horse', end="")
# copy_img(horse_src_path, horse_target_path)
# print('add to mountain', end="")
# copy_img(mountain_src_path, mountain_target_path)
# print('add to dish', end="")
# copy_img(dish_src_path, dish_target_path)
print('done')

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env bash
set -e
python ./merge.py p/mountain
cp /home/logic/_wsl_workspace/comission-playlist-2024/vinniesniper-54816-src/task1/_lab/003-crawler-mountain/output/*.jpg \
/home/logic/test/data/8xx_Mountain
echo "done"

View File

@@ -0,0 +1,5 @@
#!/usr/bin/env bash
set -ex
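# force-kill every running process whose command line mentions "crawler" (case-insensitive)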
ps -ef|grep -i crawler|grep -v grep|awk '{print $2}'|xargs kill -9

View File

@@ -0,0 +1,19 @@
import os
import shutil
import sys
def merge_images(input_folder, output_folder):
    # copy every .jpg under input_folder into output_folder with sequential names
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)
    count = 0
    for root, _, filenames in os.walk(input_folder):
        for fn in filenames:
            if fn.endswith('.jpg'):
                count += 1
                shutil.copy(os.path.join(root, fn), os.path.join(output_folder, 'c_{:010d}.jpg'.format(count)))


if __name__ == '__main__':
    # start from a clean output_mountain folder, then merge the crawled mountain images into it
    for f in os.listdir('output_mountain'):
        if f.endswith('.jpg'):
            os.remove(os.path.join('output_mountain', f))
    merge_images("/home/logic/_workspace/task-list/servers/logic-NUC8i5BEH/opencv-workdesk/001_monitor/src/003-crawler-mountain/p/mountain", 'output_mountain')

View File

@@ -0,0 +1,140 @@
import os
import shutil
import sys
from multiprocessing import Pool
import imghdr
import hashlib
from PIL import Image
base_dir = (
"/home/logic/_workspace/task-list/servers/logic-NUC8i5BEH/opencv-workdesk/001_monitor/src/003-crawler-mountain/p"
)
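# base_dir points at the crawler output tree: one sub-folder per class (african, beach, ..., mountain)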
def check_same_image(file_path):
    # SHA-256 digest of the file contents (used to spot duplicate downloads)
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()


def p_check_same_image(in_path):
    # delete anything that is not an image or is a truncated/invalid JPEG
    for root, _, files in os.walk(in_path):
        for file in files:
            file_path = os.path.join(root, file)
            if imghdr.what(file_path) is None:
                print(file_path, "is not image")
                os.remove(file_path)
                continue
            with open(file_path, "rb") as f:
                contents = f.read()
            if contents.startswith(b"\xff\xd8") and contents.endswith(b"\xff\xd9"):
                # complete JPEG (SOI and EOI markers present)
                pass
            else:
                # print(file_path, "is not valid jpeg")
                os.remove(file_path)


def resize_image(file_path, max_size=1920):
    if os.path.getsize(file_path) == 0:
        os.remove(file_path)
        print(f"Deleted empty file: {file_path}")
        return
    with Image.open(file_path) as img:
        # resize the image if its width or height exceeds max_size
        if img.width > max_size or img.height > max_size:
            # calculate the new size while keeping the aspect ratio
            aspect_ratio = img.width / img.height
            if img.width > img.height:
                new_width = min(img.width, max_size)
                new_height = int(new_width / aspect_ratio)
            else:
                new_height = min(img.height, max_size)
                new_width = int(new_height * aspect_ratio)
            resized_img = img.resize((new_width, new_height))
            resized_img.save(file_path)
            print("resize done " + file_path)
        else:
            # already small enough, skip
            # print("skipped " + file_path)
            return


def resize_image_worker(file_path):
    try:
        resize_image(file_path)
    except Exception as e:
        print(file_path)
        print(e)


def p_resize_image(in_path):
    # note: walks the files sequentially despite the name; no multiprocessing is used here
    for root, _, files in os.walk(in_path):
        for file in files:
            resize_image_worker(os.path.join(root, file))


def merge_to_output(input_folder, output_folder):
    # copy every .jpg under input_folder into output_folder with sequential names
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)
    count = 0
    for root, _, filenames in os.walk(input_folder):
        for fn in filenames:
            if fn.endswith(".jpg"):
                count += 1
                shutil.copy(os.path.join(root, fn), os.path.join(output_folder, "c_{:010d}.jpg".format(count)))


def process_dir(input_folder, output_dir):
    # rebuild output_dir from input_folder, then validate and resize the result
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for f in os.listdir(output_dir):
        if f.endswith(".jpg"):
            os.remove(os.path.join(output_dir, f))
    merge_to_output(input_folder, output_dir)
    p_check_same_image(output_dir)
    p_resize_image(output_dir)


def process_dir_mp(args):
    print("process_dir", *args)
    process_dir(*args)


if __name__ == "__main__":
    # two worker processes; uncomment more class folders below to process them as well
    with Pool(processes=2) as p:
        p.map(
            process_dir_mp,
            [
                (f"{base_dir}/{t_dir}", f"output_{t_dir}")
                for t_dir in [
                    # "building",
                    # "african",
                    # "beach",
                    # "bus",
                    # "dinosaur",
                    # "dish",
                    "elephant",
                    # "horse",
                    # "flower",
                    # "mountain",
                ]
            ],
        )
    print("done")

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Some files were not shown because too many files have changed in this diff.