004_comission/vinniesniper-54816/task1/_tools/crawer/crawler_dinosaur.py
from multiprocessing import Pool
import random

from icrawler.builtin import BingImageCrawler
from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import GoogleImageCrawler
# Engine identifiers used to dispatch inside crawler(). GreedyImageCrawler
# is not used: it crawls arbitrary sites rather than keyword searches, so
# it does not fit this script.
google = "google"
bing = "bing"
baidu = "baidu"
# We are collecting dinosaur images, so every query below is a dinosaur
# variant. To crawl other subjects, change the entries in this list
# (a compact way to generate it is sketched right after the list).
classes = [
    "dinosaur",
    "dinosaur over the world",
    "dinosaur in hong kong",
    "dinosaur in china",
    "dinosaur in england",
    "dinosaur in us",
    "dinosaur in australia",
    "dinosaur in brazil",
    "dinosaur in india",
    "dinosaur in japan",
    "dinosaur in russia",
    "dinosaur in south africa",
    "dinosaur in argentina",
    "dinosaur in mexico",
    "dinosaur in italy",
    "dinosaur in france",
    "dinosaur in spain",
    "dinosaur in germany",
    "dinosaur in thailand",
    "dinosaur in vietnam",
    "dinosaur in indonesia",
    "dinosaur in philippines",
    "dinosaur in malaysia",
    "dinosaur in singapore",
    "dinosaur in egypt",
    "dinosaur in turkey",
    "dinosaur in greece",
    "dinosaur in portugal",
    "dinosaur in netherlands",
    "dinosaur in belgium",
    "dinosaur in sweden",
    "dinosaur in norway",
    "dinosaur in denmark",
    "dinosaur in finland",
    "dinosaur in poland",
    "dinosaur in ukraine",
]
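# The hand-written list above can equivalently be generated from a country
# list, which is easier to extend. A minimal sketch (kept as a comment so it
# does not redefine `classes`; the country tuple is abbreviated here):
#
#   countries = ("hong kong", "china", "england", "us", "australia", ...)
#   classes = ["dinosaur", "dinosaur over the world"] + [
#       f"dinosaur in {country}" for country in countries
#   ]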
# Effectively "as many images as the engine will return"; engines usually
# cap results far below this number.
number = 99999
# Downloaded images are saved under p/dinosaur/<engine>_<query>/, relative
# to the directory the script is run from (see root_dir below).
def crawler(s_c):
    """Run one (search_engine, query) job and store its images on disk."""
    search_engine, c = s_c
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/dinosaur/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/dinosaur/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/dinosaur/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
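# crawl() also accepts a `filters` dict. The keys below follow the icrawler
# documentation for the Google crawler, but supported keys and values differ
# per engine and icrawler version, so treat this helper as a sketch to verify
# locally. Example: restrict Google results to large photos.
def crawl_google_large_photos(query, out_dir):
    google_crawler = GoogleImageCrawler(storage={"root_dir": out_dir})
    google_crawler.crawl(
        keyword=query,
        filters={"size": "large", "type": "photo"},
        max_num=number,
        offset=0,
    )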
# The __main__ guard is required for multiprocessing on platforms that
# spawn worker processes (Windows, macOS).
if __name__ == "__main__":
    process_list = []
    for search_engine in [google, bing, baidu]:
        for c in classes:
            process_list.append([search_engine, c])
    # Shuffle so slow engines and queries are spread across the pool.
    random.shuffle(process_list)
    with Pool() as pool:
        pool.map(crawler, process_list)
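# Different engines often download identical files. This optional helper
# (a sketch, not part of the original script) hashes every file under a
# directory tree and deletes exact byte-for-byte duplicates. It is defined
# but never called here; invoke dedup_images("p/dinosaur") manually after
# the pool finishes if you want a deduplicated set.
def dedup_images(root="p/dinosaur"):
    import hashlib
    from pathlib import Path

    seen = {}
    for path in sorted(Path(root).rglob("*")):
        if not path.is_file():
            continue
        digest = hashlib.sha256(path.read_bytes()).hexdigest()
        if digest in seen:
            path.unlink()  # exact duplicate of a file seen earlier
        else:
            seen[digest] = path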