# 004_comission/vinniesniper-54816/task1/_tools/crawer/crawler_bus.py
import random
from multiprocessing import Pool

from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler

# String tags used to dispatch each task to a search engine in crawler() below.
google = "google"
bing = "bing"
baidu = "baidu"
# We are building a bus image-detection dataset, so every query below is
# bus-related. To crawl other subjects, put their names in the classes list.
classes = [
    "bus",
    "bus over the world",
    "bus in hong kong",
    "bus in china",
    "bus in england",
    "bus in us",
    "bus in australia",
    "bus in brazil",
    "bus in india",
    "bus in japan",
    "bus in russia",
    "bus in south africa",
    "bus in argentina",
    "bus in mexico",
    "bus in italy",
    "bus in france",
    "bus in spain",
    "bus in germany",
    "bus in thailand",
    "bus in vietnam",
    "bus in indonesia",
    "bus in philippines",
    "bus in malaysia",
    "bus in singapore",
    "bus in egypt",
    "bus in turkey",
    "bus in greece",
    "bus in portugal",
    "bus in netherlands",
    "bus in belgium",
    "bus in sweden",
    "bus in norway",
    "bus in denmark",
    "bus in finland",
    "bus in poland",
    "bus in ukraine",
]
# Ask each engine for effectively unlimited results; in practice the engines
# cap how many they will return per query (typically on the order of 1000).
number = 99999

# Downloaded images are saved under the root_dir passed to each crawler's
# storage argument, created relative to the current working directory.
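# For example, Bing results for the query "bus in hong kong" would land in
# p/bus/bing_bus_in_hong_kong/ (icrawler numbers the files 000001.jpg,
# 000002.jpg, ... by default).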
def crawler(s_c):
    """Download images for one (search_engine, query) task."""
    search_engine, c = s_c
    # Turn the query into a filesystem-friendly directory suffix.
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/bus/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/bus/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/bus/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
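    # icrawler also ships a GreedyImageCrawler, but it scrapes images from
    # whole sites via a `domains` argument rather than running keyword
    # searches, so it does not fit the keyword dispatch above. A minimal
    # sketch of a fourth branch, assuming a hypothetical site worth scraping:
    #
    # elif search_engine == "greedy":
    #     from icrawler.builtin import GreedyImageCrawler
    #     greedy_crawler = GreedyImageCrawler(storage={"root_dir": f"p/bus/greedy_{c_dir}"})
    #     greedy_crawler.crawl(domains="www.example.com", max_num=number)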

# One task per (engine, query) pair, fanned out over a process pool. The
# __main__ guard is required so worker processes do not re-run this block
# when they import the module.
if __name__ == "__main__":
    process_list = [[search_engine, c]
                    for search_engine in [google, bing, baidu]
                    for c in classes]
    # Shuffle so concurrent workers spread their requests across engines.
    random.shuffle(process_list)
    with Pool() as pool:
        pool.map(crawler, process_list)
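
# Usage (assuming icrawler is installed, e.g. `pip install icrawler`):
#     python crawler_bus.py
# Each worker writes into its own engine-and-query directory under p/bus/.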