004_comission/vinniesniper-54816/task1/_tools/crawer/crawler_building.py
from multiprocessing import Pool
import random

from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import GoogleImageCrawler

# FlickrImageCrawler and GreedyImageCrawler are not used here: the Flickr
# crawler needs an API key, and GreedyImageCrawler crawls a list of domains
# rather than a search keyword.

google = "google"
bing = "bing"
baidu = "baidu"

# This script collects a dataset of building images, so every entry below is
# a "building ..." search query. To collect other images, edit this list.
classes = [
    "building",
    "building over the world",
    "building in hong kong",
    "building in china",
    "building in england",
    "building in us",
    "building in australia",
    "building in brazil",
    "building in india",
    "building in japan",
    "building in russia",
    "building in south africa",
    "building in argentina",
    "building in mexico",
    "building in italy",
    "building in france",
    "building in spain",
    "building in germany",
    "building in thailand",
    "building in vietnam",
    "building in indonesia",
    "building in philippines",
    "building in malaysia",
    "building in singapore",
    "building in egypt",
    "building in turkey",
    "building in greece",
    "building in portugal",
    "building in netherlands",
    "building in belgium",
    "building in sweden",
    "building in norway",
    "building in denmark",
    "building in finland",
    "building in poland",
    "building in ukraine",
]
# Upper bound on images per query; each engine stops earlier when it runs out
# of results (search engines cap image results well below this).
number = 99999

# Each crawl below saves its images under the `root_dir` it is given,
# e.g. p/building/google_building_in_hong_kong/.
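# Illustrative on-disk layout after a run (file names assume icrawler's
# default zero-padded naming; exact names may differ):
#   p/building/bing_building/000001.jpg
#   p/building/google_building_in_japan/000001.jpg
#   p/building/baidu_building_in_china/000001.jpg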
def crawler(s_c):
    """Run one (search_engine, keyword) job and save images to its own dir."""
    search_engine, c = s_c
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/building/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/building/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/building/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
if __name__ == "__main__":
    # One job per (engine, keyword) pair. `greedy` is dropped from the engine
    # list because crawler() has no branch for it (see the note on imports).
    process_list = []
    for search_engine in [google, bing, baidu]:
        for c in classes:
            process_list.append([search_engine, c])

    # Shuffle so no single engine is hit by all workers first, then run the
    # jobs in parallel, one worker process per CPU core by default. The
    # __main__ guard is required for multiprocessing on platforms that spawn
    # fresh interpreters (Windows, macOS).
    random.shuffle(process_list)
    with Pool() as pool:
        pool.map(crawler, process_list)
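
# Optional refinement, not in the original script: icrawler's `filters`
# argument (passed as filters=None above) can narrow results. A minimal
# sketch, assuming the filter keys documented for GoogleImageCrawler; the
# keyword, root_dir, and filter values here are illustrative:
#
#     filtered = GoogleImageCrawler(storage={"root_dir": "p/building/google_filtered"})
#     filtered.crawl(
#         keyword="building in japan",
#         filters={"size": "large", "license": "commercial,modify"},
#         max_num=1000,
#     )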