# 004_comission/vinniesniper-54816/task1/_tools/crawer/crawler_dish.py
from multiprocessing import Pool
import random

# Only the three keyword-based engines used below are imported.
# icrawler.builtin also provides FlickrImageCrawler (needs an API key) and
# GreedyImageCrawler (crawls domains, not search keywords).
from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler

GOOGLE = "google"
BING = "bing"
BAIDU = "baidu"
# Search keywords, one crawl per keyword per engine. This script collects
# dish images; to crawl something else, replace the entries below (a
# programmatic alternative is sketched right after the list).
classes = [
    "dish",
    "dish over the world",
    "dish in hong kong",
    "dish in china",
    "dish in england",
    "dish in us",
    "dish in australia",
    "dish in brazil",
    "dish in india",
    "dish in japan",
    "dish in russia",
    "dish in south africa",
    "dish in argentina",
    "dish in mexico",
    "dish in italy",
    "dish in france",
    "dish in spain",
    "dish in germany",
    "dish in thailand",
    "dish in vietnam",
    "dish in indonesia",
    "dish in philippines",
    "dish in malaysia",
    "dish in singapore",
    "dish in egypt",
    "dish in turkey",
    "dish in greece",
    "dish in portugal",
    "dish in netherlands",
    "dish in belgium",
    "dish in sweden",
    "dish in norway",
    "dish in denmark",
    "dish in finland",
    "dish in poland",
    "dish in ukraine",
]
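# Hypothetical sketch (the names below are illustrative, not from this
# project): since the per-country keywords all follow one pattern, the
# list could equivalently be generated from a country list:
#
# countries = ["hong kong", "china", "england", "us"]  # ... and so on
# classes = ["dish", "dish over the world"] + [f"dish in {c}" for c in countries]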
# Upper bound on images per (engine, keyword) pair. The engines cap their
# own result counts (typically around 1000 per query), so in practice this
# means "as many as the engine will return".
MAX_NUM = 99999

# Downloads land under p/dish/<engine>_<keyword>/ relative to the working
# directory, one folder per (engine, keyword) pair.
def crawler(task):
    """Download images from one search engine for one keyword."""
    search_engine, keyword = task
    c_dir = keyword.replace(" ", "_")
    if search_engine == BING:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/dish/bing_{c_dir}"})
        bing_crawler.crawl(keyword=keyword, filters=None, max_num=MAX_NUM, offset=0)
    elif search_engine == GOOGLE:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/dish/google_{c_dir}"})
        google_crawler.crawl(keyword=keyword, filters=None, max_num=MAX_NUM, offset=0)
    elif search_engine == BAIDU:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/dish/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=keyword, filters=None, max_num=MAX_NUM, offset=0)
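# Optional: the Google and Bing crawlers accept a filters dict in place of
# filters=None. The keys below follow icrawler's documented filter options
# for GoogleImageCrawler; the values are illustrative, not from this
# project, and other engines support different keys.
#
# example_filters = dict(
#     size="large",          # skip thumbnails and icons
#     type="photo",          # photos rather than clipart or line drawings
#     license="commercial",  # reuse-friendly results only
# )
# google_crawler.crawl(keyword=keyword, filters=example_filters,
#                      max_num=MAX_NUM, offset=0)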
if __name__ == "__main__":
    # One task per (engine, keyword) pair; only engines crawler() handles
    # are queued, since anything else would be a silent no-op task.
    process_list = []
    for search_engine in [GOOGLE, BING, BAIDU]:
        for keyword in classes:
            process_list.append([search_engine, keyword])
    # Shuffle so concurrent workers spread their requests across all three
    # engines instead of hammering one engine at the start.
    random.shuffle(process_list)
    # The __main__ guard is required for Pool: spawn-based platforms
    # (Windows, macOS) re-import this module in each worker.
    with Pool() as pool:
        pool.map(crawler, process_list)
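# Sketch only: if the greedy engine were wanted too, it could not share the
# keyword-based crawler() above, because GreedyImageCrawler takes domains
# rather than a search query. A minimal, hypothetical use (the domain is a
# placeholder, not part of this project):
#
# from icrawler.builtin import GreedyImageCrawler
# greedy_crawler = GreedyImageCrawler(storage={"root_dir": "p/dish/greedy"})
# greedy_crawler.crawl(domains="http://example.com", max_num=MAX_NUM)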