004_comission/vinniesniper-54816/task1/_tools/crawer/crawler_beach.py
import random
from multiprocessing import Pool

from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import GoogleImageCrawler

# Identifiers for the search engines handled by crawler() below.
google = "google"
bing = "bing"
baidu = "baidu"
# icrawler's GreedyImageCrawler crawls whole sites rather than keyword
# searches, so the "greedy" engine is named here but not wired into crawler().
greedy = "greedy"
# This script collects a beach image dataset for a detection task, so the
# search keywords below are all beach-related. To crawl other subjects,
# add the corresponding search terms to the classes list.
classes = [
"beach",
"beach over the world",
"beach in hong kong",
"beach in china",
"beach in england",
"beach in us",
"beach in australia",
"beach in brazil",
"beach in india",
"beach in japan",
"beach in russia",
"beach in south africa",
"beach in argentina",
"beach in mexico",
"beach in italy",
"beach in france",
"beach in spain",
"beach in germany",
"beach in thailand",
"beach in vietnam",
"beach in indonesia",
"beach in philippines",
"beach in malaysia",
"beach in singapore",
"beach in egypt",
"beach in turkey",
"beach in greece",
"beach in portugal",
"beach in netherlands",
"beach in belgium",
"beach in sweden",
"beach in norway",
"beach in denmark",
"beach in finland",
"beach in poland",
"beach in ukraine",
]
number = 99999  # upper bound on images per keyword; the engines usually return far fewer
# Each crawler stores its downloads under p/beach/<engine>_<keyword>/,
# relative to the directory the script is run from.
def crawler(s_c):
    """Download images for one (search_engine, keyword) pair."""
    search_engine = s_c[0]
    c = s_c[1]
    c_dir = c.replace(" ", "_")  # the keyword becomes part of the output folder name
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/beach/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/beach/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/beach/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
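
# A minimal smoke test (an added sketch, not part of the original script):
# crawl a handful of images for a single keyword with one engine to confirm
# icrawler works before launching the full multiprocess run below. The
# max_num=5 limit and the p/beach_test output directory are assumptions.
def smoke_test():
    test_crawler = BingImageCrawler(storage={"root_dir": "p/beach_test/bing_sample"})
    test_crawler.crawl(keyword="beach in hong kong", filters=None, max_num=5, offset=0)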
# Build one task per (engine, keyword) pair. "greedy" is excluded because
# crawler() has no branch for it: GreedyImageCrawler crawls domains, not
# keyword searches.
if __name__ == "__main__":
    process_list = []
    for search_engine in [google, bing, baidu]:
        for c in classes:
            process_list.append([search_engine, c])
    # Shuffle so consecutive workers do not all hit the same engine at once.
    random.shuffle(process_list)
    with Pool() as pool:
        pool.map(crawler, process_list)
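
# A small inspection helper (an added sketch, not part of the original script):
# count how many files each crawl produced under the p/beach output tree
# described above. Running it manually after a crawl is an assumption; the
# script itself never calls it.
def count_downloaded_images(root="p/beach"):
    """Print the number of downloaded files in each <engine>_<keyword> folder."""
    from pathlib import Path  # local import so the sketch stays self-contained
    for folder in sorted(Path(root).iterdir()):
        if folder.is_dir():
            print(folder.name, sum(1 for f in folder.iterdir() if f.is_file()))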