004_comission/vinniesniper-54816/task1/_tools/crawer/crawler_african.py

import random
from multiprocessing import Pool

from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler

# Identifiers for the search engines handled in crawler() below.
google = "google"
bing = "bing"
baidu = "baidu"
# Search keywords to crawl. To collect other image categories, add or
# replace entries in this list (see the example after the list).
classes = [
    "african people",
    "african people over the world",
    "african people in hong kong",
    "african people in china",
    "african people in england",
    "african people in us",
    "african people in australia",
    "african people in brazil",
    "african people in india",
    "african people in japan",
    "african people in russia",
    "african people in south africa",
    "african people in argentina",
    "african people in mexico",
    "african people in italy",
    "african people in france",
    "african people in spain",
    "african people in germany",
    "african people in thailand",
    "african people in vietnam",
    "african people in indonesia",
    "african people in philippines",
    "african people in malaysia",
    "african people in singapore",
    "african people in egypt",
    "african people in turkey",
    "african people in greece",
    "african people in portugal",
    "african people in netherlands",
    "african people in belgium",
    "african people in sweden",
    "african people in norway",
    "african people in denmark",
    "african people in finland",
    "african people in poland",
    "african people in ukraine",
]
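
# For example, one more keyword can be crawled simply by appending it
# before the crawl starts (hypothetical entry, not in the original list):
#
#     classes.append("african people in canada")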
# Upper bound on downloads per keyword; the engines return far fewer
# results in practice, so this effectively means "everything available".
number = 99999

# Images are saved under p/african/<engine>_<keyword>/, via the storage
# root_dir passed to each crawler below.
def crawler(s_c):
    """Crawl one (search_engine, keyword) pair into its own directory."""
    search_engine, c = s_c
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/african/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/african/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/african/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
if __name__ == "__main__":
    # One work item per (engine, keyword) combination.
    process_list = []
    for search_engine in [google, bing, baidu]:
        for c in classes:
            process_list.append([search_engine, c])

    # Shuffle so parallel workers spread across engines instead of all
    # hitting the same one, then fan the crawls out over a process pool.
    random.shuffle(process_list)
    with Pool() as pool:
        pool.map(crawler, process_list)
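
# The crawls above pass filters=None, i.e. unfiltered results. icrawler's
# built-in crawlers also accept a filters dict to narrow results on the
# engine side; a minimal sketch for GoogleImageCrawler (the root_dir and
# filter values below are illustrative assumptions, not from this script):
#
#     google_crawler = GoogleImageCrawler(storage={"root_dir": "p/african/google_filtered"})
#     google_crawler.crawl(
#         keyword="african people",
#         filters={"size": "large", "type": "photo"},
#         max_num=1000,
#     )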