louiscklaw
2025-01-31 22:36:21 +08:00
parent f4fc0b8f71
commit c7fb335275
1399 changed files with 5714 additions and 0 deletions


@@ -0,0 +1,85 @@
from multiprocessing import Pool
import random

from icrawler.builtin import BaiduImageCrawler
from icrawler.builtin import BingImageCrawler
from icrawler.builtin import GoogleImageCrawler

# Engine names used to dispatch crawl jobs in crawler() below.
# GreedyImageCrawler is not part of the keyword loop: it crawls entire
# sites by domain rather than by keyword (see the sketch at the end of
# this file).
google = "google"
bing = "bing"
baidu = "baidu"
# Keywords to crawl: dinosaur images plus per-country variants.
# To collect a different subject, replace the entries in the classes
# list (a compact way to generate the variants is sketched after it).
classes = [
"dinosaur",
"dinosaur over the world",
"dinosaur in hong kong",
"dinosaur in china",
"dinosaur in england",
"dinosaur in us",
"dinosaur in australia",
"dinosaur in brazil",
"dinosaur in india",
"dinosaur in japan",
"dinosaur in russia",
"dinosaur in south africa",
"dinosaur in argentina",
"dinosaur in mexico",
"dinosaur in italy",
"dinosaur in france",
"dinosaur in spain",
"dinosaur in germany",
"dinosaur in thailand",
"dinosaur in vietnam",
"dinosaur in indonesia",
"dinosaur in philippines",
"dinosaur in malaysia",
"dinosaur in singapore",
"dinosaur in egypt",
"dinosaur in turkey",
"dinosaur in greece",
"dinosaur in portugal",
"dinosaur in netherlands",
"dinosaur in belgium",
"dinosaur in sweden",
"dinosaur in norway",
"dinosaur in denmark",
"dinosaur in finland",
"dinosaur in poland",
"dinosaur in ukraine",
]
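
# The variants above all follow the pattern "dinosaur in <place>"; an
# equivalent, more compact way to build the same list (a sketch, with a
# hand-maintained places list) would be:
#
#   places = ["hong kong", "china", "england"]  # extend as needed
#   classes = ["dinosaur", "dinosaur over the world"] + [
#       f"dinosaur in {p}" for p in places
#   ]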
# Upper bound on images per (engine, keyword) job; each crawler stops
# early once its engine runs out of results.
number = 99999
# Images are saved under the "root_dir" passed to each crawler's storage,
# e.g. p/dinosaur/bing_<keyword>/ relative to the working directory.
def crawler(s_c):
    # s_c is an [engine_name, keyword] pair built in the main block.
    search_engine, c = s_c
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/dinosaur/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/dinosaur/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/dinosaur/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
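
# filters=None above requests unfiltered results. The builtin crawlers
# also accept a filters dict; a minimal sketch for Google (keys from the
# icrawler docs, values illustrative):
#
#   filters = dict(size="large", license="commercial,modify")
#   google_crawler.crawl(keyword=c, filters=filters, max_num=number, offset=0)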
if __name__ == "__main__":  # guard required: Pool re-imports this module in workers
    # One job per (engine, keyword) combination.
    process_list = []
    for search_engine in [google, bing, baidu]:
        for c in classes:
            process_list.append([search_engine, c])
    # Shuffle so concurrent workers spread requests across engines instead
    # of hitting one engine with every request at once.
    random.shuffle(process_list)
    with Pool() as pool:
        pool.map(crawler, process_list)
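
# GreedyImageCrawler has a different entry point: it walks whole sites, so
# crawl() takes domains rather than a keyword. A minimal sketch, with a
# placeholder domain (replace with a real target site):
#
#   from icrawler.builtin import GreedyImageCrawler
#   greedy_crawler = GreedyImageCrawler(storage={"root_dir": "p/dinosaur/greedy"})
#   greedy_crawler.crawl(domains="http://example.com", max_num=number)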