louiscklaw
2025-01-31 22:36:21 +08:00
parent f4fc0b8f71
commit c7fb335275
1399 changed files with 5714 additions and 0 deletions


@@ -0,0 +1,84 @@
import random
from multiprocessing import Pool

from icrawler.builtin import (
    BaiduImageCrawler,
    BingImageCrawler,
    GoogleImageCrawler,
)
google = "google"
bing = "bing"
baidu = "baidu"
# GreedyImageCrawler takes a list of domains rather than a search keyword,
# so only the three keyword engines above are used.
# Every query below is a variant of "dish" because we are collecting
# dish images; to crawl other subjects, replace the entries in the
# classes list (a programmatic alternative is sketched after the list).
classes = [
"dish",
"dish over the world",
"dish in hong kong",
"dish in china",
"dish in england",
"dish in us",
"dish in australia",
"dish in brazil",
"dish in india",
"dish in japan",
"dish in russia",
"dish in south africa",
"dish in argentina",
"dish in mexico",
"dish in italy",
"dish in france",
"dish in spain",
"dish in germany",
"dish in thailand",
"dish in vietnam",
"dish in indonesia",
"dish in philippines",
"dish in malaysia",
"dish in singapore",
"dish in egypt",
"dish in turkey",
"dish in greece",
"dish in portugal",
"dish in netherlands",
"dish in belgium",
"dish in sweden",
"dish in norway",
"dish in denmark",
"dish in finland",
"dish in poland",
"dish in ukraine",
]
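# A minimal sketch (not in the original script) of generating the same
# queries programmatically; "countries" is a hypothetical helper name
# introduced here for illustration:
# countries = ["hong kong", "china", "england", "us"]
# classes = ["dish", "dish over the world"] + [f"dish in {c}" for c in countries]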
number = 99999  # upper bound passed as max_num; engines usually return far fewer results
# Images are saved under the root_dir passed to each crawler's storage
# argument (see the layout example below).
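# For example, the query "dish in japan" crawled with Bing lands in
# p/dish/bing_dish_in_japan/, with files named by icrawler's default
# zero-padded counter (000001.jpg, 000002.jpg, ...).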

def crawler(s_c):
    """Run one (search_engine, query) crawl job in a worker process."""
    search_engine, c = s_c
    c_dir = c.replace(" ", "_")
    if search_engine == bing:
        bing_crawler = BingImageCrawler(storage={"root_dir": f"p/dish/bing_{c_dir}"})
        bing_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == google:
        google_crawler = GoogleImageCrawler(storage={"root_dir": f"p/dish/google_{c_dir}"})
        google_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
    elif search_engine == baidu:
        baidu_crawler = BaiduImageCrawler(storage={"root_dir": f"p/dish/baidu_{c_dir}"})
        baidu_crawler.crawl(keyword=c, filters=None, max_num=number, offset=0)
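
# Example usage (illustrative): crawler([bing, "dish in japan"]) runs a
# single Bing crawl synchronously, without the process pool below.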

if __name__ == "__main__":  # guard required for multiprocessing on spawn-based platforms
    # One (engine, query) job per combination, shuffled so requests are
    # spread across engines instead of hitting one engine repeatedly.
    process_list = [[e, c] for e in [google, bing, baidu] for c in classes]
    random.shuffle(process_list)
    with Pool() as pool:
        pool.map(crawler, process_list)
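
# A minimal sketch (assumption, not part of the original commit) of two
# optional knobs: icrawler crawlers accept thread counts such as
# downloader_threads in their constructors, and crawl() accepts
# engine-specific filters; the Google filter below ({"size": "large"})
# follows icrawler's documented filters, and "google_test" is a
# hypothetical directory name used only for illustration.
# google_crawler = GoogleImageCrawler(
#     downloader_threads=4,
#     storage={"root_dir": "p/dish/google_test"},
# )
# google_crawler.crawl(keyword="dish", filters={"size": "large"}, max_num=100)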