update,
This commit is contained in:
82
bf_helloworld_1.py
Normal file
82
bf_helloworld_1.py
Normal file
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import os,sys
|
||||
from pprint import pprint
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import json
|
||||
|
||||
# Module-level accumulator shared by every parse_tbl() call and dumped to
# output.json at the end of the script. Each entry is a list of the form
# [reg, name, <text of the headers="date" cell>, [[qual_cell, qual_cell], ...]].
output = []
|
||||
|
||||
def parse_tbl(page_no=1):
    """Fetch one page of the register listing and append its rows to ``output``.

    Rows that carry a ``td[headers="reg"]`` cell start a new record of the
    form ``[reg, name, date_text, [[qual_a, qual_b]]]`` which is appended to
    the module-level ``output`` list. Rows without a ``reg`` cell are treated
    as continuation rows: their ``qualification`` cells are read in pairs and
    each pair is appended to the qualification list (index 3) of the most
    recently appended record, which may have come from a previous page.

    Args:
        page_no: Page number substituted into the ``page`` query parameter.

    Returns:
        None. Results are accumulated in the module-level ``output`` list.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    main_url = f"https://www.mchk.org.hk/english/list_register/list.php?page={page_no}&ipp=20&type=L"

    # A timeout keeps a single stalled connection from hanging the whole
    # crawl, and raise_for_status() prevents an HTTP error page from being
    # parsed as if it were an (empty) listing page.
    response = requests.get(main_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # The first two rows are skipped (presumably table header rows -- confirm
    # against the live page markup).
    for tr in soup.select("table#Table_5 table tr")[2:]:
        try:
            reg_cells = tr.select('td[headers="reg"]')
            qual_cells = tr.select('td[headers="qualification"]')

            if reg_cells:
                # Row starts a new record. The third slot holds the text of
                # the cell marked headers="date".
                record = [
                    reg_cells[0].get_text(),
                    tr.select('td[headers="name"]')[0].get_text(),
                    tr.select('td[headers="date"]')[0].get_text(),
                    [[qual_cells[0].get_text(), qual_cells[1].get_text()]],
                ]
                output.append(record)
            else:
                # Continuation row: attach each pair of qualification cells
                # to the last record collected so far.
                for i in range(0, len(qual_cells), 2):
                    pair = [qual_cells[i].get_text(), qual_cells[i + 1].get_text()]
                    output[-1][3].append(pair)
        except IndexError:
            # A row is missing an expected cell, has an unpaired
            # qualification cell, or there is no previous record to attach
            # to. Report the row and carry on with the rest of the page.
            print(tr)
            print('error')
|
||||
|
||||
# Crawl listing pages 1..767 (the page count is hard-coded). The crawl is
# best-effort: a failure on one page is reported and the loop moves on, so
# records collected from other pages are still written out.
for i in range(1, 767 + 1):
    try:
        print(f'parsing {i}...')
        parse_tbl(i)
    except Exception as e:
        # Report which page failed and why instead of discarding the error.
        print(i, repr(e))

# Mode 'w' truncates the file on open, so the result is the same as opening
# with 'a+' and calling truncate(0) before writing.
with open('output.json', 'w', encoding='utf8') as fo:
    json.dump(output, fo, ensure_ascii=False)
|
Reference in New Issue
Block a user