#!/usr/bin/env python
"""Scrape the Medical Council of Hong Kong (MCHK) list of registered doctors."""
import json

import requests
from bs4 import BeautifulSoup

output = []


def parse_tbl(page_no=1):
    """Parse one page of the register and append its records to `output`."""
    main_url = f"https://www.mchk.org.hk/english/list_register/list.php?page={page_no}&ipp=20&type=L"
    req = requests.get(main_url)
    soup = BeautifulSoup(req.text, "html.parser")

    trs = soup.select("table#Table_5 table tr")

    # The first two rows are table headers; the rest hold doctor records.
    for tr in trs[2:]:
        try:
            reg_tds = tr.select('td[headers="reg"]')
            if len(reg_tds) > 0:
                # Row starts a new doctor: registration number, name,
                # the column whose header id is "date" (kept under the
                # original variable name "address"), and the first pair
                # of qualification cells.
                reg = reg_tds[0].get_text()
                name = tr.select('td[headers="name"]')[0].get_text()
                address = tr.select('td[headers="date"]')[0].get_text()

                qual_tds = tr.select('td[headers="qualification"]')
                qualification_0 = qual_tds[0].get_text()
                qualification_1 = qual_tds[1].get_text()

                doctor_info = [
                    reg,
                    name,
                    address,
                    [[qualification_0, qualification_1]],
                ]
                output.append(doctor_info)
            else:
                # Continuation row: the "reg" cell is empty, so the row only
                # carries extra qualification pairs for the previous doctor.
                qual_tds = tr.select('td[headers="qualification"]')
                for i in range(0, len(qual_tds), 2):
                    qualifications = [
                        qual_tds[i].get_text(),
                        qual_tds[i + 1].get_text(),
                    ]
                    # Append to the qualification list of the last doctor added.
                    output[-1][3].append(qualifications)
        except Exception:
            # Log the offending row and continue with the rest of the page.
            print(tr)
            print("error")


# Walk through pages 1..767 of the register (20 entries per page).
for i in range(1, 767 + 1):
    try:
        print(f"parsing {i}...")
        parse_tbl(i)
    except Exception:
        # Record the failed page number and keep going.
        print(i)

# Overwrite output.json with everything collected.
with open("output.json", "w", encoding="utf8") as fo:
    json.dump(output, fo, ensure_ascii=False)
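
# --- Illustrative sanity check (an addition, not part of the scrape itself) ---
# A minimal sketch, assuming the run above completed and output.json was
# written: read the file back and print a quick summary. Each record is
# expected to be [reg_no, name, date_column_text, [[qualification, qualification], ...]],
# matching the structure built in parse_tbl above.
with open("output.json", encoding="utf8") as fi:
    doctors = json.load(fi)

print(f"scraped {len(doctors)} doctors")
if doctors:
    # Show the first record as a structural example.
    print(doctors[0])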