#!/usr/bin/env python
"""Scrape the paginated register list on www.mchk.org.hk into ``output.json``.

Every list page is fetched and its result table parsed into records of the
form ``[reg, name, address, [[qualification_0, qualification_1], ...]]``.
All records are accumulated in the module-level ``output`` list and dumped
as one JSON array once every page has been visited.
"""
import json
import os
import sys
from pprint import pprint

import requests
from bs4 import BeautifulSoup

# Number of the last list page to fetch (pages are numbered from 1).
LAST_PAGE = 767

# Seconds to wait for the server before giving up on a page, so that one
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 30

# Accumulates one record per practitioner across all parsed pages.
output = []


def _cell_texts(tr, header):
    """Return the text of every ``<td>`` in *tr* whose ``headers`` is *header*."""
    return [td.get_text() for td in tr.select(f'td[headers="{header}"]')]


def _parse_row(tr):
    """Fold one table row into ``output``.

    A row with a ``reg`` cell starts a new record.  A row without one only
    carries further qualification cells, which are appended pairwise to the
    most recently added record.

    NOTE(review): the third field of a record is read from the cell whose
    ``headers`` attribute is ``"date"`` although the original code labelled
    it "address" -- TODO confirm which of the two it really is.

    Raises:
        IndexError: a row with a ``reg`` cell lacks a name cell, a date cell
            or a pair of qualification cells.
        ValueError: a continuation row appears before any record exists, or
            it holds an odd number of qualification cells.
    """
    reg_texts = _cell_texts(tr, "reg")
    qualification_texts = _cell_texts(tr, "qualification")

    if reg_texts:
        name = _cell_texts(tr, "name")[0]
        address = _cell_texts(tr, "date")[0]
        first_pair = [qualification_texts[0], qualification_texts[1]]
        # Append only after every field was extracted, so a malformed row
        # never leaves a partial record behind.
        output.append([reg_texts[0], name, address, [first_pair]])
        return

    if not qualification_texts:
        # Rows without reg and qualification cells carry nothing to record.
        return

    if not output:
        raise ValueError("qualification row found before any register entry")

    # Cells come in pairs; keep every complete pair ...
    output[-1][3].extend(
        [first, second]
        for first, second in zip(qualification_texts[0::2], qualification_texts[1::2])
    )
    # ... and report a dangling cell instead of dropping it silently.
    if len(qualification_texts) % 2:
        raise ValueError("odd number of qualification cells in row")


def parse_tbl(page_no=1):
    """Fetch list page *page_no* and add its rows to ``output``.

    Rows that cannot be parsed are printed together with the error and
    skipped, so one malformed row does not discard the rest of the page.

    Args:
        page_no: 1-based number of the list page to fetch.

    Raises:
        requests.RequestException: the page could not be fetched, the
            request timed out, or the server answered with an HTTP error
            status.
    """
    main_url = f"https://www.mchk.org.hk/english/list_register/list.php?page={page_no}&ipp=20&type=L"
    response = requests.get(main_url, timeout=REQUEST_TIMEOUT)
    # Without this check an error page would parse to zero rows and the page
    # would be lost without any notice.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    rows = soup.select("table#Table_5 table tr")
    # The first two rows are skipped -- presumably header rows; verify
    # against the page markup.
    for tr in rows[2:]:
        try:
            _parse_row(tr)
        except Exception as exc:
            # Best-effort scraping: report the offending row with the reason
            # and carry on with the next one.
            print(tr)
            print(f'error: {exc!r}')


def main():
    """Parse pages 1..LAST_PAGE and write all records to ``output.json``."""
    for page_no in range(1, LAST_PAGE + 1):
        print(f'parsing {page_no}...')
        try:
            parse_tbl(page_no)
        except Exception as exc:
            # A failed page must not abort the run; name the page and the
            # cause so it can be re-fetched later.
            print(f'{page_no}: {exc!r}')

    # 'w' replaces any previous file content with the fresh dump.
    with open('output.json', 'w', encoding='utf8') as fo:
        json.dump(output, fo, ensure_ascii=False)


if __name__ == "__main__":
    main()