From 358fd197d1accbfa78a866f535344523f9a9326c Mon Sep 17 00:00:00 2001 From: louiscklaw Date: Sat, 1 Feb 2025 02:02:07 +0800 Subject: [PATCH] update, --- Pipfile | 13 ++ Pipfile.lock | 157 +++++++++++++++++++ bf_helloworld.py | 57 +++++++ bf_helloworld_1-extract.py | 44 ++++++ bf_helloworld_1.py | 82 ++++++++++ bf_helloworld_old.py | 51 ++++++ docs/.gitkeep | 0 docs/Screenshot from 2023-07-31 23-28-33.png | 3 + gitUpdate.bat | 7 + history.md | 5 + journey.md | 15 ++ main.py | 82 ++++++++++ package.json | 13 ++ 13 files changed, 529 insertions(+) create mode 100644 Pipfile create mode 100644 Pipfile.lock create mode 100644 bf_helloworld.py create mode 100644 bf_helloworld_1-extract.py create mode 100644 bf_helloworld_1.py create mode 100644 bf_helloworld_old.py create mode 100644 docs/.gitkeep create mode 100644 docs/Screenshot from 2023-07-31 23-28-33.png create mode 100644 gitUpdate.bat create mode 100644 history.md create mode 100644 journey.md create mode 100644 main.py create mode 100644 package.json diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..240f5fb --- /dev/null +++ b/Pipfile @@ -0,0 +1,13 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +bs4 = "*" +requests = "*" + +[dev-packages] + +[requires] +python_version = "3.11" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..1ab14f6 --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,157 @@ +{ + "_meta": { + "hash": { + "sha256": "e1f9e020d2828a92340419cca07ebee6f3d71b0262cbb9458ef7d65da17746d6" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.11" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "beautifulsoup4": { + "hashes": [ + "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da", + "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a" + ], + "markers": "python_full_version >= '3.6.0'", + 
"version": "==4.12.2" + }, + "bs4": { + "hashes": [ + "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" + ], + "index": "pypi", + "version": "==0.0.1" + }, + "certifi": { + "hashes": [ + "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082", + "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9" + ], + "markers": "python_version >= '3.6'", + "version": "==2023.7.22" + }, + "charset-normalizer": { + "hashes": [ + "sha256:04e57ab9fbf9607b77f7d057974694b4f6b142da9ed4a199859d9d4d5c63fe96", + "sha256:09393e1b2a9461950b1c9a45d5fd251dc7c6f228acab64da1c9c0165d9c7765c", + "sha256:0b87549028f680ca955556e3bd57013ab47474c3124dc069faa0b6545b6c9710", + "sha256:1000fba1057b92a65daec275aec30586c3de2401ccdcd41f8a5c1e2c87078706", + "sha256:1249cbbf3d3b04902ff081ffbb33ce3377fa6e4c7356f759f3cd076cc138d020", + "sha256:1920d4ff15ce893210c1f0c0e9d19bfbecb7983c76b33f046c13a8ffbd570252", + "sha256:193cbc708ea3aca45e7221ae58f0fd63f933753a9bfb498a3b474878f12caaad", + "sha256:1a100c6d595a7f316f1b6f01d20815d916e75ff98c27a01ae817439ea7726329", + "sha256:1f30b48dd7fa1474554b0b0f3fdfdd4c13b5c737a3c6284d3cdc424ec0ffff3a", + "sha256:203f0c8871d5a7987be20c72442488a0b8cfd0f43b7973771640fc593f56321f", + "sha256:246de67b99b6851627d945db38147d1b209a899311b1305dd84916f2b88526c6", + "sha256:2dee8e57f052ef5353cf608e0b4c871aee320dd1b87d351c28764fc0ca55f9f4", + "sha256:2efb1bd13885392adfda4614c33d3b68dee4921fd0ac1d3988f8cbb7d589e72a", + "sha256:2f4ac36d8e2b4cc1aa71df3dd84ff8efbe3bfb97ac41242fbcfc053c67434f46", + "sha256:3170c9399da12c9dc66366e9d14da8bf7147e1e9d9ea566067bbce7bb74bd9c2", + "sha256:3b1613dd5aee995ec6d4c69f00378bbd07614702a315a2cf6c1d21461fe17c23", + "sha256:3bb3d25a8e6c0aedd251753a79ae98a093c7e7b471faa3aa9a93a81431987ace", + "sha256:3bb7fda7260735efe66d5107fb7e6af6a7c04c7fce9b2514e04b7a74b06bf5dd", + "sha256:41b25eaa7d15909cf3ac4c96088c1f266a9a93ec44f87f1d13d4a0e86c81b982", + 
"sha256:45de3f87179c1823e6d9e32156fb14c1927fcc9aba21433f088fdfb555b77c10", + "sha256:46fb8c61d794b78ec7134a715a3e564aafc8f6b5e338417cb19fe9f57a5a9bf2", + "sha256:48021783bdf96e3d6de03a6e39a1171ed5bd7e8bb93fc84cc649d11490f87cea", + "sha256:4957669ef390f0e6719db3613ab3a7631e68424604a7b448f079bee145da6e09", + "sha256:5e86d77b090dbddbe78867a0275cb4df08ea195e660f1f7f13435a4649e954e5", + "sha256:6339d047dab2780cc6220f46306628e04d9750f02f983ddb37439ca47ced7149", + "sha256:681eb3d7e02e3c3655d1b16059fbfb605ac464c834a0c629048a30fad2b27489", + "sha256:6c409c0deba34f147f77efaa67b8e4bb83d2f11c8806405f76397ae5b8c0d1c9", + "sha256:7095f6fbfaa55defb6b733cfeb14efaae7a29f0b59d8cf213be4e7ca0b857b80", + "sha256:70c610f6cbe4b9fce272c407dd9d07e33e6bf7b4aa1b7ffb6f6ded8e634e3592", + "sha256:72814c01533f51d68702802d74f77ea026b5ec52793c791e2da806a3844a46c3", + "sha256:7a4826ad2bd6b07ca615c74ab91f32f6c96d08f6fcc3902ceeedaec8cdc3bcd6", + "sha256:7c70087bfee18a42b4040bb9ec1ca15a08242cf5867c58726530bdf3945672ed", + "sha256:855eafa5d5a2034b4621c74925d89c5efef61418570e5ef9b37717d9c796419c", + "sha256:8700f06d0ce6f128de3ccdbc1acaea1ee264d2caa9ca05daaf492fde7c2a7200", + "sha256:89f1b185a01fe560bc8ae5f619e924407efca2191b56ce749ec84982fc59a32a", + "sha256:8b2c760cfc7042b27ebdb4a43a4453bd829a5742503599144d54a032c5dc7e9e", + "sha256:8c2f5e83493748286002f9369f3e6607c565a6a90425a3a1fef5ae32a36d749d", + "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6", + "sha256:94aea8eff76ee6d1cdacb07dd2123a68283cb5569e0250feab1240058f53b623", + "sha256:95eb302ff792e12aba9a8b8f8474ab229a83c103d74a750ec0bd1c1eea32e669", + "sha256:9bd9b3b31adcb054116447ea22caa61a285d92e94d710aa5ec97992ff5eb7cf3", + "sha256:9e608aafdb55eb9f255034709e20d5a83b6d60c054df0802fa9c9883d0a937aa", + "sha256:a103b3a7069b62f5d4890ae1b8f0597618f628b286b03d4bc9195230b154bfa9", + "sha256:a386ebe437176aab38c041de1260cd3ea459c6ce5263594399880bbc398225b2", + "sha256:a38856a971c602f98472050165cea2cdc97709240373041b69030be15047691f", 
+ "sha256:a401b4598e5d3f4a9a811f3daf42ee2291790c7f9d74b18d75d6e21dda98a1a1", + "sha256:a7647ebdfb9682b7bb97e2a5e7cb6ae735b1c25008a70b906aecca294ee96cf4", + "sha256:aaf63899c94de41fe3cf934601b0f7ccb6b428c6e4eeb80da72c58eab077b19a", + "sha256:b0dac0ff919ba34d4df1b6131f59ce95b08b9065233446be7e459f95554c0dc8", + "sha256:baacc6aee0b2ef6f3d308e197b5d7a81c0e70b06beae1f1fcacffdbd124fe0e3", + "sha256:bf420121d4c8dce6b889f0e8e4ec0ca34b7f40186203f06a946fa0276ba54029", + "sha256:c04a46716adde8d927adb9457bbe39cf473e1e2c2f5d0a16ceb837e5d841ad4f", + "sha256:c0b21078a4b56965e2b12f247467b234734491897e99c1d51cee628da9786959", + "sha256:c1c76a1743432b4b60ab3358c937a3fe1341c828ae6194108a94c69028247f22", + "sha256:c4983bf937209c57240cff65906b18bb35e64ae872da6a0db937d7b4af845dd7", + "sha256:c4fb39a81950ec280984b3a44f5bd12819953dc5fa3a7e6fa7a80db5ee853952", + "sha256:c57921cda3a80d0f2b8aec7e25c8aa14479ea92b5b51b6876d975d925a2ea346", + "sha256:c8063cf17b19661471ecbdb3df1c84f24ad2e389e326ccaf89e3fb2484d8dd7e", + "sha256:ccd16eb18a849fd8dcb23e23380e2f0a354e8daa0c984b8a732d9cfaba3a776d", + "sha256:cd6dbe0238f7743d0efe563ab46294f54f9bc8f4b9bcf57c3c666cc5bc9d1299", + "sha256:d62e51710986674142526ab9f78663ca2b0726066ae26b78b22e0f5e571238dd", + "sha256:db901e2ac34c931d73054d9797383d0f8009991e723dab15109740a63e7f902a", + "sha256:e03b8895a6990c9ab2cdcd0f2fe44088ca1c65ae592b8f795c3294af00a461c3", + "sha256:e1c8a2f4c69e08e89632defbfabec2feb8a8d99edc9f89ce33c4b9e36ab63037", + "sha256:e4b749b9cc6ee664a3300bb3a273c1ca8068c46be705b6c31cf5d276f8628a94", + "sha256:e6a5bf2cba5ae1bb80b154ed68a3cfa2fa00fde979a7f50d6598d3e17d9ac20c", + "sha256:e857a2232ba53ae940d3456f7533ce6ca98b81917d47adc3c7fd55dad8fab858", + "sha256:ee4006268ed33370957f55bf2e6f4d263eaf4dc3cfc473d1d90baff6ed36ce4a", + "sha256:eef9df1eefada2c09a5e7a40991b9fc6ac6ef20b1372abd48d2794a316dc0449", + "sha256:f058f6963fd82eb143c692cecdc89e075fa0828db2e5b291070485390b2f1c9c", + 
"sha256:f25c229a6ba38a35ae6e25ca1264621cc25d4d38dca2942a7fce0b67a4efe918", + "sha256:f2a1d0fd4242bd8643ce6f98927cf9c04540af6efa92323e9d3124f57727bfc1", + "sha256:f7560358a6811e52e9c4d142d497f1a6e10103d3a6881f18d04dbce3729c0e2c", + "sha256:f779d3ad205f108d14e99bb3859aa7dd8e9c68874617c72354d7ecaec2a054ac", + "sha256:f87f746ee241d30d6ed93969de31e5ffd09a2961a051e60ae6bddde9ec3583aa" + ], + "markers": "python_full_version >= '3.7.0'", + "version": "==3.2.0" + }, + "idna": { + "hashes": [ + "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", + "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2" + ], + "markers": "python_version >= '3.5'", + "version": "==3.4" + }, + "requests": { + "hashes": [ + "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f", + "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1" + ], + "index": "pypi", + "version": "==2.31.0" + }, + "soupsieve": { + "hashes": [ + "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8", + "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea" + ], + "markers": "python_version >= '3.7'", + "version": "==2.4.1" + }, + "urllib3": { + "hashes": [ + "sha256:8d22f86aae8ef5e410d4f539fde9ce6b2113a001bb4d189e0aed70642d602b11", + "sha256:de7df1803967d2c2a98e4b11bb7d6bd9210474c46e8a0401514e3a42a75ebde4" + ], + "markers": "python_version >= '3.7'", + "version": "==2.0.4" + } + }, + "develop": {} +} diff --git a/bf_helloworld.py b/bf_helloworld.py new file mode 100644 index 0000000..8d05559 --- /dev/null +++ b/bf_helloworld.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +import os,sys +from pprint import pprint +from bs4 import BeautifulSoup +import requests + +main_url = "https://www.mchk.org.hk/english/list_register/list.php?page=4&ipp=20&type=L" +req = requests.get(main_url) +soup = BeautifulSoup(req.text, "html.parser") + +title = soup.find("h1") +# print(title.get_text()) + +trs = 
soup.select("table#Table_5 table tr")
+for tr in trs[2:]:
+    tds = tr.select('td')
+
+    try:
+        # pprint(tr)
+
+        # pprint(tds)
+        # print()
+        # # reg #
+        # pprint(tds[0])
+        # # name
+        # pprint(tds[1])
+
+        # # pprint(tds[2])
+        # # pprint(tds[3])
+        # # pprint(tds[4])
+
+        # # Nature of qualification
+        (tds[5])
+
+        # # year
+        # pprint(tds[7])
+        # print()
+        # print()
+        # print()
+
+    except Exception as e:
+        print('error')
+        print(tr)
+        break
+    pass
+
+# tds = tr.select('td')
+# pprint(tds)
+# print()
+# print()
+# print()
+# break
+# # print(tds)
+
+# pprint('helloworld')
+
diff --git a/bf_helloworld_1-extract.py b/bf_helloworld_1-extract.py
new file mode 100644
index 0000000..8ee1c30
--- /dev/null
+++ b/bf_helloworld_1-extract.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+import os,sys
+from pprint import pprint
+from bs4 import BeautifulSoup
+import requests
+
+main_url = "https://www.mchk.org.hk/english/list_register/list.php?page=4&ipp=20&type=L"
+req = requests.get(main_url, timeout=30)
+soup = BeautifulSoup(req.text, "html.parser")
+
+title = soup.find("h1")
+
+trs = soup.select("table#Table_5 table tr")
+
+for tr in trs[2:]:
+    try:
+        # name
+        tds = tr.select('td[headers="reg"]')
+        if (len(tds) > 0):
+            # reg field got something
+            print(tds[0].get_text())
+
+            # # name
+            tds = tr.select('td[headers="name"]')
+            print(tds[0].get_text())
+
+            # # qualification
+            tds = tr.select('td[headers="qualification"]')
+            print(tds[0].get_text())
+            print(tds[1].get_text())
+            pass
+        else:
+            # reg field got nothing
+            tds = tr.select('td[headers="qualification"]')
+            for i in range(0, len(tds), 2):
+                pprint(f'{tds[i].get_text()}, {tds[i+1].get_text()}')
+                pass
+    except Exception as e:
+        print(tr)
+        print(e)
+        print('error')
+        pass
+
diff --git a/bf_helloworld_1.py b/bf_helloworld_1.py
new file mode 100644
index 0000000..80d4ecc
--- /dev/null
+++ b/bf_helloworld_1.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+"""Scrape the MCHK list of registered doctors into output.json."""
+
+import os,sys
+from pprint import pprint
+from bs4 import BeautifulSoup
+import requests
+import json
+
+LAST_PAGE = 767
+REQUEST_TIMEOUT = 30  # seconds; requests never times out by default
+
+# One [reg, name, date, [[cell, cell], ...]] entry per doctor, in page order.
+output = []
+
+
+def _cell_texts(tr, header):
+    """Return the text of each td in tr whose headers attribute is header."""
+    return [td.get_text() for td in tr.select(f'td[headers="{header}"]')]
+
+
+def _first(texts):
+    """Return the first text, or '' when the row has no such cell."""
+    return texts[0] if texts else ''
+
+
+def _pair_up(texts):
+    """Group texts two by two, padding an unpaired last one with ''."""
+    padded = texts + [''] * (len(texts) % 2)
+    return [padded[i:i + 2] for i in range(0, len(padded), 2)]
+
+
+def parse_tbl(page_no=1):
+    """Fetch one listing page and append its doctors to ``output``.
+
+    A row with a "reg" cell starts a new doctor; a row without one only
+    adds qualification pairs to the doctor appended last.  Raises
+    requests.RequestException when the page cannot be fetched.
+    """
+    main_url = f"https://www.mchk.org.hk/english/list_register/list.php?page={page_no}&ipp=20&type=L"
+    req = requests.get(main_url, timeout=REQUEST_TIMEOUT)
+    req.raise_for_status()
+    soup = BeautifulSoup(req.text, "html.parser")
+
+    # the first two rows are skipped (presumably table headings - confirm)
+    for tr in soup.select("table#Table_5 table tr")[2:]:
+        try:
+            reg = _cell_texts(tr, "reg")
+            qualifications = _cell_texts(tr, "qualification")
+
+            if reg:
+                # NOTE(review): "address" is read from the "date" cell
+                name = _first(_cell_texts(tr, "name"))
+                address = _first(_cell_texts(tr, "date"))
+                # as before, only the first two cells of this row are kept
+                output.append(
+                    [reg[0], name, address, _pair_up(qualifications[:2])]
+                )
+            elif output:
+                output[-1][3].extend(_pair_up(qualifications))
+            else:
+                print(f'page {page_no}: qualification row before any doctor')
+        except Exception as e:
+            print(tr)
+            print(f'error: {e!r}')
+
+
+def main():
+    """Parse every page, then write the collected rows to output.json."""
+    for page_no in range(1, LAST_PAGE + 1):
+        print(f'parsing {page_no}...')
+        try:
+            parse_tbl(page_no)
+        except Exception as e:
+            # keep going so one bad page does not lose the others
+            print(f'page {page_no} failed: {e!r}')
+
+    # 'w' truncates on open, so no 'a+' followed by truncate(0) is needed
+    with open('output.json', 'w', encoding='utf8') as fo:
+        json.dump(output, fo, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bf_helloworld_old.py b/bf_helloworld_old.py
new file mode 100644
index 0000000..1ecbe8b
--- /dev/null
+++ b/bf_helloworld_old.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+import os,sys
+from pprint import pprint
+from bs4 import BeautifulSoup
+import requests
+
+main_url = "https://www.mchk.org.hk/english/list_register/list.php?page=4&ipp=20&type=L"
+req = requests.get(main_url, timeout=30)
+soup = BeautifulSoup(req.text, "html.parser")
+
+title = soup.find("h1")
+# print(title.get_text())
+
+trs = soup.select("table#Table_5 table tr")
+for tr in trs[2:]:
+    try:
+        # name
+        tds = tr.select('td[headers="reg"]')
+        if (len(tds) > 0):
+            # reg field got something
+            print(tds[0].get_text())
+
+            # # name
+            tds = tr.select('td[headers="name"]')
+            print(tds[0].get_text())
+
+            # # qualification
+            tds = tr.select('td[headers="qualification"]')
+            print(tds[0].get_text())
+            print(tds[1].get_text())
+            pass
+        else:
+            # reg field got nothing
+
+            pass
+    except Exception as e:
+        print(tr)
+        print(e)
+        print('error')
+        pass
+# tds = tr.select('td')
+# pprint(tds)
+# print()
+# print()
+# print()
+# break
+# # print(tds)
+
+# pprint('helloworld')
+
diff --git a/docs/.gitkeep b/docs/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/docs/Screenshot from 2023-07-31 23-28-33.png b/docs/Screenshot from 2023-07-31 23-28-33.png
new file mode 100644
index 0000000..9a8e4a3
--- /dev/null
+++ b/docs/Screenshot from 2023-07-31 23-28-33.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84a2fc597d09d1f0aefdccbed74ba75b14d440a2087e071fdd3ebb58c96925d8
+size 119501
diff --git a/gitUpdate.bat b/gitUpdate.bat
new file mode 100644
index 0000000..8d93180
--- /dev/null
+++ b/gitUpdate.bat
@@ -0,0 +1,7 @@
+git status .
+
+@pause
+
+git add .
+git commit -m"update hkfatimah,"
+start git push
\ No newline at end of file
diff --git a/history.md b/history.md
new file mode 100644
index 0000000..bd6e724
--- /dev/null
+++ b/history.md
@@ -0,0 +1,5 @@
+python tutorial
+
+HKD 250/hr
+
+https://www.mchk.org.hk/english/list_register/list.php
diff --git a/journey.md b/journey.md
new file mode 100644
index 0000000..c5141e1
--- /dev/null
+++ b/journey.md
@@ -0,0 +1,15 @@
+- helloworld
+
+- https://www.mchk.org.hk/english/list_register/list.php?type=L
+
+- from page structure, get link:
+  - https://www.mchk.org.hk/english/list_register/list.php?page=1&ipp=20&type=L
+
+history:
+
+- bf_helloworld_old.py
+- bf_helloworld_1.py
+- bf_helloworld_1-extract.py
+- bf_helloworld.py
+
+- https://realpython.com/python-mutable-vs-immutable-types/
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..80d4ecc
--- /dev/null
+++ b/main.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+"""Scrape the MCHK list of registered doctors into output.json."""
+
+import os,sys
+from pprint import pprint
+from bs4 import BeautifulSoup
+import requests
+import json
+
+LAST_PAGE = 767
+REQUEST_TIMEOUT = 30  # seconds; requests never times out by default
+
+# One [reg, name, date, [[cell, cell], ...]] entry per doctor, in page order.
+output = []
+
+
+def _cell_texts(tr, header):
+    """Return the text of each td in tr whose headers attribute is header."""
+    return [td.get_text() for td in tr.select(f'td[headers="{header}"]')]
+
+
+def _first(texts):
+    """Return the first text, or '' when the row has no such cell."""
+    return texts[0] if texts else ''
+
+
+def _pair_up(texts):
+    """Group texts two by two, padding an unpaired last one with ''."""
+    padded = texts + [''] * (len(texts) % 2)
+    return [padded[i:i + 2] for i in range(0, len(padded), 2)]
+
+
+def parse_tbl(page_no=1):
+    """Fetch one listing page and append its doctors to ``output``.
+
+    A row with a "reg" cell starts a new doctor; a row without one only
+    adds qualification pairs to the doctor appended last.  Raises
+    requests.RequestException when the page cannot be fetched.
+    """
+    main_url = f"https://www.mchk.org.hk/english/list_register/list.php?page={page_no}&ipp=20&type=L"
+    req = requests.get(main_url, timeout=REQUEST_TIMEOUT)
+    req.raise_for_status()
+    soup = BeautifulSoup(req.text, "html.parser")
+
+    # the first two rows are skipped (presumably table headings - confirm)
+    for tr in soup.select("table#Table_5 table tr")[2:]:
+        try:
+            reg = _cell_texts(tr, "reg")
+            qualifications = _cell_texts(tr, "qualification")
+
+            if reg:
+                # NOTE(review): "address" is read from the "date" cell
+                name = _first(_cell_texts(tr, "name"))
+                address = _first(_cell_texts(tr, "date"))
+                # as before, only the first two cells of this row are kept
+                output.append(
+                    [reg[0], name, address, _pair_up(qualifications[:2])]
+                )
+            elif output:
+                output[-1][3].extend(_pair_up(qualifications))
+            else:
+                print(f'page {page_no}: qualification row before any doctor')
+        except Exception as e:
+            print(tr)
+            print(f'error: {e!r}')
+
+
+def main():
+    """Parse every page, then write the collected rows to output.json."""
+    for page_no in range(1, LAST_PAGE + 1):
+        print(f'parsing {page_no}...')
+        try:
+            parse_tbl(page_no)
+        except Exception as e:
+            # keep going so one bad page does not lose the others
+            print(f'page {page_no} failed: {e!r}')
+
+    # 'w' truncates on open, so no 'a+' followed by truncate(0) is needed
+    with open('output.json', 'w', encoding='utf8') as fo:
+        json.dump(output, fo, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..a3ce736
--- /dev/null
+++ b/package.json
@@ -0,0 +1,13 @@
+{
+  "name": "hkfatimah",
+  "version": "1.0.0",
+  "description": "",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1",
+    "gitUpdate": "git add . && git commit -m\"update hkfatimah,\""
+  },
+  "keywords": [],
+  "author": "",
+  "license": "ISC"
+}
\ No newline at end of file