Files
004_comission/goodtastesmartie/task1-deal-broken/ocr/pythontransform.py
louiscklaw 90bb565f91 update,
2025-01-31 19:51:04 +08:00

108 lines
3.6 KiB
Python

import sys
import fitz # PyMuPDF
from docx import Document
import cv2
import pytesseract
import re
def redact_sensitive_info(text, validators, ignored):
    """Return *text* with every sensitive match replaced by its placeholder.

    Args:
        text: The text to scan.
        validators: Mapping of a descriptive label to a list of
            ``(regex, replacement)`` pairs. Only the pairs are used; the
            label is informational.
        ignored: Iterable of regex patterns. A match for which any of these
            patterns is found (via ``re.search`` on the matched text) is
            left untouched.

    Returns:
        The redacted text.
    """
    # Compile the ignore patterns once instead of once per match.
    ignore_patterns = [re.compile(pattern) for pattern in ignored]

    def _is_ignored(candidate):
        return any(pattern.search(candidate) for pattern in ignore_patterns)

    redacted_text = text
    for regex_matches in validators.values():
        for regex, replacement in regex_matches:
            # A callback (rather than str.replace on the matched text) makes
            # sure only the actual regex matches are rewritten; identical
            # substrings that the pattern itself rejects (e.g. because of a
            # word boundary) stay as they are. Returning the replacement from
            # a function also keeps it literal (no backslash expansion).
            def _substitute(match, replacement=replacement):
                found = match.group()
                return found if _is_ignored(found) else replacement

            redacted_text = re.sub(regex, _substitute, redacted_text)
    return redacted_text
def save_as_pdf(doc, output_file):
    """Save *doc* to *output_file* and report the outcome on stdout.

    Args:
        doc: Any object exposing ``save(path)``.
        output_file: Destination path handed to ``doc.save``.

    Any exception raised while saving is caught and printed instead of
    being propagated to the caller.
    """
    try:
        doc.save(output_file)
        print("Redacted document saved as:", output_file)
    except Exception as err:
        # Best-effort: report the failure and carry on.
        print("Error saving PDF:", err)
def save_as_docx(doc, output_file):
    """Save *doc* to *output_file* and report the outcome on stdout.

    Args:
        doc: Any object exposing ``save(path)``.
        output_file: Destination path handed to ``doc.save``.

    Any exception raised while saving is caught and printed instead of
    being propagated to the caller.
    """
    try:
        doc.save(output_file)
        print("Redacted document saved as:", output_file)
    except Exception as err:
        # Best-effort: report the failure and carry on.
        print("Error saving DOCX:", err)
def process_pdf(input_file, output_file, validators, ignored):
    """Extract the text of a PDF, redact it, and save it as a new document.

    Args:
        input_file: Path of the PDF to read.
        output_file: Destination path. Only names ending in ``.pdf`` or
            ``.docx`` are written; for anything else nothing is saved.
        validators: Passed through to ``redact_sensitive_info``.
        ignored: Passed through to ``redact_sensitive_info``.
    """
    # The context manager closes the source PDF as soon as its text has been
    # read; previously the handle was simply dropped without being closed.
    with fitz.open(input_file) as source_pdf:
        page_texts = [page.get_text() for page in source_pdf]

    # Every page's text is followed by a newline, pages in document order.
    output_text = "".join(page_text + "\n" for page_text in page_texts)
    redacted_text = redact_sensitive_info(output_text, validators, ignored)

    # The redacted text goes into a fresh python-docx document as a single
    # paragraph.
    out_doc = Document()
    out_doc.add_paragraph(redacted_text)
    if output_file.endswith('.pdf'):
        # NOTE(review): out_doc is a python-docx Document; confirm that
        # saving it under a '.pdf' name yields a usable PDF.
        save_as_pdf(out_doc, output_file)
    elif output_file.endswith('.docx'):
        save_as_docx(out_doc, output_file)
def process_docx(input_file, output_file, validators, ignored):
    """Read a DOCX file, redact its paragraph text, and save a new document.

    Args:
        input_file: Path of the DOCX file to read.
        output_file: Destination path. Only names ending in ``.pdf`` or
            ``.docx`` are written; for anything else nothing is saved.
        validators: Passed through to ``redact_sensitive_info``.
        ignored: Passed through to ``redact_sensitive_info``.
    """
    source = Document(input_file)
    # Paragraph texts are joined with newlines into one string for redaction.
    joined = '\n'.join(para.text for para in source.paragraphs)
    cleaned = redact_sensitive_info(joined, validators, ignored)

    # The redacted text becomes a single paragraph of a fresh document.
    result = Document()
    result.add_paragraph(cleaned)

    # The first matching suffix decides which saver is used.
    for suffix, saver in (('.pdf', save_as_pdf), ('.docx', save_as_docx)):
        if output_file.endswith(suffix):
            saver(result, output_file)
            break
def process_image(input_file, output_file, validators, ignored):
    """OCR an image, redact the recognized text, and save it as a document.

    The image is converted to grayscale and binarized with a fixed threshold
    of 150 before being handed to pytesseract.

    Args:
        input_file: Path of the image to read.
        output_file: Destination path. Only names ending in ``.pdf`` or
            ``.docx`` are written; for anything else nothing is saved.
        validators: Passed through to ``redact_sensitive_info``.
        ignored: Passed through to ``redact_sensitive_info``.

    Raises:
        ValueError: If OpenCV cannot load ``input_file``.
    """
    image = cv2.imread(input_file)
    # cv2.imread reports a missing or undecodable file by returning None
    # rather than raising; fail here with a clear message instead of an
    # opaque error from cvtColor further down.
    if image is None:
        raise ValueError("Could not read image file: {}".format(input_file))

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    text = pytesseract.image_to_string(binary)
    redacted_text = redact_sensitive_info(text, validators, ignored)

    # The redacted text becomes a single paragraph of a fresh document.
    out_doc = Document()
    out_doc.add_paragraph(redacted_text)
    if output_file.endswith('.pdf'):
        # NOTE(review): out_doc is a python-docx Document; confirm that
        # saving it under a '.pdf' name yields a usable PDF.
        save_as_pdf(out_doc, output_file)
    elif output_file.endswith('.docx'):
        save_as_docx(out_doc, output_file)
def main():
    """Command-line entry point: redact an input file into an output file.

    Expects exactly two command-line arguments, ``<input_file>`` and
    ``<output_file>``. Inputs ending in ``.pdf`` or ``.docx`` (any letter
    case) are handled by the matching processor; every other input is
    treated as an image and OCR'd. Exits with status 1 on a usage error or
    an unsupported output extension.
    """
    if len(sys.argv) != 3:
        print("Usage: python script.py <input_file> <output_file>")
        sys.exit(1)
    input_file, output_file = sys.argv[1:]

    # The processors only write files whose name ends in '.pdf' or '.docx'
    # (case-sensitive); reject anything else up front instead of doing all
    # the work and then silently producing no output.
    if not output_file.endswith(('.pdf', '.docx')):
        print("Unsupported output format (expected .pdf or .docx):", output_file)
        sys.exit(1)

    validators = {
        "Hong Kong Identity Card Number": [
            # One- or two-letter prefix, six digits, check character in
            # parentheses. The check character may be a digit or a letter
            # ('A' occurs in practice) for both prefix lengths.
            (r'\b[A-Z]\d{6}\([A-Z0-9]\)', 'XXXXXXXXX'),
            (r'\b[A-Z][A-Z]\d{6}\([A-Z0-9]\)', 'XXXXXXXXXX'),
        ],
        "Credit card number": [
            (r'\b(?:5[1-5]\d{2}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}|2(?:2[2-7]\d{2}|7[3-9]\d{2})[- ]?\d{4}[- ]?\d{4}[- ]?\d{4})\b', 'XXXX-XXXX-XXXX-XXXX'),
        ],
    }
    # Matches that also match one of these patterns are left unredacted.
    ignored = [
        r'0000([- ]?)0000([ -]?)0000([ -]?)0000',
        r'1111([- ]?)1111([ -]?)1111([ -]?)1111',
    ]

    # Compare the extension case-insensitively so that e.g. 'SCAN.PDF' is
    # not mistaken for an image and sent down the OCR path.
    lowered_name = input_file.lower()
    if lowered_name.endswith('.pdf'):
        process_pdf(input_file, output_file, validators, ignored)
    elif lowered_name.endswith('.docx'):
        process_docx(input_file, output_file, validators, ignored)
    else:
        process_image(input_file, output_file, validators, ignored)


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()