import sys import fitz # PyMuPDF from docx import Document import cv2 import pytesseract import re def redact_sensitive_info(text, validators, ignored): redacted_text = text for validator, regex_matches in validators.items(): for regex, replacement in regex_matches: matches = re.finditer(regex, redacted_text) for match in matches: sensitive_info = match.group() if not any(re.search(ignore_regex, sensitive_info) for ignore_regex in ignored): redacted_text = redacted_text.replace(sensitive_info, replacement) return redacted_text def save_as_pdf(doc, output_file): try: doc.save(output_file) print("Redacted document saved as:", output_file) except Exception as e: print("Error saving PDF:", e) def save_as_docx(doc, output_file): try: doc.save(output_file) print("Redacted document saved as:", output_file) except Exception as e: print("Error saving DOCX:", e) def process_pdf(input_file, output_file, validators, ignored): doc = fitz.open(input_file) output_text = "" for page_number in range(len(doc)): page = doc.load_page(page_number) text = page.get_text() output_text += text + "\n" # Concatenate text from all pages redacted_text = redact_sensitive_info(output_text, validators, ignored) doc = Document() doc.add_paragraph(redacted_text) if output_file.endswith('.pdf'): save_as_pdf(doc, output_file) elif output_file.endswith('.docx'): save_as_docx(doc, output_file) def process_docx(input_file, output_file, validators, ignored): doc = Document(input_file) paragraphs = [paragraph.text for paragraph in doc.paragraphs] text = '\n'.join(paragraphs) redacted_text = redact_sensitive_info(text, validators, ignored) doc = Document() doc.add_paragraph(redacted_text) if output_file.endswith('.pdf'): save_as_pdf(doc, output_file) elif output_file.endswith('.docx'): save_as_docx(doc, output_file) def process_image(input_file, output_file, validators, ignored): image = cv2.imread(input_file) gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY) text = pytesseract.image_to_string(binary) redacted_text = redact_sensitive_info(text, validators, ignored) doc = Document() doc.add_paragraph(redacted_text) if output_file.endswith('.pdf'): save_as_pdf(doc, output_file) elif output_file.endswith('.docx'): save_as_docx(doc, output_file) def main(): if len(sys.argv) != 3: print("Usage: python script.py ") sys.exit(1) input_file = sys.argv[1] output_file = sys.argv[2] validators = { "Hong Kong Identity Card Number": [ (r'\b[A-Z]\d{6}[\(][(A-Z0-9][\)]', 'XXXXXXXXX'), (r'\b[A-Z][A-Z]\d{6}\(\d\)', 'XXXXXXXXXX') ], "Credit card number": [ (r'\b(?:5[1-5]\d{2}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}|2(?:2[2-7]\d{2}|7[3-9]\d{2})[- ]?\d{4}[- ]?\d{4}[- ]?\d{4})\b', 'XXXX-XXXX-XXXX-XXXX') ] } ignored = [ r'0000([- ]?)0000([ -]?)0000([ -]?)0000', r'1111([- ]?)1111([ -]?)1111([ -]?)1111', ] if input_file.endswith('.pdf'): process_pdf(input_file, output_file, validators, ignored) elif input_file.endswith('.docx'): process_docx(input_file, output_file, validators, ignored) else: process_image(input_file, output_file, validators, ignored) if __name__ == "__main__": main()