"""Redact sensitive data from a PDF.

Each page is rasterised and OCR-ed, the recognised text is masked with the
regular expressions in ``validators``, and the page is rewritten so that it
contains only the masked text.
"""

import argparse
import logging
import re

import cv2  # noqa: F401  (kept from the original import block; unused below)
import fitz
import numpy as np
import pytesseract
from reportlab.lib.pagesizes import letter  # noqa: F401  (kept; unused below)
from reportlab.pdfgen import canvas  # noqa: F401  (kept; unused below)

logger = logging.getLogger(__name__)

# Layout of the masked text written back onto each page (PDF points).
FONT_SIZE = 12
TEXT_MARGIN = 6


def redact_text(text, validators, ignored):
    """Return *text* with every validator match replaced by its mask.

    Args:
        text: The text to redact.
        validators: Mapping of a category name to a list of
            ``(regex, replacement)`` pairs. Categories are applied in the
            mapping's iteration order, so more specific patterns must come
            before broader ones.
        ignored: Iterable of regexes whose matches are removed from the
            text after all validators have run.

    Returns:
        The redacted text.

    All patterns are matched case-insensitively.
    """
    redacted_text = text
    for validator_type, patterns in validators.items():
        hits = 0
        for pattern, replacement in patterns:
            redacted_text, count = re.subn(
                pattern, replacement, redacted_text, flags=re.IGNORECASE
            )
            hits += count
        # Log counts only: the intermediate text still holds data that
        # later validators have not masked yet, so it must not be echoed.
        logger.debug("%s: %d match(es) masked", validator_type, hits)

    # NOTE(review): these patterns run after the validators, so a value made
    # only of digits has normally been masked already and will not match
    # here. Confirm whether "ignored" was meant to exempt values from
    # redaction rather than delete them.
    for pattern in ignored:
        redacted_text, count = re.subn(
            pattern, '', redacted_text, flags=re.IGNORECASE
        )
        logger.debug("Ignored pattern removed %d match(es)", count)
    return redacted_text


def _ocr_page(page):
    """Rasterise *page* and return the text Tesseract recognises on it."""
    pixmap = page.get_pixmap()
    # Pixmap samples are already in RGB order by default, which is what
    # pytesseract expects, so no channel swap is applied.
    image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
        pixmap.h, pixmap.w, pixmap.n
    )
    return pytesseract.image_to_string(image)


def process_pdf(input_file, output_file, validators, ignored):
    """Write a redacted copy of *input_file* to *output_file*.

    For every page the OCR text is masked with :func:`redact_text`, the
    original page content is wiped, and the masked text is written in its
    place.

    Args:
        input_file: Path of the PDF to read.
        output_file: Path of the PDF to write.
        validators: Passed through to :func:`redact_text`.
        ignored: Passed through to :func:`redact_text`.
    """
    doc = fitz.open(input_file)
    try:
        for page in doc:
            redacted_text = redact_text(_ocr_page(page), validators, ignored)
            page_text = "\n".join(
                line for line in redacted_text.splitlines() if line.strip()
            )

            # Wipe the original content first; otherwise the unredacted
            # text and images would remain in the saved file.
            page.add_redact_annot(page.rect, fill=(1, 1, 1))
            page.apply_redactions()

            if page_text:
                # insert_text() anchors the baseline of the first line, so
                # start one font height below the margin to keep it visible.
                # NOTE(review): the text is not wrapped or paginated; long
                # OCR output can run past the page edge.
                page.insert_text(
                    (TEXT_MARGIN, TEXT_MARGIN + FONT_SIZE),
                    page_text,
                    fontsize=FONT_SIZE,
                )
        doc.save(output_file)
    finally:
        # Release the document even when OCR or saving fails.
        doc.close()
    print("Redacted PDF saved successfully!")


# Validators are applied in order. The specific formats (identity card,
# credit card, e-mail) come before the phone patterns because the generic
# phone pattern matches any run of three or more digits and would otherwise
# mangle those values before their own patterns could match.
validators = {
    "Hong Kong Identity Card Number": [
        (r'\b[A-Z]\d{6}\([A-Z]\)', 'XXXXXXXXX'),
        (r'\b[A-Z]\d{6}\(\d\)', 'XXXXXXXXX'),
    ],
    "Credit Card Numbers": [
        (r'\b[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}\b', 'XXXX-XXXX-XXXX-XXXX'),
    ],
    "Email Addresses": [
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
         'XXXXX@XXXXX.XXX'),
    ],
    "Phone Numbers": [
        (r'\+\d{1,3}[-\s]?\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}',
         '+XXX XXXX XXXX'),
        (r'\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', 'XXXX XXXX'),
    ],
}

# Patterns whose matches redact_text() removes after masking.
ignored = [
    r'0000[-\s]?0000[-\s]?0000[-\s]?0000',
    r'1111[-\s]?1111[-\s]?1111[-\s]?1111',
]


def main():
    """Parse the command line and redact the requested PDF."""
    parser = argparse.ArgumentParser(description='PDF Redaction Script')
    parser.add_argument('input_file', help='Path to the input PDF file')
    parser.add_argument('output_file', help='Path to the output PDF file')
    args = parser.parse_args()
    process_pdf(args.input_file, args.output_file, validators, ignored)


if __name__ == "__main__":
    main()