update,

2025-01-31 19:51:04 +08:00
parent 4c9568fd60
commit 90bb565f91
17 changed files with 372 additions and 0 deletions
--- a/goodtastesmartie/task1-deal-broken/ocr/Process2.py
+++ b/goodtastesmartie/task1-deal-broken/ocr/Process2.py
@@ -0,0 +1,85 @@
+import re
+import argparse
+import fitz
+import pytesseract
+import cv2
+import numpy as np
+
+def _apply_validators(segment, validators):
+    """Run every (pattern, replacement) rule in ``validators`` over ``segment``."""
+    for patterns in validators.values():
+        for pattern, replacement in patterns:
+            segment = re.sub(pattern, replacement, segment, flags=re.IGNORECASE)
+    return segment
+
+
+def redact_text(text, validators, ignored):
+    """Return ``text`` with sensitive values masked.
+
+    Text matching any pattern in ``ignored`` is exempt: it is copied to the
+    result unchanged and the validators never see it. Everything else is
+    passed through the validator rules in dictionary order.
+
+    Args:
+        text: The string to redact.
+        validators: Mapping of a rule name to a list of
+            ``(regex pattern, replacement)`` pairs.
+        ignored: Iterable of regex patterns whose matches must be kept as-is.
+
+    Returns:
+        The redacted string. All patterns are matched case-insensitively.
+    """
+    # Locate the exempt spans on the *original* text, before any validator
+    # has had a chance to mask them. Empty matches protect nothing.
+    protected = sorted(
+        match.span()
+        for pattern in ignored
+        for match in re.finditer(pattern, text, flags=re.IGNORECASE)
+        if match.start() < match.end()
+    )
+
+    pieces = []
+    cursor = 0
+    for start, end in protected:
+        if end <= cursor:
+            # Entirely inside a span that has already been copied.
+            continue
+        if start > cursor:
+            # Redact the gap between the previous exempt span and this one.
+            pieces.append(_apply_validators(text[cursor:start], validators))
+            cursor = start
+        # Copy the exempt text verbatim (only the not-yet-copied part when
+        # two exempt spans overlap).
+        pieces.append(text[cursor:end])
+        cursor = end
+
+    # Redact whatever follows the last exempt span (or the whole text when
+    # nothing was exempt).
+    pieces.append(_apply_validators(text[cursor:], validators))
+    return "".join(pieces)
+
+
+def process_pdf(input_file, output_file, validators, ignored):
+    """OCR each page of a PDF, print its redacted text, and save the document.
+
+    NOTE(review): redaction is applied only to the OCR text that is printed.
+    No page content is modified here, so the file written to ``output_file``
+    still contains the original, unredacted pages.
+
+    Args:
+        input_file: Path of the PDF to open.
+        output_file: Path the document is saved to.
+        validators: Redaction rules, forwarded to ``redact_text``.
+        ignored: Exempt patterns, forwarded to ``redact_text``.
+    """
+    doc = fitz.open(input_file)
+    try:
+        for page_number, page in enumerate(doc, start=1):
+            # Render without an alpha channel so every pixel is a plain
+            # colour tuple.
+            pixmap = page.get_pixmap(alpha=False)
+
+            # View the raw sample bytes as a (height, width, channels) array.
+            # PyMuPDF already delivers the samples in RGB order, which is what
+            # pytesseract expects, so no BGR/RGB channel swap is applied.
+            image_rgb = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
+                pixmap.h, pixmap.w, pixmap.n
+            )
+
+            # Use Tesseract OCR to extract text from the image
+            extracted_text = pytesseract.image_to_string(image_rgb)
+
+            # Apply redaction on the extracted text
+            redacted_text = redact_text(extracted_text, validators, ignored)
+
+            # Print the extracted and redacted text for verification
+            print(f"Page {page_number} - Extracted Text:")
+            print(extracted_text)
+            print(f"Page {page_number} - Redacted Text:")
+            print(redacted_text)
+            print("-----")
+
+        doc.save(output_file)
+    finally:
+        # Release the document even when OCR or saving fails.
+        doc.close()
+
+    print("Redacted PDF saved successfully!")
+
+# Redaction rules: rule name -> list of (regex pattern, replacement) pairs.
+# Rules are applied in dictionary order, so the order below matters: the
+# second phone pattern matches almost any run of three or more digits, and
+# if it ran earlier it would consume the digits of credit card numbers and
+# e-mail addresses before their own rules could match. Keep it last.
+validators = {
+    "Hong Kong Identity Card Number": [
+        (r'\b[A-Z]\d{6}\([A-Z]\)', 'XXXXXXXXX'),
+        (r'\b[A-Z]\d{6}\(\d\)', 'XXXXXXXXX')
+    ],
+    "Credit Card Numbers": [
+        (r'\b[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}\b', 'XXXX-XXXX-XXXX-XXXX')
+    ],
+    "Email Addresses": [
+        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'XXXXX@XXXXX.XXX')
+    ],
+    "Phone Numbers": [
+        (r'\+\d{1,3}[-\s]?\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', '+XXX XXXX XXXX'),
+        (r'\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', 'XXXX XXXX')
+    ],
+}
+
+# Patterns passed to the redaction step as its "ignored" list
+# (all-zero and all-one 16-digit numbers).
+ignored = [
+    r'0000[-\s]?0000[-\s]?0000[-\s]?0000',
+    r'1111[-\s]?1111[-\s]?1111[-\s]?1111'
+]
+
+# Only parse arguments and run the redaction when executed as a script, so
+# that importing this module (e.g. to reuse redact_text) has no side effects.
+if __name__ == "__main__":
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description='PDF Redaction Script')
+    parser.add_argument('input_file', help='Path to the input PDF file')
+    parser.add_argument('output_file', help='Path to the output PDF file')
+    args = parser.parse_args()
+
+    # Call the process_pdf function with input and output file paths,
+    # validators, and ignored patterns
+    process_pdf(args.input_file, args.output_file, validators, ignored)