update,
This commit is contained in:
95
goodtastesmartie/task1-deal-broken/ocr/Pdfredaction.py
Normal file
95
goodtastesmartie/task1-deal-broken/ocr/Pdfredaction.py
Normal file
@@ -0,0 +1,95 @@
|
||||
import re
|
||||
import argparse
|
||||
import fitz
|
||||
import pytesseract
|
||||
import cv2
|
||||
import numpy as np
|
||||
from reportlab.lib.pagesizes import letter
|
||||
from reportlab.pdfgen import canvas
|
||||
|
||||
|
||||
def _protected_spans(text, ignored):
    """Return sorted, merged (start, end) spans of *text* matched by any ignored pattern."""
    spans = sorted(
        match.span()
        for pattern in ignored
        for match in re.finditer(pattern, text, flags=re.IGNORECASE)
        if match.end() > match.start()  # zero-width matches protect nothing
    )
    merged = []
    for start, end in spans:
        if merged and start <= merged[-1][1]:
            # Overlapping or touching spans collapse into one protected region.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


def redact_text(text, validators, ignored):
    """Return *text* with every validator match replaced by its mask.

    Args:
        text: The text to redact.
        validators: Mapping of a label to a list of ``(pattern, replacement)``
            pairs. Patterns are applied case-insensitively, in mapping order.
        ignored: Regex patterns (case-insensitive) whose matches must be left
            untouched, e.g. well-known dummy card numbers.

    Returns:
        The redacted text. Spans matched by an ignored pattern are preserved
        verbatim; everything else is run through all validators.

    Side effects:
        Prints one summary line per validator label containing only the
        number of replacements, never the text itself, so sensitive data
        does not end up on stdout.
    """
    # Split the text into alternating [chunk, is_protected] pieces. Ignored
    # spans are located on the ORIGINAL text, before any validator has had a
    # chance to rewrite them.
    pieces = []
    cursor = 0
    for start, end in _protected_spans(text, ignored):
        pieces.append([text[cursor:start], False])
        pieces.append([text[start:end], True])
        cursor = end
    pieces.append([text[cursor:], False])

    for validator_type, patterns in validators.items():
        replaced = 0
        for pattern, replacement in patterns:
            for piece in pieces:
                if piece[1]:
                    continue  # protected span: never redacted
                piece[0], count = re.subn(
                    pattern, replacement, piece[0], flags=re.IGNORECASE
                )
                replaced += count
        print(f"{validator_type}: {replaced} match(es) redacted")

    return "".join(chunk for chunk, _ in pieces)
|
||||
|
||||
|
||||
def process_pdf(input_file, output_file, validators, ignored, *, dpi=300):
    """OCR each page of a PDF, redact the text, and write it to a new PDF.

    The output is built as a fresh document that contains only the redacted
    OCR text, one output page per input page with the same page size. Nothing
    from the source pages (text layer, images, metadata) is copied, so
    unredacted content cannot survive in the output file. The original page
    layout is not preserved.

    Args:
        input_file: Path of the PDF to read.
        output_file: Path the redacted PDF is written to.
        validators: Passed through to ``redact_text``.
        ignored: Passed through to ``redact_text``.
        dpi: Resolution used to rasterise pages for OCR. The PyMuPDF default
            of 72 dpi is too coarse for reliable recognition of digits.

    Raises:
        Whatever PyMuPDF / pytesseract raise for unreadable input, a missing
        Tesseract binary, or an unwritable output path (including saving a
        document that ended up with no pages).
    """
    default_font_size = 11
    line_factor = 1.4  # generous line-height estimate for Helvetica
    margin = 36  # half an inch, in PDF points

    zoom = dpi / 72  # PDF user space is 72 units per inch
    render_matrix = fitz.Matrix(zoom, zoom)

    source = fitz.open(input_file)
    output = fitz.open()  # new, empty PDF
    try:
        for page in source:
            # Force 3-channel RGB without alpha so the buffer reshapes
            # predictably. PyMuPDF samples are already in RGB order, which is
            # what pytesseract expects, so no channel swap is needed.
            pixmap = page.get_pixmap(
                matrix=render_matrix, colorspace=fitz.csRGB, alpha=False
            )
            image_rgb = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                pixmap.h, pixmap.w, pixmap.n
            )

            extracted_text = pytesseract.image_to_string(image_rgb)
            redacted_text = redact_text(extracted_text, validators, ignored)

            new_page = output.new_page(
                width=page.rect.width, height=page.rect.height
            )
            lines = [line for line in redacted_text.split("\n") if line.strip()]
            if not lines:
                continue  # nothing recognised: leave the output page blank

            # Shrink the font when a page has more lines than fit vertically.
            usable_height = page.rect.height - 2 * margin
            font_size = min(
                default_font_size,
                max(1, usable_height / (len(lines) * line_factor)),
            )
            # insert_text takes the baseline of the first line, so offset by
            # one font size to keep the first line on the page.
            new_page.insert_text(
                (margin, margin + font_size),
                lines,
                fontsize=font_size,
                fontname="helv",
            )

        output.save(output_file)
    finally:
        output.close()
        source.close()

    print("Redacted PDF saved successfully!")
|
||||
|
||||
|
||||
# Redaction rules: label -> list of (regex, replacement mask).
# Rules are applied in insertion order, so the specific formats (ID card,
# credit card, e-mail) come before the very permissive phone-number patterns.
# Otherwise the phone rule rewrites digits inside card numbers and e-mail
# local parts first, and the specific rules can no longer match them cleanly.
validators = {
    "Hong Kong Identity Card Number": [
        (r'\b[A-Z]\d{6}\([A-Z]\)', 'XXXXXXXXX'),
        (r'\b[A-Z]\d{6}\(\d\)', 'XXXXXXXXX'),
    ],
    "Credit Card Numbers": [
        (r'\b[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}\b', 'XXXX-XXXX-XXXX-XXXX'),
    ],
    "Email Addresses": [
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'XXXXX@XXXXX.XXX'),
    ],
    "Phone Numbers": [
        (r'\+\d{1,3}[-\s]?\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', '+XXX XXXX XXXX'),
        (r'\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', 'XXXX XXXX'),
    ],
}

# Patterns for placeholder values (dummy card numbers) handed to the
# redaction step as its "ignored" list.
ignored = [
    r'0000[-\s]?0000[-\s]?0000[-\s]?0000',
    r'1111[-\s]?1111[-\s]?1111[-\s]?1111',
]

# Only parse the command line and run when executed as a script, so that
# importing this module (e.g. to reuse the rules) has no side effects.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='PDF Redaction Script')
    parser.add_argument('input_file', help='Path to the input PDF file')
    parser.add_argument('output_file', help='Path to the output PDF file')
    args = parser.parse_args()

    # Run the redaction with the input/output paths and the rules above.
    process_pdf(args.input_file, args.output_file, validators, ignored)
|
Reference in New Issue
Block a user