108 lines
3.6 KiB
Python
108 lines
3.6 KiB
Python
import sys
|
|
import fitz # PyMuPDF
|
|
from docx import Document
|
|
import cv2
|
|
import pytesseract
|
|
import re
|
|
|
|
def redact_sensitive_info(text, validators, ignored):
    """Replace sensitive matches in *text* with their configured masks.

    Args:
        text: The text to scan.
        validators: Mapping of a descriptive label to a list of
            ``(regex, replacement)`` pairs. Only the pairs are used; the
            label is informational.
        ignored: Iterable of regexes. A match is left untouched when any of
            these patterns is found inside the matched string.

    Returns:
        A copy of *text* in which every non-ignored match has been replaced
        by the replacement paired with the regex that found it.
    """
    # Materialise once: the patterns are consulted for every match.
    ignore_patterns = tuple(ignored)

    def is_ignored(candidate):
        return any(re.search(pattern, candidate) for pattern in ignore_patterns)

    redacted_text = text
    for regex_matches in validators.values():
        for regex, replacement in regex_matches:

            def mask(match, replacement=replacement):
                found = match.group()
                return found if is_ignored(found) else replacement

            # Substitute at the matched spans only. A plain str.replace() of
            # the matched string would also rewrite identical substrings
            # that the regex itself rejected (e.g. ones failing ``\b``).
            # Returning the mask from a callback keeps it literal, so
            # backslashes in it are not treated as template escapes.
            redacted_text = re.sub(regex, mask, redacted_text)
    return redacted_text
|
|
|
|
|
|
def _render_text_pdf(text, fontsize=10, margin=50):
    """Lay plain *text* out on as many PDF pages as needed; return the PDF."""
    pdf = fitz.open()  # called without arguments: a new, empty PDF
    # Create the first page up front so that empty text still produces a
    # document with at least one page.
    page = pdf.new_page()
    # Courier is monospaced with a 0.6 em advance width, so a fixed
    # character budget per line keeps the text inside the margins.
    # NOTE(review): the built-in Courier font targets Latin text; characters
    # outside its range may not render - supply a Unicode font if needed.
    chars_per_line = max(1, int((page.rect.width - 2 * margin) / (0.6 * fontsize)))
    line_height = 1.2 * fontsize
    bottom = page.rect.height - margin

    y = margin + fontsize  # baseline of the first line on a page
    for raw_line in text.splitlines():
        raw_line = raw_line.expandtabs()
        # Hard-wrap long lines; an empty source line still takes one row.
        pieces = [
            raw_line[start:start + chars_per_line]
            for start in range(0, len(raw_line), chars_per_line)
        ] or [""]
        for piece in pieces:
            if y > bottom:
                page = pdf.new_page()
                y = margin + fontsize
            if piece:
                page.insert_text((margin, y), piece, fontsize=fontsize, fontname="cour")
            y += line_height
    return pdf


def save_as_pdf(doc, output_file):
    """Save *doc* to *output_file* as a PDF and report the outcome on stdout.

    Args:
        doc: Either an object whose ``save(path)`` already writes a PDF, or
            a python-docx style document exposing ``paragraphs``. The latter
            is rendered to a new PDF first, because calling its own
            ``save()`` would write DOCX data under a ``.pdf`` name.
        output_file: Destination path.

    Any exception raised while rendering or saving is caught and printed
    rather than propagated.
    """
    try:
        if hasattr(doc, "paragraphs"):
            text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
            pdf = _render_text_pdf(text)
            try:
                pdf.save(output_file)
            finally:
                pdf.close()
        else:
            doc.save(output_file)
        print("Redacted document saved as:", output_file)
    except Exception as e:
        print("Error saving PDF:", e)
|
|
|
|
def save_as_docx(doc, output_file):
    """Write *doc* to *output_file* and report the outcome on stdout.

    Args:
        doc: Object providing a ``save(path)`` method.
        output_file: Destination path handed to ``doc.save``.

    Failures are reported with a printed message instead of being raised.
    """
    try:
        doc.save(output_file)
        # Still inside the try: a failure while reporting success is
        # handled the same way as a failed save.
        print("Redacted document saved as:", output_file)
    except Exception as save_error:
        print("Error saving DOCX:", save_error)
|
|
|
|
def process_pdf(input_file, output_file, validators, ignored):
    """Extract the text of a PDF, redact it, and write the result.

    Args:
        input_file: Path of the PDF to read.
        output_file: Destination path; its extension (``.pdf`` or ``.docx``,
            compared case-insensitively) selects the save routine.
        validators: Passed through to ``redact_sensitive_info``.
        ignored: Passed through to ``redact_sensitive_info``.

    The output is a new document holding the redacted text in a single
    paragraph. An unsupported output extension is reported on stdout
    instead of being skipped silently.
    """
    # The context manager releases the input file even if extraction fails.
    with fitz.open(input_file) as pdf:
        # Each page's text is followed by a newline, including the last one.
        extracted = "".join(page.get_text() + "\n" for page in pdf)

    redacted_text = redact_sensitive_info(extracted, validators, ignored)

    redacted_doc = Document()
    redacted_doc.add_paragraph(redacted_text)

    target = output_file.lower()
    if target.endswith('.pdf'):
        save_as_pdf(redacted_doc, output_file)
    elif target.endswith('.docx'):
        save_as_docx(redacted_doc, output_file)
    else:
        print("Unsupported output format (expected .pdf or .docx):", output_file)
|
|
|
|
def process_docx(input_file, output_file, validators, ignored):
    """Read a DOCX file, redact its paragraph text, and write the result.

    Args:
        input_file: Path of the DOCX to read.
        output_file: Destination path; its extension (``.pdf`` or ``.docx``,
            compared case-insensitively) selects the save routine.
        validators: Passed through to ``redact_sensitive_info``.
        ignored: Passed through to ``redact_sensitive_info``.

    Only ``Document.paragraphs`` is read; text stored elsewhere in the file
    (for example in tables) is presumably not included - verify if that
    matters. The output is a new document holding the redacted text in a
    single paragraph. An unsupported output extension is reported on stdout
    instead of being skipped silently.
    """
    source = Document(input_file)
    text = '\n'.join(paragraph.text for paragraph in source.paragraphs)

    redacted_text = redact_sensitive_info(text, validators, ignored)

    redacted_doc = Document()
    redacted_doc.add_paragraph(redacted_text)

    target = output_file.lower()
    if target.endswith('.pdf'):
        save_as_pdf(redacted_doc, output_file)
    elif target.endswith('.docx'):
        save_as_docx(redacted_doc, output_file)
    else:
        print("Unsupported output format (expected .pdf or .docx):", output_file)
|
|
|
|
def process_image(input_file, output_file, validators, ignored):
    """OCR an image, redact the recognised text, and write the result.

    Args:
        input_file: Path of the image to read.
        output_file: Destination path; its extension (``.pdf`` or ``.docx``,
            compared case-insensitively) selects the save routine.
        validators: Passed through to ``redact_sensitive_info``.
        ignored: Passed through to ``redact_sensitive_info``.

    Raises:
        ValueError: If the image cannot be read.

    The output is a new document holding the redacted text in a single
    paragraph. An unsupported output extension is reported on stdout
    instead of being skipped silently.
    """
    image = cv2.imread(input_file)
    if image is None:
        # imread signals an unreadable file by returning None; without this
        # check the failure surfaces as an opaque error inside cvtColor.
        raise ValueError("Could not read image file: {}".format(input_file))

    # Grayscale, then a fixed threshold at 150, before handing to the OCR.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    text = pytesseract.image_to_string(binary)

    redacted_text = redact_sensitive_info(text, validators, ignored)

    redacted_doc = Document()
    redacted_doc.add_paragraph(redacted_text)

    target = output_file.lower()
    if target.endswith('.pdf'):
        save_as_pdf(redacted_doc, output_file)
    elif target.endswith('.docx'):
        save_as_docx(redacted_doc, output_file)
    else:
        print("Unsupported output format (expected .pdf or .docx):", output_file)
|
|
|
|
def main():
    """Command-line entry point: redact one input file into one output file.

    Expects exactly two arguments, ``<input_file> <output_file>``; otherwise
    prints a usage line and exits with status 1. The input extension
    (compared case-insensitively) selects the handler: ``.pdf`` and
    ``.docx`` have dedicated handlers and every other file is treated as an
    image.
    """
    if len(sys.argv) != 3:
        print("Usage: python script.py <input_file> <output_file>")
        sys.exit(1)

    input_file, output_file = sys.argv[1], sys.argv[2]

    validators = {
        "Hong Kong Identity Card Number": [
            # One- and two-letter prefixes, six digits, then a check
            # character in parentheses. The check-character class no longer
            # contains a stray "(", and the two-letter form accepts the same
            # check characters as the one-letter form.
            (r'\b[A-Z]\d{6}\([A-Z0-9]\)', 'XXXXXXXXX'),
            (r'\b[A-Z][A-Z]\d{6}\([A-Z0-9]\)', 'XXXXXXXXXX'),
        ],
        "Credit card number": [
            # 16 digits in four groups, optionally separated by "-" or " ".
            # First group: 51xx-55xx, or 2221-2720. The earlier 2-series
            # branch required a five-digit first group and so could never
            # match a 16-digit number.
            (
                r'\b(?:5[1-5]\d{2}|2(?:22[1-9]|2[3-9]\d|[3-6]\d{2}|7[01]\d|720))'
                r'[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b',
                'XXXX-XXXX-XXXX-XXXX',
            ),
        ],
    }

    # Matches containing any of these patterns are left unredacted.
    ignored = [
        r'0000([- ]?)0000([ -]?)0000([ -]?)0000',
        r'1111([- ]?)1111([ -]?)1111([ -]?)1111',
    ]

    source_name = input_file.lower()
    if source_name.endswith('.pdf'):
        process_pdf(input_file, output_file, validators, ignored)
    elif source_name.endswith('.docx'):
        process_docx(input_file, output_file, validators, ignored)
    else:
        process_image(input_file, output_file, validators, ignored)
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|