This commit is contained in:
louiscklaw
2025-01-31 19:51:04 +08:00
parent 4c9568fd60
commit 90bb565f91
17 changed files with 372 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
:: Helper script: review, stage, commit and push the changes in this folder.
:: Show the working-tree status for the current directory.
git status .
:: Wait for a key press so the status output can be read before continuing.
@pause
:: Stage everything under the current directory and commit with a fixed message.
git add .
git commit -m"update goodtastesmartie,"
:: Launch the push through `start`, i.e. as a separately started command.
start git push

31
goodtastesmartie/meta.md Normal file
View File

@@ -0,0 +1,31 @@
---
tags: python, contour-detection, credit-card, canny-detection
---
### task1 - deal broken
- deal broken as sensitive information
### samples
https://universe.roboflow.com/search?q=credit%20card%20images%3E300
No modification and i used to rewrite
so, to solidify the requirements.
Using Python,
what you want is to accept an input image containing a credit card
and return:
- the card number and
- the "valid thru" (expiry) date
from the given image?
### bottom line
- won't recognize or extract information from HKID card images
and this kind of image recognition is not guaranteed to be 100% accurate, as it is based on OCR

View File

@@ -0,0 +1,95 @@
import re
import argparse
import fitz
import pytesseract
import cv2
import numpy as np
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
def _split_protected(text, ignored):
    """Split *text* into ``(chunk, is_protected)`` pairs.

    Every stretch of *text* matched by one of the *ignored* regular
    expressions (case-insensitively) becomes a protected chunk; everything
    in between is unprotected. Concatenating the chunks in order yields
    *text* again.
    """
    spans = sorted(
        match.span()
        for pattern in (ignored or ())
        for match in re.finditer(pattern, text, flags=re.IGNORECASE)
        if match.end() > match.start()  # empty matches protect nothing
    )
    chunks = []
    cursor = 0
    for start, end in spans:
        if end <= cursor:
            continue  # already covered by an earlier, overlapping span
        start = max(start, cursor)
        chunks.append((text[cursor:start], False))
        chunks.append((text[start:end], True))
        cursor = end
    chunks.append((text[cursor:], False))
    return chunks


def redact_text(text, validators, ignored):
    """Return *text* with every validator pattern replaced by its mask.

    Args:
        text: The text to redact.
        validators: Mapping of a label to a list of ``(pattern, replacement)``
            pairs. Patterns are applied case-insensitively, in mapping order.
        ignored: Regular expressions describing text that must NOT be
            redacted (for example well-known dummy card numbers).

    Returns:
        The redacted text. Text matched by an *ignored* pattern is kept
        verbatim; previously it was deleted from the output instead of being
        excluded from redaction.

    The intermediate result is printed after every validator pattern, as a
    progress trace.
    """
    chunks = _split_protected(text, ignored)
    redacted_text = text
    for validator_type, patterns in validators.items():
        for pattern, replacement in patterns:
            # Only unprotected chunks are rewritten; ignored spans pass through.
            chunks = [
                (chunk, True) if protected
                else (re.sub(pattern, replacement, chunk, flags=re.IGNORECASE), False)
                for chunk, protected in chunks
            ]
            redacted_text = "".join(chunk for chunk, _ in chunks)
            print(f"{validator_type}: {redacted_text}")
    return redacted_text
# Render the pages of the input PDF to images, OCR them, run the OCR text
# through redact_text, round-trip the result through a scratch "temp.pdf"
# written with reportlab, insert the recovered text into the original page,
# and save the document to output_file.
# NOTE(review): indentation is lost in this view, so the nesting of the
# statements below (in particular c.showPage() and everything after the inner
# loop) cannot be determined here — the comments describe the statements, not
# their nesting. TODO confirm against the original file.
def process_pdf(input_file, output_file, validators, ignored):
doc = fitz.open(input_file)
for page_index, page in enumerate(doc):
# Rasterise the page and view its sample buffer as an (h, w, n) uint8 array.
pixmap = page.get_pixmap()
image_np = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
# Swap the first and third colour channels before OCR.
# NOTE(review): presumably the pixmap is already RGB, in which case this
# swap is unnecessary — confirm.
image_rgb = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
extracted_text = pytesseract.image_to_string(image_rgb)
redacted_text = redact_text(extracted_text, validators, ignored)
# Scratch PDF written to the relative path "temp.pdf", sized like the pixmap.
c = canvas.Canvas("temp.pdf", pagesize=letter)
c.setPageSize((pixmap.w, pixmap.h))
c.setFont("Helvetica", 12)
# For every non-blank redacted line: a full-width black bar 16 units high at
# y = pixmap.h - 16, with the line drawn over it in white.
for line in redacted_text.split("\n"):
if line.strip():
c.setFillColorRGB(0, 0, 0)
c.rect(0, pixmap.h - 16, pixmap.w, 16, fill=True)
c.setFillColorRGB(1, 1, 1)
c.drawString(6, pixmap.h - 11, line)
c.showPage()
c.save()
# Re-open the scratch PDF and read back the text of its first page only.
redacted_page = fitz.open("temp.pdf")
page_text = redacted_page[0].get_text()
if page_text:
# The recovered text is inserted at point (0, 0) of the original page.
# NOTE(review): nothing in this block removes or covers the page's original
# content, so the saved file presumably still contains the unredacted
# text — confirm.
page.insert_text((0, 0), page_text)
doc.save(output_file)
doc.close()
# NOTE(review): redacted_page is only bound once the loop body has run, and
# only one close() call is visible for the handles opened above — confirm.
redacted_page.close()
print("Redacted PDF saved successfully!")
# Define the validators and ignored patterns.
# Validators are applied in dictionary order, so the specific credit-card
# pattern is listed BEFORE the very permissive phone-number patterns; with the
# phone patterns first, a card number such as 1234-5678-9012-3456 was consumed
# piecemeal as "phone numbers" and never reached the card pattern.
validators = {
    "Hong Kong Identity Card Number": [
        (r'\b[A-Z]\d{6}\([A-Z]\)', 'XXXXXXXXX'),
        (r'\b[A-Z]\d{6}\(\d\)', 'XXXXXXXXX')
    ],
    "Credit Card Numbers": [
        (r'\b[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}\b', 'XXXX-XXXX-XXXX-XXXX')
    ],
    "Phone Numbers": [
        (r'\+\d{1,3}[-\s]?\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', '+XXX XXXX XXXX'),
        (r'\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', 'XXXX XXXX')
    ],
    "Email Addresses": [
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'XXXXX@XXXXX.XXX')
    ],
}
# Well-known dummy card numbers.
ignored = [
    r'0000[-\s]?0000[-\s]?0000[-\s]?0000',
    r'1111[-\s]?1111[-\s]?1111[-\s]?1111'
]

# Only parse arguments and run when executed as a script, so importing this
# module (e.g. to reuse the patterns) has no side effects.
if __name__ == "__main__":
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description='PDF Redaction Script')
    parser.add_argument('input_file', help='Path to the input PDF file')
    parser.add_argument('output_file', help='Path to the output PDF file')
    args = parser.parse_args()
    # Call the process_pdf function with input and output file paths, validators, and ignored patterns
    process_pdf(args.input_file, args.output_file, validators, ignored)

View File

@@ -0,0 +1,85 @@
import re
import argparse
import fitz
import pytesseract
import cv2
import numpy as np
def _split_protected(text, ignored):
    """Split *text* into ``(chunk, is_protected)`` pairs.

    Every stretch of *text* matched by one of the *ignored* regular
    expressions (case-insensitively) becomes a protected chunk; everything
    in between is unprotected. Concatenating the chunks in order yields
    *text* again.
    """
    spans = sorted(
        match.span()
        for pattern in (ignored or ())
        for match in re.finditer(pattern, text, flags=re.IGNORECASE)
        if match.end() > match.start()  # empty matches protect nothing
    )
    chunks = []
    cursor = 0
    for start, end in spans:
        if end <= cursor:
            continue  # already covered by an earlier, overlapping span
        start = max(start, cursor)
        chunks.append((text[cursor:start], False))
        chunks.append((text[start:end], True))
        cursor = end
    chunks.append((text[cursor:], False))
    return chunks


def redact_text(text, validators, ignored):
    """Return *text* with every validator pattern replaced by its mask.

    Args:
        text: The text to redact.
        validators: Mapping of a label to a list of ``(pattern, replacement)``
            pairs. Patterns are applied case-insensitively, in mapping order.
        ignored: Regular expressions describing text that must be excluded
            from redaction (for example well-known dummy card numbers).

    Returns:
        The redacted text. Text matched by an *ignored* pattern is kept
        verbatim; previously it was deleted from the output rather than
        excluded from redaction.
    """
    chunks = _split_protected(text, ignored)
    # Apply validators to redact sensitive information, leaving the
    # protected (ignored) chunks untouched.
    for patterns in validators.values():
        for pattern, replacement in patterns:
            chunks = [
                (chunk, True) if protected
                else (re.sub(pattern, replacement, chunk, flags=re.IGNORECASE), False)
                for chunk, protected in chunks
            ]
    return "".join(chunk for chunk, _ in chunks)
def process_pdf(input_file, output_file, validators, ignored):
    """OCR every page of a PDF, print the extracted and redacted text, and save a copy.

    Args:
        input_file: Path of the PDF to read.
        output_file: Path the document is saved to.
        validators: Passed through to ``redact_text``.
        ignored: Passed through to ``redact_text``.

    NOTE(review): the redacted text is only printed. No page is modified
    before ``doc.save``, so the file written to *output_file* carries the same
    content as the input, and the extracted (unredacted) text is echoed to
    stdout.
    """
    doc = fitz.open(input_file)
    try:
        for page_index, page in enumerate(doc):
            pixmap = page.get_pixmap()
            # View the pixmap's sample buffer as an (h, w, n) uint8 array.
            image_np = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                pixmap.h, pixmap.w, pixmap.n
            )
            # Swap the first and third colour channels before OCR.
            image_rgb = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
            # Use Tesseract OCR to extract text from the image.
            extracted_text = pytesseract.image_to_string(image_rgb)
            # Apply redaction on the extracted text.
            redacted_text = redact_text(extracted_text, validators, ignored)
            # Print the extracted and redacted text for verification.
            print(f"Page {page_index + 1} - Extracted Text:")
            print(extracted_text)
            print(f"Page {page_index + 1} - Redacted Text:")
            print(redacted_text)
            print("-----")
        doc.save(output_file)
    finally:
        # Release the document handle even when OCR or saving fails.
        doc.close()
    print("Redacted PDF saved successfully!")
# Define the validators and ignored patterns.
# Validators are applied in dictionary order, so the specific credit-card
# pattern is listed BEFORE the very permissive phone-number patterns; with the
# phone patterns first, a card number such as 1234-5678-9012-3456 was consumed
# piecemeal as "phone numbers" and never reached the card pattern.
validators = {
    "Hong Kong Identity Card Number": [
        (r'\b[A-Z]\d{6}\([A-Z]\)', 'XXXXXXXXX'),
        (r'\b[A-Z]\d{6}\(\d\)', 'XXXXXXXXX')
    ],
    "Credit Card Numbers": [
        (r'\b[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}\b', 'XXXX-XXXX-XXXX-XXXX')
    ],
    "Phone Numbers": [
        (r'\+\d{1,3}[-\s]?\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', '+XXX XXXX XXXX'),
        (r'\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', 'XXXX XXXX')
    ],
    "Email Addresses": [
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'XXXXX@XXXXX.XXX')
    ],
}
# Well-known dummy card numbers.
ignored = [
    r'0000[-\s]?0000[-\s]?0000[-\s]?0000',
    r'1111[-\s]?1111[-\s]?1111[-\s]?1111'
]

# Only parse arguments and run when executed as a script, so importing this
# module (e.g. to reuse the patterns) has no side effects.
if __name__ == "__main__":
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description='PDF Redaction Script')
    parser.add_argument('input_file', help='Path to the input PDF file')
    parser.add_argument('output_file', help='Path to the output PDF file')
    args = parser.parse_args()
    # Call the process_pdf function with input and output file paths, validators, and ignored patterns
    process_pdf(args.input_file, args.output_file, validators, ignored)

Binary file not shown.

Binary file not shown.

BIN
goodtastesmartie/task1-deal-broken/ocr/docs/sample.png (Stored with Git LFS) Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,107 @@
import sys
import fitz # PyMuPDF
from docx import Document
import cv2
import pytesseract
import re
def redact_sensitive_info(text, validators, ignored):
    """Return *text* with every validator match replaced by its mask.

    Args:
        text: The text to redact.
        validators: Mapping of a label to a list of ``(regex, replacement)``
            pairs, applied in mapping order.
        ignored: Regular expressions; a match whose text is also matched by
            any of these is left untouched.

    Returns:
        The redacted text.

    Only the spans the regex actually matched are replaced. The previous
    ``finditer`` + ``str.replace`` combination replaced every textual
    occurrence of a matched string, including occurrences the regex itself
    had rejected (for example because of a ``\\b`` boundary).
    """
    def _substituter(replacement):
        # Build the per-pattern callback used by re.sub.
        def _substitute(match):
            sensitive_info = match.group()
            if any(re.search(ignore_regex, sensitive_info) for ignore_regex in ignored):
                return sensitive_info  # explicitly whitelisted: keep as-is
            return replacement
        return _substitute

    redacted_text = text
    for regex_matches in validators.values():
        for regex, replacement in regex_matches:
            redacted_text = re.sub(regex, _substituter(replacement), redacted_text)
    return redacted_text
def save_as_pdf(doc, output_file):
    """Save *doc* to *output_file* and report the outcome on stdout.

    Any exception raised while saving is caught and printed rather than
    propagated; the function always returns ``None``.
    """
    # NOTE(review): this only calls doc.save(); whether the bytes written are
    # a real PDF depends entirely on the object passed in — confirm callers.
    try:
        doc.save(output_file)
        print("Redacted document saved as:", output_file)
    except Exception as save_error:
        print("Error saving PDF:", save_error)
def save_as_docx(doc, output_file):
    """Save *doc* to *output_file* and report the outcome on stdout.

    Any exception raised while saving is caught and printed rather than
    propagated; the function always returns ``None``.
    """
    try:
        doc.save(output_file)
        print("Redacted document saved as:", output_file)
    except Exception as save_error:
        print("Error saving DOCX:", save_error)
def process_pdf(input_file, output_file, validators, ignored):
    """Extract the text of a PDF, redact it, and save it as a new document.

    Args:
        input_file: Path of the PDF to read.
        output_file: Destination path; only names ending in ``.pdf`` or
            ``.docx`` are written, anything else is silently skipped.
        validators: Passed through to ``redact_sensitive_info``.
        ignored: Passed through to ``redact_sensitive_info``.

    The text comes from ``page.get_text()``; no OCR is performed here.
    """
    source = fitz.open(input_file)
    try:
        # Each page's text is followed by a newline, as before, but joined in
        # one pass instead of repeated string concatenation.
        output_text = "".join(page.get_text() + "\n" for page in source)
    finally:
        # The PDF handle used to be dropped without being closed.
        source.close()
    redacted_text = redact_sensitive_info(output_text, validators, ignored)
    doc = Document()
    doc.add_paragraph(redacted_text)
    if output_file.endswith('.pdf'):
        save_as_pdf(doc, output_file)
    elif output_file.endswith('.docx'):
        save_as_docx(doc, output_file)
def process_docx(input_file, output_file, validators, ignored):
    """Redact the paragraph text of a DOCX file and save it as a new document.

    The paragraphs of *input_file* are joined with newlines, passed through
    ``redact_sensitive_info`` and written as a single paragraph of a fresh
    document. Only output names ending in ``.pdf`` or ``.docx`` are saved.
    """
    source = Document(input_file)
    text = '\n'.join(paragraph.text for paragraph in source.paragraphs)
    redacted = redact_sensitive_info(text, validators, ignored)

    result = Document()
    result.add_paragraph(redacted)

    # The two suffixes are mutually exclusive, so the check order is irrelevant.
    if output_file.endswith('.docx'):
        save_as_docx(result, output_file)
    elif output_file.endswith('.pdf'):
        save_as_pdf(result, output_file)
def process_image(input_file, output_file, validators, ignored):
    """OCR an image, redact the recognised text, and save it as a document.

    Args:
        input_file: Path of the image to read.
        output_file: Destination path; only names ending in ``.pdf`` or
            ``.docx`` are written, anything else is silently skipped.
        validators: Passed through to ``redact_sensitive_info``.
        ignored: Passed through to ``redact_sensitive_info``.

    Raises:
        ValueError: If *input_file* cannot be read as an image.
    """
    image = cv2.imread(input_file)
    if image is None:
        # cv2.imread signals failure by returning None; without this check the
        # failure only surfaced as an opaque error inside cvtColor.
        raise ValueError(f"Could not read image file: {input_file}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Fixed-threshold binarisation (150) before handing the image to Tesseract.
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    text = pytesseract.image_to_string(binary)
    redacted_text = redact_sensitive_info(text, validators, ignored)
    doc = Document()
    doc.add_paragraph(redacted_text)
    if output_file.endswith('.pdf'):
        save_as_pdf(doc, output_file)
    elif output_file.endswith('.docx'):
        save_as_docx(doc, output_file)
def main():
    """Command-line entry point: ``<input_file> <output_file>``.

    Dispatches on the input file's extension (case-insensitively): ``.pdf``
    and ``.docx`` get their dedicated handlers, anything else is treated as
    an image. Exits with status 1 when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print("Usage: python script.py <input_file> <output_file>")
        sys.exit(1)
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    validators = {
        "Hong Kong Identity Card Number": [
            # One-letter prefix. The check-character class previously
            # contained a stray "(" and so also accepted "((" + ")".
            (r'\b[A-Z]\d{6}\([A-Z0-9]\)', 'XXXXXXXXX'),
            # Two-letter prefix; the check character may be a letter here
            # too, in line with the one-letter pattern.
            (r'\b[A-Z][A-Z]\d{6}\([A-Z0-9]\)', 'XXXXXXXXXX')
        ],
        "Credit card number": [
            (r'\b(?:5[1-5]\d{2}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}|2(?:2[2-7]\d{2}|7[3-9]\d{2})[- ]?\d{4}[- ]?\d{4}[- ]?\d{4})\b', 'XXXX-XXXX-XXXX-XXXX')
        ]
    }
    ignored = [
        r'0000([- ]?)0000([ -]?)0000([ -]?)0000',
        r'1111([- ]?)1111([ -]?)1111([ -]?)1111',
    ]
    # Compare extensions case-insensitively so "SCAN.PDF" is not sent to the
    # image handler.
    lowered = input_file.lower()
    if lowered.endswith('.pdf'):
        process_pdf(input_file, output_file, validators, ignored)
    elif lowered.endswith('.docx'):
        process_docx(input_file, output_file, validators, ignored)
    else:
        process_image(input_file, output_file, validators, ignored)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,38 @@
# OCR Sensitive Information Redaction
This project is a Python script for redacting sensitive information from documents using Optical Character Recognition (OCR).
It takes as input documents in various formats (PDF, DOCX, images) containing sensitive information such as credit card numbers and Hong Kong Identity Card numbers,
and redacts this information before saving the redacted document in the desired format.
## Installation
1. Copy `pythontransform.py` to your local machine.
2. Install the required Python libraries including `opencv-python`, `PyMuPDF` (for PDF processing), `python-docx` (for DOCX processing),
and `pytesseract` (for OCR).
## Usage
To run the script, use the following command:
python pythontransform.py <input_file> <output_file>
Replace `<input_file>` with the path to the input document you want to redact, and `<output_file>` with the desired path for the redacted document.
For example:
python pythontransform.py input_document.pdf redacted_document.docx
This will redact sensitive information from the input PDF file `input_document.pdf` and save the redacted document as `redacted_document.docx`.
## Supported Formats
The script supports input documents in the following formats:
- PDF
- DOCX
- Images (PNG, JPEG, etc.)
The redacted document is saved in DOCX format.