update,
This commit is contained in:
7
goodtastesmartie/gitUpdate.bat
Normal file
7
goodtastesmartie/gitUpdate.bat
Normal file
@@ -0,0 +1,7 @@
|
||||
rem Show the pending changes in this folder before anything is staged.
git status .

rem Wait for a key press so the status output can be reviewed first.
@pause

rem Stage everything under this folder and commit it with the fixed message.
git add .
git commit -m"update goodtastesmartie,"

rem Run the push through "start" so it gets its own console window.
start git push
|
31
goodtastesmartie/meta.md
Normal file
31
goodtastesmartie/meta.md
Normal file
@@ -0,0 +1,31 @@
|
||||
---
|
||||
tags: python, contour-detection, credit-card, canny-detection
|
||||
---
|
||||
|
||||
### task1 - deal broken
|
||||
|
||||
- deal broken as sensitive information
|
||||
|
||||
### samples
|
||||
|
||||
https://universe.roboflow.com/search?q=credit%20card%20images%3E300
|
||||
|
||||
No modification and i used to rewrite
|
||||
|
||||
so, to solidify the requirements.
|
||||
|
||||
Using python,
|
||||
what you want is a script that accepts an input image containing a credit card
|
||||
|
||||
return the:
|
||||
|
||||
- card number and
|
||||
- "valid thru"(expiry date)
|
||||
|
||||
as read from the given image?
|
||||
|
||||
### bottom line
|
||||
|
||||
- won't recognize or extract information from HKID card image
|
||||
|
||||
Also, this kind of image recognition is not guaranteed to be 100% accurate, because it is based on OCR.
|
95
goodtastesmartie/task1-deal-broken/ocr/Pdfredaction.py
Normal file
95
goodtastesmartie/task1-deal-broken/ocr/Pdfredaction.py
Normal file
@@ -0,0 +1,95 @@
|
||||
import re
|
||||
import argparse
|
||||
import fitz
|
||||
import pytesseract
|
||||
import cv2
|
||||
import numpy as np
|
||||
from reportlab.lib.pagesizes import letter
|
||||
from reportlab.pdfgen import canvas
|
||||
|
||||
|
||||
def redact_text(text, validators, ignored):
    """Replace every validator match in *text*, leaving ignored values intact.

    Args:
        text: The text to scan.
        validators: Mapping of a label to a list of ``(pattern, replacement)``
            pairs. Patterns are applied in mapping order, case-insensitively.
        ignored: Patterns describing values that must survive redaction
            (for example well-known test card numbers).

    Returns:
        The redacted text. A match that overlaps an ignored value is left
        exactly as it was instead of being replaced or removed.
    """
    ignored_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in ignored]
    redacted_text = text

    for patterns in validators.values():
        for pattern, replacement in patterns:
            # Recomputed on every pass: earlier substitutions shift offsets.
            protected = [
                found.span()
                for ignored_pattern in ignored_patterns
                for found in ignored_pattern.finditer(redacted_text)
            ]

            def _substitute(match, replacement=replacement, protected=protected):
                start, end = match.span()
                if any(start < p_end and p_start < end for p_start, p_end in protected):
                    return match.group()
                # expand() keeps the template semantics of a string replacement.
                return match.expand(replacement)

            redacted_text = re.sub(pattern, _substitute, redacted_text, flags=re.IGNORECASE)

    return redacted_text
|
||||
|
||||
|
||||
def process_pdf(input_file, output_file, validators, ignored):
    """OCR each page of a PDF, redact the recognised text and save a copy.

    For every page the function renders a pixmap, runs Tesseract on it,
    passes the text through ``redact_text``, round-trips the redacted lines
    through a scratch ReportLab PDF and inserts the resulting text into the
    page with ``page.insert_text``. The document is then saved to
    *output_file*.

    Args:
        input_file: Path of the PDF to read.
        output_file: Path the processed PDF is written to.
        validators: Passed through to ``redact_text``.
        ignored: Passed through to ``redact_text``.

    NOTE(review): the original page content is never removed and the redacted
    text is inserted at (0, 0), so the saved file still appears to carry the
    unredacted content. True redaction would need PyMuPDF redaction
    annotations - confirm the intended output before relying on this script.
    """
    import os
    import tempfile

    doc = fitz.open(input_file)
    try:
        # A private scratch directory replaces the fixed "temp.pdf" in the
        # working directory, which could clobber a user file and was never
        # cleaned up.
        with tempfile.TemporaryDirectory() as scratch_dir:
            temp_path = os.path.join(scratch_dir, "temp.pdf")

            for page in doc:
                pixmap = page.get_pixmap(alpha=False)

                # PyMuPDF pixmap samples are already RGB, which is what
                # pytesseract expects, so no channel swap is applied.
                image_rgb = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                    pixmap.h, pixmap.w, pixmap.n
                )
                extracted_text = pytesseract.image_to_string(image_rgb)
                redacted_text = redact_text(extracted_text, validators, ignored)

                c = canvas.Canvas(temp_path, pagesize=letter)
                c.setPageSize((pixmap.w, pixmap.h))
                c.setFont("Helvetica", 12)
                # Every non-blank line is drawn into the same strip at the top
                # of the scratch page; only its extracted text is used below.
                for line in redacted_text.split("\n"):
                    if line.strip():
                        c.setFillColorRGB(0, 0, 0)
                        c.rect(0, pixmap.h - 16, pixmap.w, 16, fill=True)
                        c.setFillColorRGB(1, 1, 1)
                        c.drawString(6, pixmap.h - 11, line)
                c.showPage()
                c.save()

                # Close each scratch document right away; previously only the
                # one opened for the last page was ever closed.
                redacted_page = fitz.open(temp_path)
                try:
                    page_text = redacted_page[0].get_text()
                finally:
                    redacted_page.close()

                if page_text:
                    page.insert_text((0, 0), page_text)

        doc.save(output_file)
    finally:
        # Release the source document even when OCR or saving fails.
        doc.close()

    print("Redacted PDF saved successfully!")
|
||||
|
||||
|
||||
# Patterns that identify sensitive values, as (regex, replacement) pairs.
# Order matters because the validators run one after another on the same text:
# the loose phone pattern matches any short digit run, so it must come last or
# it consumes card numbers and the digits inside e-mail addresses before their
# own patterns get a chance to match.
validators = {
    "Hong Kong Identity Card Number": [
        (r'\b[A-Z]\d{6}\([A-Z]\)', 'XXXXXXXXX'),
        (r'\b[A-Z]\d{6}\(\d\)', 'XXXXXXXXX'),
    ],
    "Credit Card Numbers": [
        (r'\b[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}\b', 'XXXX-XXXX-XXXX-XXXX'),
    ],
    "Email Addresses": [
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'XXXXX@XXXXX.XXX'),
    ],
    "Phone Numbers": [
        (r'\+\d{1,3}[-\s]?\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', '+XXX XXXX XXXX'),
        (r'\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', 'XXXX XXXX'),
    ],
}

# Placeholder card numbers handed to the redaction routine as "ignored".
ignored = [
    r'0000[-\s]?0000[-\s]?0000[-\s]?0000',
    r'1111[-\s]?1111[-\s]?1111[-\s]?1111',
]


def main():
    """Parse the command line and process the requested PDF."""
    parser = argparse.ArgumentParser(description='PDF Redaction Script')
    parser.add_argument('input_file', help='Path to the input PDF file')
    parser.add_argument('output_file', help='Path to the output PDF file')
    args = parser.parse_args()

    process_pdf(args.input_file, args.output_file, validators, ignored)


# Guarded so that importing this module no longer parses sys.argv.
if __name__ == "__main__":
    main()
|
85
goodtastesmartie/task1-deal-broken/ocr/Process2.py
Normal file
85
goodtastesmartie/task1-deal-broken/ocr/Process2.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import re
|
||||
import argparse
|
||||
import fitz
|
||||
import pytesseract
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
def redact_text(text, validators, ignored):
    """Redact validator matches in *text* while sparing ignored values.

    Args:
        text: The text to scan.
        validators: Mapping of a label to a list of ``(pattern, replacement)``
            pairs, applied in mapping order and case-insensitively.
        ignored: Patterns for values that are excluded from redaction.

    Returns:
        The text with each validator match replaced, except for matches that
        overlap a value described by *ignored*; those are kept unchanged
        rather than deleted.
    """
    compiled_ignored = [re.compile(pattern, re.IGNORECASE) for pattern in ignored]

    def _protected_spans(current_text):
        # Offsets of every ignored value in the text as it currently stands.
        return [
            hit.span()
            for ignored_pattern in compiled_ignored
            for hit in ignored_pattern.finditer(current_text)
        ]

    redacted_text = text
    for patterns in validators.values():
        for pattern, replacement in patterns:
            spans = _protected_spans(redacted_text)

            def _replace(match, replacement=replacement, spans=spans):
                begin, finish = match.span()
                for span_begin, span_finish in spans:
                    if begin < span_finish and span_begin < finish:
                        return match.group()
                # Same template handling as passing the string to re.sub.
                return match.expand(replacement)

            redacted_text = re.sub(pattern, _replace, redacted_text, flags=re.IGNORECASE)

    return redacted_text
|
||||
|
||||
|
||||
def _find_sensitive_values(text, validators, ignored):
    """Return the distinct validator matches in *text* not covered by *ignored*."""
    protected = [
        hit.span()
        for pattern in ignored
        for hit in re.finditer(pattern, text, flags=re.IGNORECASE)
    ]
    values = []
    for patterns in validators.values():
        for pattern, _replacement in patterns:
            for match in re.finditer(pattern, text, flags=re.IGNORECASE):
                begin, finish = match.span()
                if any(begin < p_finish and p_begin < finish for p_begin, p_finish in protected):
                    continue
                # Collapse whitespace so the value can be used as a search needle.
                value = " ".join(match.group().split())
                if value and value not in values:
                    values.append(value)
    return values


def process_pdf(input_file, output_file, validators, ignored):
    """OCR every page, report the redaction and black out matches in the PDF.

    Each page is rendered and passed to Tesseract; the extracted and redacted
    text are printed for verification. Values matched by *validators* (and not
    excluded by *ignored*) are then searched for on the page and covered with
    PyMuPDF redaction annotations before the document is saved. Previously the
    redacted text was only printed and the saved file was an unmodified copy.

    Args:
        input_file: Path of the PDF to read.
        output_file: Path the redacted PDF is written to.
        validators: Mapping of a label to ``(pattern, replacement)`` pairs.
        ignored: Patterns for values that are excluded from redaction.

    NOTE(review): ``page.search_for`` works on the page's text layer, so values
    that exist only as pixels in a scanned image are reported but cannot be
    located and stay visible - confirm whether scanned input must be supported.
    """
    doc = fitz.open(input_file)
    try:
        for page_index, page in enumerate(doc):
            pixmap = page.get_pixmap(alpha=False)

            # PyMuPDF pixmap samples are already RGB, which is what
            # pytesseract expects, so no channel swap is applied.
            image_rgb = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                pixmap.h, pixmap.w, pixmap.n
            )

            # Use Tesseract OCR to extract text from the image
            extracted_text = pytesseract.image_to_string(image_rgb)

            # Apply redaction on the extracted text
            redacted_text = redact_text(extracted_text, validators, ignored)

            # Print the extracted and redacted text for verification
            print(f"Page {page_index + 1} - Extracted Text:")
            print(extracted_text)
            print(f"Page {page_index + 1} - Redacted Text:")
            print(redacted_text)
            print("-----")

            # Look for sensitive values in both the OCR result and the text
            # layer, since either may spell a value slightly differently.
            candidates = _find_sensitive_values(extracted_text, validators, ignored)
            for value in _find_sensitive_values(page.get_text(), validators, ignored):
                if value not in candidates:
                    candidates.append(value)

            marked = 0
            for value in candidates:
                for rect in page.search_for(value):
                    page.add_redact_annot(rect, fill=(0, 0, 0))
                    marked += 1
            if marked:
                # Removes the covered content from the page.
                page.apply_redactions()

        doc.save(output_file)
    finally:
        # Release the document even when OCR or saving fails.
        doc.close()

    print("Redacted PDF saved successfully!")
|
||||
|
||||
# Sensitive-value patterns as (regex, replacement) pairs, grouped by label.
# The validators are applied sequentially to the same text, so the permissive
# phone pattern (it matches any short run of digits) is listed last; placed
# earlier it rewrites card numbers and digits inside e-mail addresses before
# the dedicated patterns can match them.
validators = {
    "Hong Kong Identity Card Number": [
        (r'\b[A-Z]\d{6}\([A-Z]\)', 'XXXXXXXXX'),
        (r'\b[A-Z]\d{6}\(\d\)', 'XXXXXXXXX'),
    ],
    "Credit Card Numbers": [
        (r'\b[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}\b', 'XXXX-XXXX-XXXX-XXXX'),
    ],
    "Email Addresses": [
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'XXXXX@XXXXX.XXX'),
    ],
    "Phone Numbers": [
        (r'\+\d{1,3}[-\s]?\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', '+XXX XXXX XXXX'),
        (r'\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', 'XXXX XXXX'),
    ],
}

# Placeholder card numbers that are excluded from redaction.
ignored = [
    r'0000[-\s]?0000[-\s]?0000[-\s]?0000',
    r'1111[-\s]?1111[-\s]?1111[-\s]?1111',
]


def main():
    """Read the input and output paths from the command line and run the job."""
    parser = argparse.ArgumentParser(description='PDF Redaction Script')
    parser.add_argument('input_file', help='Path to the input PDF file')
    parser.add_argument('output_file', help='Path to the output PDF file')
    args = parser.parse_args()

    # Call the process_pdf function with input and output file paths,
    # validators, and ignored patterns
    process_pdf(args.input_file, args.output_file, validators, ignored)


# Guarded so that importing this module does not parse sys.argv.
if __name__ == "__main__":
    main()
|
BIN
goodtastesmartie/task1-deal-broken/ocr/Screenshot 2024-03-17 at 8.18.21 PM.png
(Stored with Git LFS)
Normal file
BIN
goodtastesmartie/task1-deal-broken/ocr/Screenshot 2024-03-17 at 8.18.21 PM.png
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
goodtastesmartie/task1-deal-broken/ocr/credit_card_PNG204.png
(Stored with Git LFS)
Normal file
BIN
goodtastesmartie/task1-deal-broken/ocr/credit_card_PNG204.png
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
goodtastesmartie/task1-deal-broken/ocr/docs/sample.png
(Stored with Git LFS)
Normal file
BIN
goodtastesmartie/task1-deal-broken/ocr/docs/sample.png
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
goodtastesmartie/task1-deal-broken/ocr/document.pdf
Normal file
BIN
goodtastesmartie/task1-deal-broken/ocr/document.pdf
Normal file
Binary file not shown.
BIN
goodtastesmartie/task1-deal-broken/ocr/documenttest1.docx
Normal file
BIN
goodtastesmartie/task1-deal-broken/ocr/documenttest1.docx
Normal file
Binary file not shown.
BIN
goodtastesmartie/task1-deal-broken/ocr/documenttest1result.docx
Normal file
BIN
goodtastesmartie/task1-deal-broken/ocr/documenttest1result.docx
Normal file
Binary file not shown.
BIN
goodtastesmartie/task1-deal-broken/ocr/pdftest.pdf
Normal file
BIN
goodtastesmartie/task1-deal-broken/ocr/pdftest.pdf
Normal file
Binary file not shown.
107
goodtastesmartie/task1-deal-broken/ocr/pythontransform.py
Normal file
107
goodtastesmartie/task1-deal-broken/ocr/pythontransform.py
Normal file
@@ -0,0 +1,107 @@
|
||||
import sys
|
||||
import fitz # PyMuPDF
|
||||
from docx import Document
|
||||
import cv2
|
||||
import pytesseract
|
||||
import re
|
||||
|
||||
def redact_sensitive_info(text, validators, ignored):
    """Replace validator matches in *text* unless they contain an ignored value.

    Args:
        text: The text to scan.
        validators: Mapping of a label to a list of ``(regex, replacement)``
            pairs, applied in mapping order.
        ignored: Regexes; a match in which any of them is found is left as is.

    Returns:
        The redacted text. Only the exact spans matched by a validator are
        replaced. The previous global ``str.replace`` also rewrote the same
        characters wherever else they occurred, even where the regex (for
        example its ``\\b`` anchors) did not match.
    """
    def _is_ignored(value):
        return any(re.search(ignore_regex, value) for ignore_regex in ignored)

    redacted_text = text
    for regex_matches in validators.values():
        for regex, replacement in regex_matches:

            def _substitute(match, replacement=replacement):
                sensitive_info = match.group()
                # The replacement is inserted literally, as str.replace did.
                return sensitive_info if _is_ignored(sensitive_info) else replacement

            redacted_text = re.sub(regex, _substitute, redacted_text)

    return redacted_text
|
||||
|
||||
|
||||
def _write_paragraphs_as_pdf(paragraphs, output_file):
    """Lay the text of *paragraphs* out on A4 pages and save it as a PDF."""
    import textwrap

    font_size = 11
    margin = 36
    line_height = font_size * 1.4

    pdf = fitz.open()  # new, empty PDF
    try:
        page = pdf.new_page()
        baseline = margin + font_size
        for paragraph in paragraphs:
            for source_line in paragraph.text.split("\n"):
                # An empty source line still advances the baseline.
                for line in textwrap.wrap(source_line, width=90) or [""]:
                    if baseline > page.rect.height - margin:
                        page = pdf.new_page()
                        baseline = margin + font_size
                    if line:
                        page.insert_text((margin, baseline), line, fontsize=font_size)
                    baseline += line_height
        pdf.save(output_file)
    finally:
        pdf.close()


def save_as_pdf(doc, output_file):
    """Save *doc* to *output_file* as a PDF and report the outcome on stdout.

    Args:
        doc: Either an object exposing ``paragraphs`` (a python-docx document,
            which is what the ``process_*`` functions pass in) or any other
            object with a ``save(path)`` method.
        output_file: Destination path.

    A python-docx document can only serialise itself as DOCX, so calling its
    ``save`` produced a DOCX payload under a ``.pdf`` name. Such documents are
    now rendered into a real PDF; other objects are saved as before.

    NOTE(review): the text is drawn with PyMuPDF's default font - presumably
    non-Latin text will not render correctly; verify if that matters.
    """
    try:
        if hasattr(doc, "paragraphs"):
            _write_paragraphs_as_pdf(doc.paragraphs, output_file)
        else:
            doc.save(output_file)
        print("Redacted document saved as:", output_file)
    except Exception as e:
        # Best-effort by design: report the failure instead of raising.
        print("Error saving PDF:", e)
|
||||
|
||||
def save_as_docx(doc, output_file):
    """Write *doc* to *output_file* via ``doc.save`` and report the result.

    Failures are not raised; the error is printed instead.
    """
    try:
        doc.save(output_file)
    except Exception as error:
        print("Error saving DOCX:", error)
    else:
        print("Redacted document saved as:", output_file)
|
||||
|
||||
def process_pdf(input_file, output_file, validators, ignored):
    """Redact the text layer of a PDF and write it out as a new document.

    The text of every page is extracted with PyMuPDF, concatenated (one
    trailing newline per page), redacted with ``redact_sensitive_info`` and
    stored as a single paragraph in a fresh python-docx document, which is
    then saved according to the extension of *output_file*.

    Args:
        input_file: Path of the PDF to read.
        output_file: Destination path ending in ``.pdf`` or ``.docx``
            (compared case-insensitively).
        validators: Passed through to ``redact_sensitive_info``.
        ignored: Passed through to ``redact_sensitive_info``.
    """
    source = fitz.open(input_file)
    try:
        page_texts = [page.get_text() for page in source]
    finally:
        # The PDF handle was previously never closed.
        source.close()

    # Concatenate text from all pages
    output_text = "".join(page_text + "\n" for page_text in page_texts)
    redacted_text = redact_sensitive_info(output_text, validators, ignored)

    doc = Document()
    doc.add_paragraph(redacted_text)

    target = output_file.lower()
    if target.endswith('.pdf'):
        save_as_pdf(doc, output_file)
    elif target.endswith('.docx'):
        save_as_docx(doc, output_file)
    else:
        # Previously this case wrote nothing and said nothing.
        print("Unsupported output format (expected .pdf or .docx):", output_file)
|
||||
|
||||
def process_docx(input_file, output_file, validators, ignored):
    """Redact the paragraph text of a DOCX file and write a new document.

    The text of all paragraphs is joined with newlines, redacted with
    ``redact_sensitive_info`` and stored as a single paragraph in a fresh
    python-docx document, which is saved according to the extension of
    *output_file*. Only ``doc.paragraphs`` is read from the input.

    Args:
        input_file: Path of the DOCX file to read.
        output_file: Destination path ending in ``.pdf`` or ``.docx``
            (compared case-insensitively).
        validators: Passed through to ``redact_sensitive_info``.
        ignored: Passed through to ``redact_sensitive_info``.
    """
    source = Document(input_file)
    text = '\n'.join(paragraph.text for paragraph in source.paragraphs)
    redacted_text = redact_sensitive_info(text, validators, ignored)

    doc = Document()
    doc.add_paragraph(redacted_text)

    target = output_file.lower()
    if target.endswith('.pdf'):
        save_as_pdf(doc, output_file)
    elif target.endswith('.docx'):
        save_as_docx(doc, output_file)
    else:
        # Previously this case wrote nothing and said nothing.
        print("Unsupported output format (expected .pdf or .docx):", output_file)
|
||||
|
||||
def process_image(input_file, output_file, validators, ignored):
    """OCR an image, redact the recognised text and write it as a document.

    The image is converted to grayscale, binarised with a fixed threshold of
    150, passed to Tesseract, redacted with ``redact_sensitive_info`` and
    stored as a single paragraph in a fresh python-docx document, which is
    saved according to the extension of *output_file*.

    Args:
        input_file: Path of the image to read.
        output_file: Destination path ending in ``.pdf`` or ``.docx``
            (compared case-insensitively).
        validators: Passed through to ``redact_sensitive_info``.
        ignored: Passed through to ``redact_sensitive_info``.

    Raises:
        ValueError: If OpenCV cannot read *input_file* as an image.
    """
    image = cv2.imread(input_file)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"Could not read image file: {input_file}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    text = pytesseract.image_to_string(binary)
    redacted_text = redact_sensitive_info(text, validators, ignored)

    # NOTE(review): OCR output can contain control characters (Tesseract
    # commonly appends a form feed) that python-docx may reject as not XML
    # compatible; everything except tab, newline and carriage return is dropped.
    redacted_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', redacted_text)

    doc = Document()
    doc.add_paragraph(redacted_text)

    target = output_file.lower()
    if target.endswith('.pdf'):
        save_as_pdf(doc, output_file)
    elif target.endswith('.docx'):
        save_as_docx(doc, output_file)
    else:
        # Previously this case wrote nothing and said nothing.
        print("Unsupported output format (expected .pdf or .docx):", output_file)
|
||||
|
||||
# Sensitive-value patterns as (regex, replacement) pairs, grouped by label.
VALIDATORS = {
    "Hong Kong Identity Card Number": [
        # One-letter prefix, six digits, check character in parentheses.
        # The stray "(" inside the check-character class has been removed.
        (r'\b[A-Z]\d{6}\([A-Z0-9]\)', 'XXXXXXXXX'),
        # Two-letter prefix; a letter check character is now accepted here
        # too, matching the one-letter pattern.
        (r'\b[A-Z]{2}\d{6}\([A-Z0-9]\)', 'XXXXXXXXXX'),
    ],
    "Credit card number": [
        # Any 16-digit number in 4-4-4-4 grouping. The earlier pattern only
        # covered 51-55 prefixes, and its 2-series branch required a five
        # digit first group, so it could not match a 16-digit number.
        (r'\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b', 'XXXX-XXXX-XXXX-XXXX'),
    ],
}

# Placeholder card numbers that are never redacted.
IGNORED = [
    r'0000([- ]?)0000([ -]?)0000([ -]?)0000',
    r'1111([- ]?)1111([ -]?)1111([ -]?)1111',
]


def main():
    """Redact the file named on the command line.

    Expects exactly two arguments, the input and the output path. The input
    is dispatched on its extension (compared case-insensitively): ``.pdf`` and
    ``.docx`` have dedicated handlers, anything else is treated as an image.
    Exits with status 1 after printing a usage line when the argument count
    is wrong.
    """
    if len(sys.argv) != 3:
        print("Usage: python script.py <input_file> <output_file>")
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]

    # Lower-cased only for the comparison, so "SCAN.PDF" is not sent to the
    # image handler.
    source_kind = input_file.lower()
    if source_kind.endswith('.pdf'):
        process_pdf(input_file, output_file, VALIDATORS, IGNORED)
    elif source_kind.endswith('.docx'):
        process_docx(input_file, output_file, VALIDATORS, IGNORED)
    else:
        process_image(input_file, output_file, VALIDATORS, IGNORED)


if __name__ == "__main__":
    main()
|
38
goodtastesmartie/task1-deal-broken/ocr/readme.txt
Normal file
38
goodtastesmartie/task1-deal-broken/ocr/readme.txt
Normal file
@@ -0,0 +1,38 @@
|
||||
|
||||
# OCR Sensitive Information Redaction
|
||||
|
||||
This project is a Python script for redacting sensitive information from documents using Optical Character Recognition (OCR).
|
||||
It takes as input documents in various formats (PDF, DOCX, images) containing sensitive information such as credit card numbers and Hong Kong Identity Card numbers,
|
||||
and redacts this information before saving the redacted document in the desired format.
|
||||
|
||||
## Installation
|
||||
|
||||
1. Copy `pythontransform.py` to your local machine.
|
||||
|
||||
2. Install the required Python libraries including `opencv-python`, `PyMuPDF` (for PDF processing), `python-docx` (for DOCX processing),
|
||||
and `pytesseract` (for OCR).
|
||||
|
||||
## Usage
|
||||
|
||||
To run the script, use the following command:
|
||||
|
||||
python pythontransform.py <input_file> <output_file>
|
||||
|
||||
Replace `<input_file>` with the path to the input document you want to redact, and `<output_file>` with the desired path for the redacted document.
|
||||
|
||||
For example:
|
||||
|
||||
python pythontransform.py input_document.pdf redacted_document.docx
|
||||
|
||||
This will redact sensitive information from the input PDF file `input_document.pdf` and save the redacted document as `redacted_document.docx`.
|
||||
|
||||
## Supported Formats
|
||||
|
||||
The script supports input documents in the following formats:
|
||||
- PDF
|
||||
- DOCX
|
||||
- Images (PNG, JPEG, etc.)
|
||||
|
||||
The redacted document is saved in DOCX format.
|
||||
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
goodtastesmartie/task1-deal-broken/ocr/result/pdftextresult.docx
Normal file
BIN
goodtastesmartie/task1-deal-broken/ocr/result/pdftextresult.docx
Normal file
Binary file not shown.
Reference in New Issue
Block a user