update,

2025-01-31 19:51:04 +08:00
parent 4c9568fd60
commit 90bb565f91
17 changed files with 372 additions and 0 deletions
--- a/goodtastesmartie/gitUpdate.bat
+++ b/goodtastesmartie/gitUpdate.bat
@@ -0,0 +1,7 @@
+:: Show the pending changes under the current directory for review.
+git status .
+
+:: Wait for a key press before anything is staged or committed
+:: (the leading @ keeps the command itself from being echoed).
+@pause
+
+:: Stage everything under the current directory and commit it with a fixed message.
+git add .
+git commit -m"update goodtastesmartie,"
+:: "start" launches the push as a separately started process.
+start git push
--- a/goodtastesmartie/meta.md
+++ b/goodtastesmartie/meta.md
@@ -0,0 +1,31 @@
+---
+tags: python, contour-detection, credit-card, canny-detection
+---
+
+### task1 - deal broken
+
+- deal broken as sensitive information
+
+### samples
+
+https://universe.roboflow.com/search?q=credit%20card%20images%3E300
+
+No modification and i used to rewrite
+
+so, to solidify the requirements.
+
+Using python,
+what you want is to accept an image input containing a credit card
+
+return the:
+
+- card number and
+- "valid thru"(expiry date)
+
+from the given image?
+
+### bottom line
+
+- won't recognize or extract information from HKID card images
+
+Also, this kind of image recognition is not guaranteed to be 100% accurate, as it is based on OCR.
--- a/goodtastesmartie/task1-deal-broken/ocr/Pdfredaction.py
+++ b/goodtastesmartie/task1-deal-broken/ocr/Pdfredaction.py
@@ -0,0 +1,95 @@
+import re
+import argparse
+import fitz
+import pytesseract
+import cv2
+import numpy as np
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+
+
+def redact_text(text, validators, ignored):
+    """Mask sensitive values in ``text`` while leaving ignored values intact.
+
+    Args:
+        text: Text to scan.
+        validators: Mapping of a category name to a list of
+            ``(pattern, replacement)`` pairs. Patterns are applied in mapping
+            order and case-insensitively.
+        ignored: Regex patterns describing values that must not be masked.
+
+    Returns:
+        The text with every validator match replaced by its replacement,
+        except matches that overlap a value matched by an ignored pattern.
+    """
+    redacted_text = text
+
+    for patterns in validators.values():
+        for pattern, replacement in patterns:
+            # Locate the ignored values on the current text: earlier
+            # substitutions may have shifted their offsets.
+            protected = [
+                found.span()
+                for ignore_pattern in ignored
+                for found in re.finditer(ignore_pattern, redacted_text, flags=re.IGNORECASE)
+            ]
+
+            def mask(match, replacement=replacement, protected=protected):
+                start, end = match.span()
+                if any(start < stop and begin < end for begin, stop in protected):
+                    # Overlaps an ignored value: keep the original text.
+                    return match.group()
+                # expand() keeps re.sub's template semantics for the replacement.
+                return match.expand(replacement)
+
+            redacted_text = re.sub(pattern, mask, redacted_text, flags=re.IGNORECASE)
+
+    return redacted_text
+
+
+def process_pdf(input_file, output_file, validators, ignored):
+    """OCR every page of a PDF, redact the recognised text and save a copy.
+
+    For each page the rendered pixmap is passed to Tesseract, the recognised
+    text is run through ``redact_text``, written to the scratch file
+    ``temp.pdf`` with reportlab, and the text of the first scratch page is
+    inserted into the source page at ``(0, 0)``.
+
+    Args:
+        input_file: Path of the PDF to read.
+        output_file: Path the processed PDF is saved to.
+        validators: Redaction rules forwarded to ``redact_text``.
+        ignored: Ignore patterns forwarded to ``redact_text``.
+    """
+    doc = fitz.open(input_file)
+    try:
+        for page in doc:
+            pixmap = page.get_pixmap()
+
+            # View the raw sample buffer as a height x width x channels array.
+            image_np = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
+            image_rgb = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
+            extracted_text = pytesseract.image_to_string(image_rgb)
+            redacted_text = redact_text(extracted_text, validators, ignored)
+
+            c = canvas.Canvas("temp.pdf", pagesize=letter)
+            c.setPageSize((pixmap.w, pixmap.h))
+
+            c.setFont("Helvetica", 12)
+            for line in redacted_text.split("\n"):
+                if line.strip():
+                    # White text on a black bar along the top edge of the page.
+                    c.setFillColorRGB(0, 0, 0)
+                    c.rect(0, pixmap.h - 16, pixmap.w, 16, fill=True)
+                    c.setFillColorRGB(1, 1, 1)
+                    c.drawString(6, pixmap.h - 11, line)
+
+                # NOTE(review): showPage() runs once per line, so every line
+                # lands on its own scratch page and only the first one is read
+                # back below - confirm this is intended.
+                c.showPage()
+
+            c.save()
+
+            # Close the scratch document on every iteration; the original
+            # closed it once after the loop, leaking handles and raising
+            # NameError for a document without pages.
+            redacted_page = fitz.open("temp.pdf")
+            try:
+                page_text = redacted_page[0].get_text()
+            finally:
+                redacted_page.close()
+
+            # NOTE(review): this adds text to the page; nothing visible here
+            # removes the original page content - confirm against requirements.
+            if page_text:
+                page.insert_text((0, 0), page_text)
+
+        doc.save(output_file)
+    finally:
+        doc.close()
+
+    print("Redacted PDF saved successfully!")
+
+
+# Redaction rules: category -> list of (regex, replacement mask).
+# The rules are applied in mapping order. The generic phone pattern matches
+# almost any run of three or more digits, so the more specific card-number and
+# e-mail rules are listed before it; otherwise it rewrites their digits first
+# and those rules can no longer match.
+validators = {
+    "Hong Kong Identity Card Number": [
+        (r'\b[A-Z]\d{6}\([A-Z]\)', 'XXXXXXXXX'),
+        (r'\b[A-Z]\d{6}\(\d\)', 'XXXXXXXXX')
+    ],
+    "Credit Card Numbers": [
+        (r'\b[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}\b', 'XXXX-XXXX-XXXX-XXXX')
+    ],
+    "Email Addresses": [
+        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'XXXXX@XXXXX.XXX')
+    ],
+    "Phone Numbers": [
+        (r'\+\d{1,3}[-\s]?\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', '+XXX XXXX XXXX'),
+        (r'\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', 'XXXX XXXX')
+    ],
+}
+
+# Patterns handed to process_pdf as "ignored" values
+# (all-zero and all-one 16-digit numbers).
+ignored = [
+    r'0000[-\s]?0000[-\s]?0000[-\s]?0000',
+    r'1111[-\s]?1111[-\s]?1111[-\s]?1111'
+]
+
+# Only parse arguments and process a file when run as a script, so the module
+# can be imported without side effects.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='PDF Redaction Script')
+    parser.add_argument('input_file', help='Path to the input PDF file')
+    parser.add_argument('output_file', help='Path to the output PDF file')
+    args = parser.parse_args()
+
+    # Call the process_pdf function with input and output file paths, validators, and ignored patterns
+    process_pdf(args.input_file, args.output_file, validators, ignored)
--- a/goodtastesmartie/task1-deal-broken/ocr/Process2.py
+++ b/goodtastesmartie/task1-deal-broken/ocr/Process2.py
@@ -0,0 +1,85 @@
+import re
+import argparse
+import fitz
+import pytesseract
+import cv2
+import numpy as np
+
+def redact_text(text, validators, ignored):
+    """Mask sensitive values in ``text`` while leaving ignored values intact.
+
+    Args:
+        text: Text to scan.
+        validators: Mapping of a category name to a list of
+            ``(pattern, replacement)`` pairs. Patterns are applied in mapping
+            order and case-insensitively.
+        ignored: Regex patterns describing values that must not be masked.
+
+    Returns:
+        The text with every validator match replaced by its replacement,
+        except matches that overlap a value matched by an ignored pattern.
+    """
+    redacted_text = text
+
+    # Apply validators to redact sensitive information
+    for patterns in validators.values():
+        for pattern, replacement in patterns:
+            # Exclude ignored values from redaction. Their spans are looked up
+            # on the current text because earlier substitutions shift offsets.
+            protected = [
+                found.span()
+                for ignore_pattern in ignored
+                for found in re.finditer(ignore_pattern, redacted_text, flags=re.IGNORECASE)
+            ]
+
+            def mask(match, replacement=replacement, protected=protected):
+                start, end = match.span()
+                if any(start < stop and begin < end for begin, stop in protected):
+                    # Overlaps an ignored value: keep the original text.
+                    return match.group()
+                # expand() keeps re.sub's template semantics for the replacement.
+                return match.expand(replacement)
+
+            redacted_text = re.sub(pattern, mask, redacted_text, flags=re.IGNORECASE)
+
+    return redacted_text
+
+
+def process_pdf(input_file, output_file, validators, ignored):
+    """OCR every page of a PDF, print the raw and redacted text, save a copy.
+
+    Args:
+        input_file: Path of the PDF to read.
+        output_file: Path the document is saved to.
+        validators: Redaction rules forwarded to ``redact_text``.
+        ignored: Ignore patterns forwarded to ``redact_text``.
+    """
+    doc = fitz.open(input_file)
+    try:
+        for page_index, page in enumerate(doc):
+            pixmap = page.get_pixmap()
+
+            # View the raw sample buffer as a height x width x channels array
+            image_np = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
+
+            # Swap the first and third colour channels before OCR
+            image_rgb = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
+
+            # Use Tesseract OCR to extract text from the image
+            extracted_text = pytesseract.image_to_string(image_rgb)
+
+            # Apply redaction on the extracted text
+            redacted_text = redact_text(extracted_text, validators, ignored)
+
+            # Print the extracted and redacted text for verification
+            print(f"Page {page_index + 1} - Extracted Text:")
+            print(extracted_text)
+            print(f"Page {page_index + 1} - Redacted Text:")
+            print(redacted_text)
+            print("-----")
+
+        # NOTE(review): nothing above modifies the pages, so the saved file
+        # appears to have the same content as the input - confirm whether the
+        # redacted text is meant to be written back.
+        doc.save(output_file)
+    finally:
+        # Release the document even when OCR or saving fails.
+        doc.close()
+
+    print("Redacted PDF saved successfully!")
+
+# Redaction rules: category -> list of (regex, replacement mask).
+# The rules are applied in mapping order. The generic phone pattern matches
+# almost any run of three or more digits, so the more specific card-number and
+# e-mail rules are listed before it; otherwise it rewrites their digits first
+# and those rules can no longer match.
+validators = {
+    "Hong Kong Identity Card Number": [
+        (r'\b[A-Z]\d{6}\([A-Z]\)', 'XXXXXXXXX'),
+        (r'\b[A-Z]\d{6}\(\d\)', 'XXXXXXXXX')
+    ],
+    "Credit Card Numbers": [
+        (r'\b[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}\b', 'XXXX-XXXX-XXXX-XXXX')
+    ],
+    "Email Addresses": [
+        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'XXXXX@XXXXX.XXX')
+    ],
+    "Phone Numbers": [
+        (r'\+\d{1,3}[-\s]?\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', '+XXX XXXX XXXX'),
+        (r'\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', 'XXXX XXXX')
+    ],
+}
+
+# Patterns handed to process_pdf as "ignored" values
+# (all-zero and all-one 16-digit numbers).
+ignored = [
+    r'0000[-\s]?0000[-\s]?0000[-\s]?0000',
+    r'1111[-\s]?1111[-\s]?1111[-\s]?1111'
+]
+
+# Only parse arguments and process a file when run as a script, so the module
+# can be imported without side effects.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='PDF Redaction Script')
+    parser.add_argument('input_file', help='Path to the input PDF file')
+    parser.add_argument('output_file', help='Path to the output PDF file')
+    args = parser.parse_args()
+
+    # Call the process_pdf function with input and output file paths, validators, and ignored patterns
+    process_pdf(args.input_file, args.output_file, validators, ignored)
--- a/goodtastesmartie/task1-deal-broken/ocr/Screenshot
+++ b/goodtastesmartie/task1-deal-broken/ocr/Screenshot
--- a/goodtastesmartie/task1-deal-broken/ocr/credit_card_PNG204.png
+++ b/goodtastesmartie/task1-deal-broken/ocr/credit_card_PNG204.png
--- a/goodtastesmartie/task1-deal-broken/ocr/docs/sample.png
+++ b/goodtastesmartie/task1-deal-broken/ocr/docs/sample.png
--- a/goodtastesmartie/task1-deal-broken/ocr/document.pdf
+++ b/goodtastesmartie/task1-deal-broken/ocr/document.pdf
--- a/goodtastesmartie/task1-deal-broken/ocr/documenttest1.docx
+++ b/goodtastesmartie/task1-deal-broken/ocr/documenttest1.docx
--- a/goodtastesmartie/task1-deal-broken/ocr/documenttest1result.docx
+++ b/goodtastesmartie/task1-deal-broken/ocr/documenttest1result.docx
--- a/goodtastesmartie/task1-deal-broken/ocr/pdftest.pdf
+++ b/goodtastesmartie/task1-deal-broken/ocr/pdftest.pdf
--- a/goodtastesmartie/task1-deal-broken/ocr/pythontransform.py
+++ b/goodtastesmartie/task1-deal-broken/ocr/pythontransform.py
@@ -0,0 +1,107 @@
+import sys
+import fitz  # PyMuPDF
+from docx import Document
+import cv2
+import pytesseract
+import re
+
+def redact_sensitive_info(text, validators, ignored):
+    """Replace sensitive matches in ``text`` with their masks.
+
+    Args:
+        text: Text to scan.
+        validators: Mapping of a category name to a list of
+            ``(regex, replacement)`` pairs, applied in mapping order.
+        ignored: Regex patterns; a match whose text is found by any of them
+            is left unchanged.
+
+    Returns:
+        The redacted text.
+    """
+    def mask(match, replacement):
+        sensitive_info = match.group()
+        if any(re.search(ignore_regex, sensitive_info) for ignore_regex in ignored):
+            return sensitive_info
+        return replacement
+
+    redacted_text = text
+    for regex_matches in validators.values():
+        for regex, replacement in regex_matches:
+            # Substitute exactly the matched spans. A plain str.replace on the
+            # matched text would also rewrite identical substrings that the
+            # regex itself (e.g. its \b anchors) did not match.
+            redacted_text = re.sub(
+                regex,
+                lambda match, replacement=replacement: mask(match, replacement),
+                redacted_text,
+            )
+    return redacted_text
+
+
+def save_as_pdf(doc, output_file):
+    """Save ``doc`` to ``output_file`` and report the outcome on stdout.
+
+    Any exception raised while saving is caught and printed, not propagated.
+
+    NOTE(review): the callers in this file pass a python-docx ``Document``,
+    whose ``save`` presumably writes DOCX data whatever the file extension -
+    confirm that a real PDF is produced.
+    """
+    try:
+        doc.save(output_file)
+        print(f"Redacted document saved as: {output_file}")
+    except Exception as err:
+        print(f"Error saving PDF: {err}")
+
+def save_as_docx(doc, output_file):
+    """Save ``doc`` to ``output_file`` and report the outcome on stdout.
+
+    Any exception raised while saving is caught and printed, not propagated.
+    """
+    try:
+        doc.save(output_file)
+        print(f"Redacted document saved as: {output_file}")
+    except Exception as err:
+        print(f"Error saving DOCX: {err}")
+
+def process_pdf(input_file, output_file, validators, ignored):
+    """Extract the text of a PDF, redact it and save it as a new document.
+
+    Args:
+        input_file: Path of the PDF to read.
+        output_file: Destination path; its extension (``.pdf`` or ``.docx``,
+            any letter case) selects the save routine.
+        validators: Redaction rules forwarded to ``redact_sensitive_info``.
+        ignored: Ignore patterns forwarded to ``redact_sensitive_info``.
+    """
+    pdf = fitz.open(input_file)
+    try:
+        # Concatenate the text of all pages, one trailing newline per page.
+        output_text = "".join(page.get_text() + "\n" for page in pdf)
+    finally:
+        # The source PDF is only needed for text extraction; release it early.
+        pdf.close()
+
+    redacted_text = redact_sensitive_info(output_text, validators, ignored)
+    doc = Document()
+    doc.add_paragraph(redacted_text)
+
+    target = output_file.lower()
+    if target.endswith('.pdf'):
+        save_as_pdf(doc, output_file)
+    elif target.endswith('.docx'):
+        save_as_docx(doc, output_file)
+    else:
+        # Previously this case produced no file and no message at all.
+        print("Unsupported output format:", output_file)
+
+def process_docx(input_file, output_file, validators, ignored):
+    """Redact the paragraph text of a DOCX file and save it as a new document.
+
+    Only ``doc.paragraphs`` is read; the result is a single paragraph holding
+    the redacted text, joined with newlines.
+
+    Args:
+        input_file: Path of the DOCX file to read.
+        output_file: Destination path; its extension (``.pdf`` or ``.docx``,
+            any letter case) selects the save routine.
+        validators: Redaction rules forwarded to ``redact_sensitive_info``.
+        ignored: Ignore patterns forwarded to ``redact_sensitive_info``.
+    """
+    source = Document(input_file)
+    text = '\n'.join(paragraph.text for paragraph in source.paragraphs)
+    redacted_text = redact_sensitive_info(text, validators, ignored)
+
+    doc = Document()
+    doc.add_paragraph(redacted_text)
+
+    target = output_file.lower()
+    if target.endswith('.pdf'):
+        save_as_pdf(doc, output_file)
+    elif target.endswith('.docx'):
+        save_as_docx(doc, output_file)
+    else:
+        # Previously this case produced no file and no message at all.
+        print("Unsupported output format:", output_file)
+
+def process_image(input_file, output_file, validators, ignored):
+    """OCR an image, redact the recognised text and save it as a document.
+
+    Args:
+        input_file: Path of the image to read.
+        output_file: Destination path; its extension (``.pdf`` or ``.docx``,
+            any letter case) selects the save routine.
+        validators: Redaction rules forwarded to ``redact_sensitive_info``.
+        ignored: Ignore patterns forwarded to ``redact_sensitive_info``.
+
+    Raises:
+        ValueError: If ``input_file`` cannot be read as an image.
+    """
+    image = cv2.imread(input_file)
+    if image is None:
+        # cv2.imread reports a missing or undecodable file by returning None;
+        # fail with a clear message instead of an obscure cvtColor error.
+        raise ValueError(f"Cannot read image file: {input_file}")
+
+    # Grayscale plus a fixed threshold of 150 gives Tesseract a binary image.
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
+    text = pytesseract.image_to_string(binary)
+    redacted_text = redact_sensitive_info(text, validators, ignored)
+
+    doc = Document()
+    doc.add_paragraph(redacted_text)
+
+    target = output_file.lower()
+    if target.endswith('.pdf'):
+        save_as_pdf(doc, output_file)
+    elif target.endswith('.docx'):
+        save_as_docx(doc, output_file)
+    else:
+        # Previously this case produced no file and no message at all.
+        print("Unsupported output format:", output_file)
+
+def main():
+    """Command-line entry point: redact one input file into one output file.
+
+    Expects exactly two arguments, ``<input_file> <output_file>``, and exits
+    with status 1 otherwise. ``.pdf`` and ``.docx`` inputs (any letter case)
+    go to their dedicated handlers; every other input is treated as an image.
+    """
+    if len(sys.argv) != 3:
+        print("Usage: python script.py <input_file> <output_file>")
+        sys.exit(1)
+
+    input_file = sys.argv[1]
+    output_file = sys.argv[2]
+
+    validators = {
+        "Hong Kong Identity Card Number": [
+            # One- and two-letter prefixes, six digits, bracketed check
+            # character. The original character class contained a stray "("
+            # and the two-letter rule rejected letter check characters.
+            (r'\b[A-Z]\d{6}\([A-Z0-9]\)', 'XXXXXXXXX'),
+            (r'\b[A-Z]{2}\d{6}\([A-Z0-9]\)', 'XXXXXXXXXX')
+        ],
+        "Credit card number": [
+            (r'\b(?:5[1-5]\d{2}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}|2(?:2[2-7]\d{2}|7[3-9]\d{2})[- ]?\d{4}[- ]?\d{4}[- ]?\d{4})\b', 'XXXX-XXXX-XXXX-XXXX')
+        ]
+    }
+    ignored = [
+        r'0000([- ]?)0000([ -]?)0000([ -]?)0000',
+        r'1111([- ]?)1111([ -]?)1111([ -]?)1111',
+    ]
+
+    # Compare extensions case-insensitively so "SCAN.PDF" is not sent to the
+    # image handler.
+    source = input_file.lower()
+    if source.endswith('.pdf'):
+        process_pdf(input_file, output_file, validators, ignored)
+    elif source.endswith('.docx'):
+        process_docx(input_file, output_file, validators, ignored)
+    else:
+        process_image(input_file, output_file, validators, ignored)
+
+
+if __name__ == "__main__":
+    main()
--- a/goodtastesmartie/task1-deal-broken/ocr/readme.txt
+++ b/goodtastesmartie/task1-deal-broken/ocr/readme.txt
@@ -0,0 +1,38 @@
+
+# OCR Sensitive Information Redaction
+
+This project is a Python script for redacting sensitive information from documents using Optical Character Recognition (OCR).
+It takes as input documents in various formats (PDF, DOCX, images) containing sensitive information such as credit card numbers and Hong Kong Identity Card numbers,
+and redacts this information before saving the redacted document in the desired format.
+
+## Installation
+
+1. Copy pythontransform.py to your local machine.
+
+2. Install the required Python libraries including `opencv-python`, `PyMuPDF` (for PDF processing), `python-docx` (for DOCX processing),
+and `pytesseract` (for OCR).
+
+## Usage
+
+To run the script, use the following command:
+
+python pythontransform.py <input_file> <output_file>
+
+Replace `<input_file>` with the path to the input document you want to redact, and `<output_file>` with the desired path for the redacted document.
+
+For example:
+
+python pythontransform.py input_document.pdf redacted_document.docx
+
+This will redact sensitive information from the input PDF file `input_document.pdf` and save the redacted document as `redacted_document.docx`.
+
+## Supported Formats
+
+The script supports input documents in the following formats:
+- PDF
+- DOCX
+- Images (PNG, JPEG, etc.)
+
+The redacted document is saved in DOCX format.
+
+
--- a/goodtastesmartie/task1-deal-broken/ocr/result/Screenshot1result.docx
+++ b/goodtastesmartie/task1-deal-broken/ocr/result/Screenshot1result.docx
--- a/goodtastesmartie/task1-deal-broken/ocr/result/documentresult.docx
+++ b/goodtastesmartie/task1-deal-broken/ocr/result/documentresult.docx
--- a/goodtastesmartie/task1-deal-broken/ocr/result/documenttest1result.docx
+++ b/goodtastesmartie/task1-deal-broken/ocr/result/documenttest1result.docx
--- a/goodtastesmartie/task1-deal-broken/ocr/result/pdftextresult.docx
+++ b/goodtastesmartie/task1-deal-broken/ocr/result/pdftextresult.docx