def redact_text(text, validators, ignored):
    """Return *text* with every validator match replaced by its mask.

    Args:
        text: The text to scan.
        validators: Mapping of a label to a list of ``(regex, replacement)``
            pairs. Patterns are applied in mapping order, case-insensitively,
            and ``replacement`` is treated as a regular ``re.sub`` template.
        ignored: Regexes describing values that must be left exactly as they
            are (matched case-insensitively).

    Returns:
        The redacted text. Any validator match that overlaps a span matched
        by an ``ignored`` pattern is kept verbatim instead of being masked.

    Intermediate results are deliberately not printed: between passes the
    text still contains values that later patterns have yet to mask.
    """
    ignore_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in ignored]
    redacted_text = text

    for patterns in validators.values():
        for pattern, replacement in patterns:
            # Recomputed for every pattern because earlier substitutions can
            # shift the character offsets of the protected values.
            protected = [
                match.span()
                for ignore_pattern in ignore_patterns
                for match in ignore_pattern.finditer(redacted_text)
            ]

            def _mask(match, _protected=protected, _replacement=replacement):
                start, end = match.span()
                if any(start < stop and begin < end for begin, stop in _protected):
                    return match.group()
                return match.expand(_replacement)

            redacted_text = re.sub(pattern, _mask, redacted_text, flags=re.IGNORECASE)

    return redacted_text
def redact_text(text, validators, ignored):
    """Mask sensitive values in *text*, leaving explicitly ignored values alone.

    Args:
        text: The text to scan.
        validators: Mapping of a label to a list of ``(regex, replacement)``
            pairs, applied in mapping order with ``re.IGNORECASE``.
        ignored: Regexes (also case-insensitive) for values that are excluded
            from redaction.

    Returns:
        A copy of *text* in which each validator match has been replaced by
        its replacement template, except for matches that overlap a value
        matched by one of the ``ignored`` patterns; those are kept unchanged.
    """
    compiled_ignored = [re.compile(pattern, re.IGNORECASE) for pattern in ignored]

    def _protected_spans(current):
        # Offsets of every ignored value in the text as it currently stands.
        return [
            found.span()
            for ignore_re in compiled_ignored
            for found in ignore_re.finditer(current)
        ]

    redacted_text = text
    for patterns in validators.values():
        for pattern, replacement in patterns:
            # Offsets move after each substitution pass, so look them up again.
            spans = _protected_spans(redacted_text)

            def _substitute(match, _spans=spans, _replacement=replacement):
                start, end = match.span()
                overlaps = any(start < hi and lo < end for lo, hi in _spans)
                return match.group() if overlaps else match.expand(_replacement)

            redacted_text = re.sub(
                pattern, _substitute, redacted_text, flags=re.IGNORECASE
            )

    return redacted_text
# Redaction rules: label -> list of (regex, replacement) pairs.
#
# redact_text applies the entries in insertion order, so the order below is
# significant. The bare phone pattern matches any run of three or more digits
# (optionally split by "-" or whitespace); it therefore has to come AFTER the
# credit-card rule, otherwise it consumes card numbers first and they never
# receive the XXXX-XXXX-XXXX-XXXX mask.
validators = {
    "Hong Kong Identity Card Number": [
        (r'\b[A-Z]\d{6}\([A-Z]\)', 'XXXXXXXXX'),
        (r'\b[A-Z]\d{6}\(\d\)', 'XXXXXXXXX'),
    ],
    "Credit Card Numbers": [
        (r'\b[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}\b', 'XXXX-XXXX-XXXX-XXXX'),
    ],
    "Phone Numbers": [
        (r'\+\d{1,3}[-\s]?\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', '+XXX XXXX XXXX'),
        (r'\d{1,4}[-\s]?\d{1,4}[-\s]?\d{1,9}', 'XXXX XXXX'),
    ],
    "Email Addresses": [
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'XXXXX@XXXXX.XXX'),
    ],
}

# Placeholder card numbers that are passed to redact_text as `ignored`.
ignored = [
    r'0000[-\s]?0000[-\s]?0000[-\s]?0000',
    r'1111[-\s]?1111[-\s]?1111[-\s]?1111',
]

# Only parse arguments and process a file when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='PDF Redaction Script')
    parser.add_argument('input_file', help='Path to the input PDF file')
    parser.add_argument('output_file', help='Path to the output PDF file')
    args = parser.parse_args()

    process_pdf(args.input_file, args.output_file, validators, ignored)
def redact_sensitive_info(text, validators, ignored):
    """Replace each validator match in *text* with its replacement string.

    Args:
        text: The text to scan.
        validators: Mapping of a label to a list of ``(regex, replacement)``
            pairs, applied in mapping order. ``replacement`` is inserted
            literally.
        ignored: Regexes for values that must not be redacted; a match is
            left untouched when any of these is found inside it.

    Returns:
        The redacted text.

    Only the spans the validator regex actually matched are rewritten.
    Substituting per match (rather than ``str.replace`` on the matched
    string) keeps identical digits or letters that merely occur inside a
    longer, non-matching token from being altered.
    """
    ignore_patterns = [re.compile(ignore_regex) for ignore_regex in ignored]

    def _mask(match, replacement):
        sensitive_info = match.group()
        if any(pattern.search(sensitive_info) for pattern in ignore_patterns):
            return sensitive_info
        return replacement

    redacted_text = text
    for regex_matches in validators.values():
        for regex, replacement in regex_matches:
            redacted_text = re.sub(
                regex,
                lambda match, _replacement=replacement: _mask(match, _replacement),
                redacted_text,
            )
    return redacted_text
def main():
    """Redact the input file named on the command line and save the result.

    Expects exactly two command-line arguments, the input path and the
    output path; otherwise a usage message is printed and the process exits
    with status 1. The input is dispatched on its extension
    (case-insensitively): ``.pdf`` to ``process_pdf``, ``.docx`` to
    ``process_docx``, and anything else to ``process_image``.
    """
    if len(sys.argv) != 3:
        print(f"Usage: python {sys.argv[0]} <input_file> <output_file>")
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]

    validators = {
        "Hong Kong Identity Card Number": [
            # One letter, six digits, one check character in parentheses.
            (r'\b[A-Z]\d{6}\([A-Z0-9]\)', 'XXXXXXXXX'),
            # Two-letter prefix variant.
            (r'\b[A-Z][A-Z]\d{6}\(\d\)', 'XXXXXXXXXX')
        ],
        "Credit card number": [
            # Mastercard-style prefixes only: 51-55 and the 2221-2720 range,
            # followed by three more groups of four digits.
            (
                r'\b(?:5[1-5]\d{2}|222[1-9]|22[3-9]\d|2[3-6]\d{2}|27[01]\d|2720)'
                r'(?:[- ]?\d{4}){3}\b',
                'XXXX-XXXX-XXXX-XXXX',
            )
        ]
    }
    # Placeholder numbers that must never be redacted.
    ignored = [
        r'0000([- ]?)0000([ -]?)0000([ -]?)0000',
        r'1111([- ]?)1111([ -]?)1111([ -]?)1111',
    ]

    # Compare on a lower-cased name so "SCAN.PDF" is not treated as an image.
    lowered = input_file.lower()
    if lowered.endswith('.pdf'):
        process_pdf(input_file, output_file, validators, ignored)
    elif lowered.endswith('.docx'):
        process_docx(input_file, output_file, validators, ignored)
    else:
        process_image(input_file, output_file, validators, ignored)
Install the required Python libraries including `opencv-python`, `PyMuPDF` (for PDF processing), `python-docx` (for DOCX processing), +and `pytesseract` (for OCR). + +## Usage + +To run the script, use the following command: + +python pythontransform.py <input_file> <output_file> + +Replace `<input_file>` with the path to the input document you want to redact, and `<output_file>` with the desired path for the redacted document. + +For example: + +python pythontransform.py input_document.pdf redacted_document.docx + +This will redact sensitive information from the input PDF file `input_document.pdf` and save the redacted document as `redacted_document.docx`. + +## Supported Formats + +The script supports input documents in the following formats: +- PDF +- DOCX +- Images (PNG, JPEG, etc.) + +The redacted document is saved in DOCX format. + + diff --git a/goodtastesmartie/task1-deal-broken/ocr/result/Screenshot1result.docx b/goodtastesmartie/task1-deal-broken/ocr/result/Screenshot1result.docx new file mode 100644 index 00000000..6821a16f Binary files /dev/null and b/goodtastesmartie/task1-deal-broken/ocr/result/Screenshot1result.docx differ diff --git a/goodtastesmartie/task1-deal-broken/ocr/result/documentresult.docx b/goodtastesmartie/task1-deal-broken/ocr/result/documentresult.docx new file mode 100644 index 00000000..18a17117 Binary files /dev/null and b/goodtastesmartie/task1-deal-broken/ocr/result/documentresult.docx differ diff --git a/goodtastesmartie/task1-deal-broken/ocr/result/documenttest1result.docx b/goodtastesmartie/task1-deal-broken/ocr/result/documenttest1result.docx new file mode 100644 index 00000000..4e9a74b6 Binary files /dev/null and b/goodtastesmartie/task1-deal-broken/ocr/result/documenttest1result.docx differ diff --git a/goodtastesmartie/task1-deal-broken/ocr/result/pdftextresult.docx b/goodtastesmartie/task1-deal-broken/ocr/result/pdftextresult.docx new file mode 100644 index 00000000..9b163766 Binary files /dev/null and 
b/goodtastesmartie/task1-deal-broken/ocr/result/pdftextresult.docx differ