update,

2025-01-31 19:28:21 +08:00
parent ce9a4aa9b3
commit 72bacdd6b5
168 changed files with 939668 additions and 0 deletions
--- a/banson_hker/phase1-fix/doc/decrypt.md
+++ b/banson_hker/phase1-fix/doc/decrypt.md
@@ -0,0 +1,133 @@
+### decrypt file
+
+### 5.1 general flow (decrypt_file)
+
+```python
+def decrypt_file(file_path, dictionary):
+    # will open an encrypted file and decrypt it by a guessed key
+    # 
+    # try to guess the k by e(as specified) first
+    #     PASS: show user decrypted
+    #     FAIL: process below
+    # try to bruce force the k by all possible k's candiates
+    #     PASS: show user decrypted
+    #     FAIL: show user cannot decrypt message
+
+    with open(file_path, 'r') as fi:
+        # beginning of the process
+        # read file and join the lines all
+        lines = fi.readlines()
+        encrypted_text = ''.join(lines)
+
+        decrypted = False
+        done = False
+        decrypted_text = ''
+
+        print("try decrypt by guessing maximum occurrence ... ")
+        [valid, text] = decrypt_by_letter_occurrence(encrypted_text, dictionary)
+        decrypted = valid
+        decrypted_text = text
+
+        # if the message cannot decrypt by letter e population
+        if not (decrypted):
+            print("decrypt by guessing maximum occurence seems doesn't work...")
+            [valid, text] = decrypt_by_bruce_force(encrypted_text, dictionary)
+            decrypted = valid
+            decrypted_text = text
+
+        if (decrypted):
+            print()
+            print("Final decrypted message:")
+            print()
+            print(decrypted_text)
+            print()
+
+        else:
+            # no decryption works
+            print("Seems neither of them works.")
+```
+
+![](../deliver/diagram/decrypt_file.png)
+
+
+### 5.2 (Decrypt by letter occurrence, decrypt_by_letter_occurrence)
+
+```python
+def decrypt_by_letter_occurrence(enc_text, dictionary):
+    # 1. get the occurrence/population of letter from whole encrypted message
+    # 2. find the max occurrence
+    # 3. find the distance between max letter and letter "E"  (denoted: "guessed k")
+    # 4. try decrypt using "guessed k"
+    # 5. lookup in dictionary (check_words_valid) and check if the decrypted valid.
+
+    print('decrypted by guessed k')
+    characters_population = count_most_occurrence_letter(enc_text)
+
+    print('')
+    print('population of letters in encrypted text (case insensitive, from a to z)')
+    print([chr(65 + i) for i in range(0, 26)])
+    print(['{:0>1}'.format(i) for i in characters_population])
+
+    print('')
+    guess_k = find_max_occurrence(characters_population)
+    print(f'try decrypt using guess_k -> guessed k: {guess_k}')
+
+    decrypted_text = shift_cipher_decrypt(enc_text, guess_k)
+    list_texts = decrypted_text.split(' ')
+    check_result_using_guess_k = check_words_valid(list_texts, dictionary, 0.8)
+
+    return [check_result_using_guess_k, decrypted_text]
+```
+
+![](../deliver/diagram/decrypt_by_letter_occurrence.png)
+
+### 5.3 (Decrypt by brute force, decrypt_by_bruce_force)
+
+```python
+def decrypt_by_bruce_force(encrypted_text, dictionary):
+    # 1. get the occurrence/population of letter from whole encrypted message
+    # 2. find the candidates of k (filter all zero answer in step 1) 
+    
+    # 3. find the distance between max letter and letter "E" -> "guessed k"
+    # 4. try decrypt using this "guessed k"
+    # 5. lookup in dictionary (check_words_valid) and check if the decrypted valid.
+
+    print()
+    print('try decrypt by bruce forcing k ...')
+    # will open an encrypted file and decrypt it by a guessed key
+    dictionary_match_found = False
+    characters_population = count_most_occurrence_letter(encrypted_text)
+
+    guess_k = bruce_force_k(characters_population, encrypted_text, dictionary)
+    # guess_k == -1 means the decrypted message failed in dictionary lookup, 
+    # send the result directly
+    if (guess_k == -999):
+        return [False, '']
+
+    decrypted_text = shift_cipher_decrypt(encrypted_text, guess_k)
+    check_result_using_guess_k = check_words_valid(decrypted_text, dictionary, 0.8)
+
+    return [check_result_using_guess_k, decrypted_text]
+```
+
+![](../deliver/diagram/decrypt_by_bruce_force.png)
+
+
+
+### 5.4 (check_words_valid, check_words_valid)
+
+```python
+def check_words_valid(list_decrypted_text, dictionary, passing_gate):
+    # split decrypted text and word-by-word lookup in dictionary
+    # get a score reflect the matching 
+    # output true / false when score higher than the passing gate
+
+    result = list(map(lambda x: dictionary_lookup(
+        x, dictionary), list_decrypted_text))
+    len_all_result = len(result)
+    true_in_result = len(list(filter(lambda r: r, result)))
+
+    return true_in_result / len_all_result > passing_gate
+```
+
+![](../deliver/diagram/check_words_valid.png)
--- a/banson_hker/phase1-fix/doc/index.md
+++ b/banson_hker/phase1-fix/doc/index.md
@@ -0,0 +1,25 @@
+Key concepts related to the topics covered by this project:
+
+- Data structure: A data structure refers to how data is organized and stored in computer memory. It provides a way to efficiently access and manipulate the data. Examples of common data structures include arrays, linked lists, stacks, queues, trees, graphs, and hash tables.
+
+- Variable declaration and initialization: In programming languages like Python or C++, variables need to be declared before they can be used. Declaration involves specifying the type of variable (e.g., int for integer) and giving it a name. Initialization is assigning an initial value to a variable at the time of declaration.
+
+- Data collection, input, and validation: Data collection involves gathering relevant information from various sources or inputs such as user input via keyboard or files. Input validation ensures that the provided data meets certain criteria or constraints before further processing.
+
+- Data processing: Data processing refers to performing operations on collected or inputted data to obtain desired results or insights. This may involve calculations, transformations, filtering, sorting, searching algorithms (discussed later), etc.
+
+- Program output: Program output refers to any information that is generated by a program as a result of its execution. It can take different forms such as text printed on the screen/console/output file/graphical displays/etc.
+
+- Interface of the program: The interface of a program defines how users interact with it. This includes command-line interfaces (CLI) where users enter commands through terminal/console windows and graphical user interfaces (GUI) where users interact with buttons/menus/forms/etc.
+
+- Modularity: Modularity refers to breaking down a program into smaller modules or functions that perform specific tasks independently but work together when called upon by other parts/modules/functions within the program. This promotes code organization and reusability while making maintenance easier.
+
+- Reusability: Reusability means designing code in such a way that it can be easily reused in different parts of a program or in other programs altogether. This is achieved by creating modular and loosely-coupled code that can be easily integrated into different contexts.
+
+- Portability: Portability refers to the ability of a program to run on different platforms or operating systems without modification. It involves writing platform-independent code, using standard libraries, and avoiding platform-specific dependencies.
+
+- System development cycle: The system development cycle (also known as the software development life cycle) encompasses all phases/stages involved in developing a software/system, including requirements gathering/analysis, design, implementation/coding, testing/debugging, deployment/installation/maintenance.
+
+- Sorting and searching algorithms: Sorting algorithms are used to arrange elements in a specific order (e.g., ascending/descending). Common sorting algorithms include bubble sort, insertion sort, selection sort, merge sort, quicksort. Searching algorithms are used to find specific elements within data structures efficiently. Examples include linear search and binary search.
+
+These concepts provide a foundation for designing efficient programs with good structure and functionality while ensuring usability and maintainability throughout their lifecycle.
--- a/banson_hker/phase1-fix/doc/report.md
+++ b/banson_hker/phase1-fix/doc/report.md
@@ -0,0 +1,216 @@
+# Report:
+
+# describe the components involved
+
+
+## Purpose / Goals:
+
+Implement a Shift Cipher Decrypter using Python. 
+The user can decrypt a sufficiently long message (more than 200 words) that was originally `shift cipher encrypted`.
+
+For demonstration purposes, the other half, "encryption", was also implemented (it is the inverse of decryption).
+
+
+## Assumption / Requirement:
+
+Assuming a use case, for which message to be encrypted:
+
+    - Only contains upper case letters, space characters, punctuation marks
+    - Space characters remain unchanged during the encryption.
+    - punctuation marks remain unchanged during the encryption.
+    - message is longer than 200 words, alphabet distribution follows the general pattern
+
+
+## Procedure / Terminology:
+
+Given that the `encrypted` message is long enough. 
+A shift cipher decrypter can guess the message without knowing k.
+In ordinary English text the most frequent letter is 'E'. Because every letter of the message is shifted by the same amount, the most frequent letter of the `encrypted` message is the shifted 'E'.
+
+
+## data types / variable declaration and initialization (data type used)
+
+### INTEGER
+
+i.e.
+
+```python
+ORD_a = ord('a') # 97
+```
+
+`ORD_a` is a variable storing `97` in integer format
+
+### STRING AND ARRAYS
+
+```python
+...    
+    with open(file_path,'r') as fi:
+        # beginning of the process
+        # read file and join the lines all
+        lines = fi.readlines()
+        e_temp = ''.join(lines)
+...
+```
+
+Here:
+- `lines` is an array of string.
+- `e_temp` is a single lined string.
+
+
+## data collection, input and validation/ data processing
+
+### User input:
+
+console
+
+<place a menu screen capture here>
+
+<place a menu screen capture here>
+
+press `1` to start an encryption (encrypt file)
+    - select a key you want (in numeric format)
+
+<place a menu screen capture here>
+
+press `2` to start a decryption (decrypt file)
+
+<place a menu screen capture here>
+
+press `q` to quit
+
+
+## modularity/ reusability/ portability
+
+(TODO: need to capture from text book)
+
+### Terminology
+
+- decryption algorithm (k guessing)
+
+### Background:
+
+The letter `e` occurs most often in everyday English. `Shift cipher` encryption shifts every letter by k positions (k is an unknown integer). Assuming the same k is used for the whole message (or k varies in a regular pattern that is already known), the letter frequencies carry over to the `encrypted` message (e.g. `e` -> shift by k -> `m` in this case). That is why k can be guessed by finding the most frequent letter and assuming it stands for the letter `e` in the original message.
+
+### Implementation
+
+```python
+need to replace this !!!
+need to review comments !!!
+
+def find_max_occurrence(char_occurrences):
+    """Return the guessed k: offset of the most frequent letter from 'e'.
+
+    Args:
+        char_occurrences: list of letter counts; index 4 is treated as the
+            bucket of the letter e (presumably index 0 is a -- see
+            count_letter_occurrence).
+
+    Returns:
+        Index of the largest count minus 4. The value is negative when the
+        largest count sits at an index below 4.
+    """
+
+    # e.g. if the most frequent encrypted letter is m (index 12),
+    # the guessed k is 12 - 4 = 8
+
+    # index of the largest count; on a tie list.index returns the first one
+    max_idx = char_occurrences.index(max(char_occurrences))
+
+    # subtract the index of e -> 4
+    return max_idx - 4
+
+def count_letter_occurrence(txt_in):
+    # letter e, as stated have the most occurrence in the message by statistics.
+    # as 'Shift Cipher' is a encryption by letter shifting, the letters have good chance 
+    # to have the most occurrence too in the encrypted text.
+    output = [0] * 26    # bucket for 26 letters
+
+    for char in txt_in:
+        if char.isalpha():
+            output[ord(char.lower()) - ORD_a] += 1
+
+    # output contains the statistics of paragraph letter by letter
+    return output
+
+...
+
+```
+
+<diagram showing for loop >
+
+
+```python
+need to replace this !!!
+need to review comments !!!
+
+def decrypt_file(file_path):
+    # will open an encrypted file and decrypt it by a guessed key
+    
+    with open(file_path,'r') as fi:
+        # beginning of the process
+        # read file and join the lines all
+        lines = fi.readlines()
+        e_temp = ''.join(lines)
+
+        characters_distribution = count_letter_occurrence(e_temp)
+
+        print('')
+        print('distribution of letters in encrypted text (case insensitive, from a to z)')
+        print(characters_distribution)
+
+        print('')
+        guess_k = find_max_occurrence(characters_distribution)
+        print(f'guessed k: {guess_k}')
+
+        print('')
+        print('decrypted text:')
+        decrypted_text = shift_cipher_decrypt(e_temp, guess_k)
+        print(decrypted_text)
+```
+
+<diagram showing for loop >
+
+```python
+need to replace this !!!
+need to review comments !!!
+
+
+def shift_cipher_decrypt(ciphertext, key):
+    plaintext = ""
+    
+    for char in ciphertext:
+        if char.isalpha():
+            ascii_offset = ORD_a if char.islower() else ORD_A  # Determine ASCII offset based on lowercase or uppercase letter
+            
+            # Calculate the distance of the target character from a or A
+            distance = ord(char) - ascii_offset
+
+            # Reverse the shift by subtracting the key and taking modulo 26 to wrap around 
+            shifted_distance = (distance - key) % 26
+
+            # Convert back to ASCII by adding the offset and get the corresponding character
+            decrypted_char = chr(shifted_distance + ascii_offset)
+
+            plaintext += decrypted_char
+        else:
+            # If it is not an alphabetic character, retain as is.
+            plaintext += char
+    
+    return plaintext
+```
+
+<diagram showing for loop >
+
+
+## progress
+
+### section 1
+    - week 1 
+    - week 2 
+    
+### section 2
+    - week 3 
+    - week 4 
+    - week 5 
+
+
+## References
+    - [](https://github.com/dwyl/english-words)
+### section 1
+    - [blablabal](http://www.google.com)
+    - [blablabal](http://www.google.com)
+
+### section 2
+    - [blablabal](http://www.google.com)
+    - [blablabal](http://www.google.com)
+    - [blablabal](http://www.google.com)