"""Rate how closely sample texts match reference n-gram frequencies.

Run as a script, this builds an n-gram frequency table from
``data/Ulysses.txt``, scores several sample files against it, prints the
scores, writes a cleaned copy of ``data/input_plaintext.txt`` and shows the
scores as a bar chart.
"""
import itertools
import random
import re
import string
from collections import Counter
from typing import Dict, Iterable, Mapping, Optional, Sequence

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Only create_bar_diagram() needs matplotlib. Keeping the import optional
    # lets the text/n-gram helpers be used (and the scores be printed) on a
    # machine without it; create_bar_diagram() reports the missing package.
    plt = None

ALPHABET = string.ascii_lowercase

# Compiled once: matches every character that is not a lowercase ASCII letter.
_NON_LETTERS = re.compile(r'[^a-z]')

# Bar labels used when create_bar_diagram() is called without explicit labels.
_DEFAULT_LABELS = ('plaintxt', 'random', 'cesar', 'substitution', 'vinegere', 'hill')


def lclear(text: str) -> str:
    """Return *text* lower-cased with every character outside a-z removed."""
    return _NON_LETTERS.sub('', text.lower())


def read_file(filename) -> str:
    """Read a UTF-8 text file and return its content reduced to a-z letters.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content lower-cased, with whitespace, digits, punctuation
        and every other non a-z character stripped (see ``lclear``).
    """
    with open(filename, encoding='utf8') as f:
        # Clean line by line and join once instead of growing a string.
        return ''.join(lclear(line) for line in f)


def write_to_file(filename, content: Iterable[str], line_width: int = 200) -> None:
    """Write *content* to *filename*, wrapped to at most *line_width* columns.

    The pieces of *content* are written back to back without separators.
    A piece that does not fit into the rest of the current line starts a new
    line; a piece longer than a whole line is split across lines. No trailing
    newline is written.

    Args:
        filename: Path of the file to (over)write, encoded as UTF-8.
        content: Iterable of strings. A plain string is treated as a
            sequence of one-character pieces.
        line_width: Maximum number of characters per line (default 200).

    Raises:
        ValueError: If *line_width* is smaller than 1.
    """
    if line_width < 1:
        raise ValueError('line_width must be at least 1')
    with open(filename, 'w', encoding='utf8') as f:
        column = 0
        for word in content:
            while word:
                # Break only when something is already on the line; otherwise
                # an over-long piece at column 0 would emit an empty line.
                if column and column + len(word) > line_width:
                    f.write('\n')
                    column = 0
                piece = word[:line_width - column]
                f.write(piece)
                column += len(piece)
                word = word[len(piece):]


def generate_n_grams(n: int, alphabet: Iterable[str]) -> Dict[str, int]:
    """Return a dict mapping every length-*n* string over *alphabet* to 0.

    Keys appear in ``itertools.product`` order, i.e. sorted by the position
    of their characters in *alphabet*.
    """
    return {''.join(chars): 0 for chars in itertools.product(alphabet, repeat=n)}


def calculate_n_grams_freq(n: int, inputFile, alphabet: str) -> Dict[str, float]:
    """Compute relative frequencies of the n-grams occurring in a text.

    Args:
        n: Length of the n-grams.
        inputFile: The text to analyse. Despite its name this is the text
            itself, not a path; it is sliced and measured directly.
        alphabet: Characters an n-gram may consist of. Windows containing
            any other character are ignored instead of raising ``KeyError``.

    Returns:
        A dict mapping each occurring n-gram to ``occurrences / windows``,
        where ``windows`` is ``len(inputFile) - n + 1``. Entries are ordered
        by descending frequency; ties keep alphabet-product order. An empty
        dict is returned when the text is shorter than *n*.
    """
    text = inputFile
    total_n_grams = len(text) - n + 1
    if total_n_grams <= 0:
        # Nothing to count; also avoids dividing by zero below.
        return {}

    # Position of each character in the alphabet (first occurrence wins),
    # used both for validation and for the tie-break ordering.
    rank = {}
    for position, char in enumerate(alphabet):
        rank.setdefault(char, position)

    counts = Counter(text[i:i + n] for i in range(total_n_grams))
    valid = [
        (gram, count)
        for gram, count in counts.items()
        if all(char in rank for char in gram)
    ]
    valid.sort(key=lambda item: (-item[1], [rank[char] for char in item[0]]))
    return {gram: count / total_n_grams for gram, count in valid}


def compute_fitness(n: int, filename, frequencies: Mapping[str, float]) -> float:
    """Score the text stored in *filename* against reference frequencies.

    Args:
        n: Length of the n-grams to slide over the text.
        filename: Path of the file to score; it is read with ``read_file``.
        frequencies: Mapping from n-gram to its reference frequency.
            N-grams missing from the mapping contribute 0.

    Returns:
        The mean reference frequency of the text's n-grams, multiplied by
        100 and rounded to 8 decimals; 0.0 when the cleaned text is shorter
        than *n*.
    """
    text = read_file(filename)
    total_n_grams = len(text) - n + 1
    if total_n_grams <= 0:
        return 0.0
    score = 0.0
    for i in range(total_n_grams):
        score += frequencies.get(text[i:i + n], 0)
    return round((score / total_n_grams) * 100, 8)


def create_bar_diagram(
    values: Sequence[float],
    labels: Optional[Sequence[str]] = None,
    title: str = 'Fitness using quadgrams',
) -> None:
    """Show *values* as a bar chart, one bar per sample text.

    Args:
        values: Bar heights.
        labels: One x-axis label per value. Defaults to the six sample
            names 'plaintxt', 'random', 'cesar', 'substitution',
            'vinegere', 'hill'.
        title: Title of the diagram.

    Raises:
        ModuleNotFoundError: If matplotlib is not installed.
        ValueError: If the number of labels differs from the number of
            values.
    """
    if plt is None:
        raise ModuleNotFoundError('matplotlib is required to draw the bar diagram')
    tick_labels = list(_DEFAULT_LABELS if labels is None else labels)
    if len(tick_labels) != len(values):
        raise ValueError(
            'expected %d labels, got %d' % (len(values), len(tick_labels))
        )
    x_positions = range(len(values))
    plt.bar(x_positions, values)
    plt.xlabel('sample text')
    plt.ylabel('fitness rating')
    plt.title(title)
    plt.xticks(x_positions, tick_labels)
    plt.show()


def main() -> None:
    """Score the sample files against Ulysses and plot the results."""
    n = 2  # n-gram size used for the reference table and for scoring
    eng_freq = calculate_n_grams_freq(n, read_file('data/Ulysses.txt'), ALPHABET)
    fit_plain = compute_fitness(n, 'data/Ulysses.txt', eng_freq)
    fit_random = compute_fitness(n, 'data/input_random.txt', eng_freq)
    fit_cesar = compute_fitness(n, 'data/input_cesar.txt', eng_freq)
    fit_subs = compute_fitness(n, 'data/input_subs.txt', eng_freq)
    fit_ving = compute_fitness(n, 'data/input_vinegere.txt', eng_freq)
    fit_hill = compute_fitness(n, 'data/input_hill.txt', eng_freq)
    print(fit_plain)
    print(fit_random)
    print(fit_cesar)
    print(fit_subs)
    print(fit_ving)
    # NOTE(review): fit_hill is plotted below but never printed - confirm
    # whether that omission is intentional.
    values = [fit_plain, fit_random, fit_cesar, fit_subs, fit_ving, fit_hill]
    # read_file() already returns cleaned text, so no extra lclear() is needed.
    write_to_file('data/text.txt', read_file('data/input_plaintext.txt'))
    # Derive the title from n so it cannot disagree with the n-gram size.
    create_bar_diagram(values, title=f'Fitness using {n}-grams')


if __name__ == '__main__':
    main()