# ngrams-fitness/fitness_ngrams.py
#
# 130 lines
# 3.7 KiB
# Python

import itertools
import random
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
# Lowercase ASCII letters a-z; passed as the n-gram alphabet when the reference frequencies are built.
ALPHABET = string.ascii_lowercase
def lclear(text):
    """Return *text* lowercased with every character outside a-z removed.

    Whitespace, digits, punctuation and non-ASCII letters are all dropped, so
    the result contains only the characters ``a`` to ``z``.
    """
    lowered = text.lower()
    non_letters = re.compile(r'[^a-z]')
    return non_letters.sub('', lowered)
def read_file(filename):
    """Read a UTF-8 text file and return its contents reduced to lowercase a-z.

    Every line is passed through ``lclear``, which lowercases it and removes
    all characters outside ``a-z`` (including whitespace and line breaks). The
    cleaned lines are concatenated into a single string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The cleaned file contents as one string; empty if the file holds no
        a-z letters.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The ``with`` block closes the file, so no explicit close() is needed.
    # ``join`` assembles the result in one pass instead of repeated ``+=``.
    # Each piece is already clean, so no second cleaning pass is required.
    with open(filename, encoding='utf8') as f:
        return ''.join(lclear(line) for line in f)
def write_to_file(filename, content, line_width=200):
    """Write *content* to *filename*, wrapping lines at *line_width* characters.

    *content* is iterated item by item: a string is therefore written one
    character at a time, a list of strings one word at a time. Items are
    written back to back with no separator. An item that does not fit in the
    space left on the current line starts a new line. An item longer than a
    whole line is split into ``line_width``-sized pieces.

    Args:
        filename: Path of the file to (over)write.
        content: A string or an iterable of strings to write.
        line_width: Maximum number of characters per line (default 200).

    Raises:
        ValueError: If ``line_width`` is smaller than 1.
        OSError: If the file cannot be opened for writing.
    """
    if line_width < 1:
        raise ValueError('line_width must be at least 1')

    # Explicit encoding mirrors read_file, which reads its input as utf8.
    with open(filename, 'w', encoding='utf8') as f:
        current_line_length = 0
        for word in content:
            while word:
                # Break only when something is already on the line; otherwise
                # an over-long word would be preceded by an empty line.
                if current_line_length and current_line_length + len(word) > line_width:
                    f.write('\n')
                    current_line_length = 0
                chunk = word[:line_width - current_line_length]
                f.write(chunk)
                current_line_length += len(chunk)
                word = word[len(chunk):]
def generate_n_grams(n, alphabet):
    """Return a dict mapping every length-*n* string over *alphabet* to 0.

    The keys appear in the order produced by ``itertools.product``, i.e. the
    last position varies fastest.
    """
    every_gram = map(''.join, itertools.product(alphabet, repeat=n))
    return dict.fromkeys(every_gram, 0)
def calculate_n_grams_freq(n: int, inputFile, alphabet: str):
    """Compute the relative frequency of every n-gram occurring in a text.

    Args:
        n: Length of the n-grams.
        inputFile: Despite its name, the text itself (a string), not a path.
            Every overlapping window of length ``n`` is counted.
        alphabet: The characters the text may consist of.

    Returns:
        A dict mapping each n-gram that occurs in the text to its count
        divided by the total number of windows. Entries are ordered by
        descending frequency; ties keep alphabet order. An empty dict is
        returned when the text is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is negative.
        KeyError: If the text contains an n-gram with a character that is not
            in ``alphabet``.
    """
    if n < 0:
        raise ValueError('n must be non-negative')

    text = inputFile
    total_n_grams = len(text) - n + 1
    if total_n_grams <= 0:
        # No window fits in the text; return early to avoid dividing by zero.
        return {}

    # Count only the n-grams that occur, rather than pre-building all
    # len(alphabet) ** n keys and discarding the zeros afterwards.
    counts = Counter(text[i:i + n] for i in range(total_n_grams))

    # Position of each character in the alphabet (first occurrence wins).
    rank = {}
    for index, char in enumerate(alphabet):
        rank.setdefault(char, index)

    for gram in counts:
        if any(char not in rank for char in gram):
            raise KeyError(gram)

    # Sort into alphabet order first; the stable sort by frequency below then
    # breaks ties in that order.
    in_alphabet_order = sorted(counts, key=lambda gram: [rank[char] for char in gram])
    frequencies = {gram: counts[gram] / total_n_grams for gram in in_alphabet_order}
    return dict(sorted(frequencies.items(), key=lambda item: item[1], reverse=True))
def compute_fitness(n, filename, frequencies):
    """Score how closely the text in *filename* matches reference n-gram frequencies.

    The file is loaded with ``read_file``. For every overlapping window of
    length ``n`` the reference frequency of that n-gram is added up; n-grams
    missing from ``frequencies`` contribute 0. The sum is averaged over the
    number of windows and scaled by 100.

    Args:
        n: Length of the n-grams.
        filename: Path of the text file to score.
        frequencies: Mapping from n-gram to its reference frequency.

    Returns:
        The fitness score rounded to 8 decimal places, or ``0.0`` when the
        cleaned text is shorter than ``n``.
    """
    text = read_file(filename)
    total_n_grams = len(text) - n + 1
    if total_n_grams <= 0:
        # No window fits in the text; return early to avoid dividing by zero.
        return 0.0

    score = 0.0
    for start in range(total_n_grams):
        score += frequencies.get(text[start:start + n], 0)
    return round((score / total_n_grams) * 100, 8)
def create_bar_diagram(values, labels=None, title='Fitness using quadgrams'):
    """Show a bar chart with one bar per fitness value.

    Args:
        values: Sequence of fitness scores, one per bar.
        labels: Tick labels for the bars, one per value. Defaults to the six
            sample names 'plaintxt', 'random', 'cesar', 'substitution',
            'vinegere' and 'hill'.
        title: Title shown above the chart.

    Raises:
        ValueError: If the number of labels differs from the number of values.
    """
    if labels is None:
        labels = ['plaintxt', 'random', 'cesar', 'substitution', 'vinegere', 'hill']
    if len(labels) != len(values):
        # Fail with a clear message before anything is drawn.
        raise ValueError(
            f'expected {len(values)} labels, got {len(labels)}'
        )

    x_positions = range(len(values))
    plt.bar(x_positions, values)
    plt.xlabel('sample text')
    plt.ylabel('fitness rating')
    plt.title(title)
    # One tick per bar, labelled with the sample name.
    plt.xticks(x_positions, labels)
    plt.show()
# Length of the n-grams used for both the reference frequencies and the scoring.
# NOTE: create_bar_diagram titles the chart 'Fitness using quadgrams' by default
# although n is 2 here.
n = 2

# Reference frequencies taken from the cleaned text of the corpus file.
eng_freq = calculate_n_grams_freq(n, read_file('data/Ulysses.txt'), ALPHABET)

# Fitness of each sample file against the reference frequencies.
fit_plain = compute_fitness(n, 'data/Ulysses.txt', eng_freq)
fit_random = compute_fitness(n, 'data/input_random.txt', eng_freq)
fit_cesar = compute_fitness(n, 'data/input_cesar.txt', eng_freq)
fit_subs = compute_fitness(n, 'data/input_subs.txt', eng_freq)
fit_ving = compute_fitness(n, 'data/input_vinegere.txt', eng_freq)
fit_hill = compute_fitness(n, 'data/input_hill.txt', eng_freq)

values = [fit_plain, fit_random, fit_cesar, fit_subs, fit_ving, fit_hill]

# Print every score, one per line (fit_hill was previously computed but never printed).
print(*values, sep='\n')

# read_file already returns cleaned text, so no further lclear pass is needed.
write_to_file('data/text.txt', read_file('data/input_plaintext.txt'))
create_bar_diagram(values)