import string
from collections import Counter


def bpe(sentence: str, V: int) -> set:
    """Build a byte-pair-encoding style vocabulary from *sentence*.

    ASCII punctuation is stripped and the text is lower-cased. The text is
    then split into single characters, with every space replaced by an
    empty-string boundary token and one extra boundary token added at each
    end. The most frequent adjacent pair is merged repeatedly until the
    vocabulary holds at least ``V`` entries or nothing is left to merge.

    Note: because the boundary token is the empty string, merging a boundary
    with a neighbouring token yields a string equal to that neighbour, so
    such a merge shortens the token list without adding a new vocabulary
    entry.

    Args:
        sentence: Raw input text.
        V: Target vocabulary size.

    Returns:
        The set of tokens collected. It may contain fewer than ``V`` entries
        when the sentence is too short to produce that many distinct tokens.
    """
    # Remove punctuation and normalise case.
    cleaned = sentence.translate(
        str.maketrans("", "", string.punctuation)
    ).lower()

    # Spaces become empty-string boundary tokens; pad both ends as well.
    characters = [""] + ["" if ch == " " else ch for ch in cleaned] + [""]
    vocabulary = set(characters)

    while len(vocabulary) < V:
        bigrams = get_bigrams(characters)
        if not bigrams:
            # Fewer than two tokens remain: no further merge is possible.
            # Stop here instead of calling max() on an empty mapping.
            break
        frequencies = get_frequencies(bigrams)
        # Ties resolve to the bigram that appears first in the token list.
        most_freq_bigr = max(frequencies, key=frequencies.get)
        upd_sentence_with_bigram(characters, most_freq_bigr)
        vocabulary.add(most_freq_bigr)
    return vocabulary


def get_bigrams(characters: list) -> list:
    """Return the concatenation of every pair of adjacent tokens, in order."""
    return [left + right for left, right in zip(characters, characters[1:])]


def get_frequencies(item: list) -> dict:
    """Return a dict mapping each element of *item* to its occurrence count.

    Keys appear in first-seen order.
    """
    return dict(Counter(item))


def upd_sentence_with_bigram(chars: list, bigram: str) -> list:
    """Merge adjacent token pairs that concatenate to *bigram*, in place.

    The list is scanned left to right. After a merge the scan moves past the
    merged token, so the new token is not paired again with its right-hand
    neighbour during the same pass.

    Args:
        chars: Token list; modified in place.
        bigram: The merged token to look for.

    Returns:
        The same list object that was passed in.
    """
    i = 0
    while i < len(chars) - 1:
        if chars[i] + chars[i + 1] == bigram:
            # Replace the two tokens with the single merged token.
            chars[i:i + 2] = [bigram]
        i += 1
    return chars


def main() -> None:
    """Prompt for a sentence and a vocabulary size, then print each token."""
    usr_input = input('Podaj zdanie: ')
    V = int(input('Podaj wielkosc slownika: '))
    vocab = bpe(usr_input, V)
    for token in vocab:
        print(token)


if __name__ == "__main__":
    main()