50 lines
1.4 KiB
Python
50 lines
1.4 KiB
Python
import string
|
|
|
|
|
|
def bpe(sentence: str, V: int) -> set:
    """Learn a byte-pair-encoding vocabulary of up to ``V`` tokens.

    The sentence is stripped of ASCII punctuation and lower-cased, split into
    single characters with every space replaced by the marker ``"<w>"``, and
    wrapped in one extra ``"<w>"`` at each end.  While the vocabulary is
    smaller than ``V``, the most frequent concatenation of two adjacent tokens
    is merged into a single token and added to the vocabulary.

    Args:
        sentence: Text to learn the vocabulary from.
        V: Target vocabulary size.

    Returns:
        The set of vocabulary tokens.  It holds fewer than ``V`` tokens when
        the sentence runs out of pairs to merge, and more than ``V`` when the
        initial character set is already larger than ``V``.
    """
    # remove punctuation
    sentence = sentence.translate(str.maketrans("", "", string.punctuation)).lower()
    characters = ["<w>" if ch == " " else ch for ch in sentence]
    characters.append("<w>")
    characters.insert(0, "<w>")
    vocabulary = set(characters)
    while len(vocabulary) < V:
        bigrams = get_bigrams(characters)
        if not bigrams:
            # The whole sentence is already a single token, so no further
            # merge is possible; without this guard max() below would raise
            # ValueError on an empty frequency table.
            break
        frequencies = get_frequencies(bigrams)
        most_freq_bigr = max(frequencies, key=frequencies.get)
        # Merges in place: `characters` shrinks on every iteration.
        upd_sentence_with_bigram(characters, most_freq_bigr)
        vocabulary.add(most_freq_bigr)
    return vocabulary
|
|
|
|
|
|
|
|
def get_bigrams(characters: list) -> list:
    """Return the concatenation of every pair of adjacent tokens, in order.

    Args:
        characters: Sequence of string tokens.

    Returns:
        A list in which element ``k`` is ``characters[k] + characters[k + 1]``;
        it is empty when there are fewer than two tokens.
    """
    return [left + right for left, right in zip(characters, characters[1:])]
|
|
|
|
|
|
def get_frequencies(item: list) -> dict:
    """Count how many times each element occurs in ``item``.

    Args:
        item: Iterable of hashable elements.

    Returns:
        A dict mapping each distinct element to its number of occurrences.
        Keys appear in order of first occurrence.
    """
    frequencies = {}
    for element in item:
        frequencies[element] = frequencies.get(element, 0) + 1
    return frequencies
|
|
|
|
def upd_sentence_with_bigram(chars: list, bigram: str) -> list:
    """Merge, in place, every adjacent token pair that concatenates to ``bigram``.

    The list is scanned left to right.  A token produced by a merge is not
    paired again with the token that follows it during the same pass, so
    merging ``"aa"`` in ``["a", "a", "a"]`` gives ``["aa", "a"]``.

    Args:
        chars: Token list to update; it is modified in place.
        bigram: Concatenated form of the pair to merge.

    Returns:
        The same ``chars`` list object, after merging.
    """
    merged = []
    pos = 0
    count = len(chars)
    while pos < count:
        if pos + 1 < count and chars[pos] + chars[pos + 1] == bigram:
            merged.append(bigram)
            pos += 2  # both tokens of the pair are consumed
        else:
            merged.append(chars[pos])
            pos += 1
    # Write back through slice assignment so callers holding a reference to
    # `chars` observe the merge; rebuilding once avoids a pop/insert per match.
    chars[:] = merged
    return chars
|
|
|
|
# Interactive driver: prompt (in Polish) for a sentence and a vocabulary
# size, learn the vocabulary, then print every token on its own line.
usr_input = input('Podaj zdanie: ')
V = int(input('Podaj wielkosc slownika: '))  # raises ValueError on non-integer input
vocab = bpe(usr_input, V)
for i in vocab:
    print(i)