przetwarzanie_jezyka_natura.../E/bpe.py

50 lines
1.4 KiB
Python
Raw Normal View History

2023-01-19 21:22:01 +01:00
import string
from collections import Counter
def bpe(sentence: str, V: int) -> set:
    """Learn a byte-pair-encoding style vocabulary from a single sentence.

    The sentence is stripped of ASCII punctuation and lower-cased, then split
    into single-character tokens.  Every space becomes the word-boundary
    marker ``"<w>"`` and one extra ``"<w>"`` is placed at each end.  The most
    frequent adjacent token pair is then merged repeatedly, and each merged
    token is added to the vocabulary, until the vocabulary holds ``V``
    entries.

    Args:
        sentence: Text to learn the vocabulary from.
        V: Target vocabulary size.  When the initial symbol set already has
            at least ``V`` entries, no merge is performed.

    Returns:
        The set of vocabulary tokens.  It holds fewer than ``V`` entries when
        the sentence runs out of adjacent pairs to merge.
    """
    # Remove punctuation and normalise case before tokenising.
    sentence = sentence.translate(str.maketrans("", "", string.punctuation)).lower()
    characters = ["<w>"]
    characters.extend("<w>" if ch == " " else ch for ch in sentence)
    characters.append("<w>")

    vocabulary = set(characters)
    while len(vocabulary) < V:
        frequencies = get_frequencies(get_bigrams(characters))
        if not frequencies:
            # The whole sentence has collapsed into a single token, so there
            # is nothing left to merge; calling max() on the empty mapping
            # would raise ValueError.
            break
        # On ties max() keeps the first pair in iteration order.
        most_freq_bigr = max(frequencies, key=frequencies.get)
        upd_sentence_with_bigram(characters, most_freq_bigr)
        vocabulary.add(most_freq_bigr)
    return vocabulary
def get_bigrams(characters: list) -> list:
    """Return the concatenation of every pair of neighbouring tokens.

    Args:
        characters: Sequence of string tokens.

    Returns:
        A list with one entry per adjacent pair, in input order: entry ``k``
        is ``characters[k] + characters[k + 1]``.  The list is empty when
        fewer than two tokens are given.
    """
    # Pair each token with its right-hand neighbour.
    return [left + right for left, right in zip(characters, characters[1:])]
def get_frequencies(item: list) -> dict:
    """Count how many times each element occurs in ``item``.

    Args:
        item: List of hashable elements to tally.

    Returns:
        A plain ``dict`` mapping each distinct element to its number of
        occurrences.  Keys appear in order of first occurrence.
    """
    # Counter does the tallying; convert back so the result stays a plain dict.
    return dict(Counter(item))
def upd_sentence_with_bigram(chars: list, bigram: str) -> list:
    """Merge every adjacent token pair that spells ``bigram``, in place.

    The list is scanned left to right; whenever two neighbouring tokens
    concatenate to ``bigram`` they are replaced by a single ``bigram`` token
    and the scan continues with the token after it.

    Args:
        chars: Token list to modify; it is mutated in place.
        bigram: Merged token to look for.

    Returns:
        The same ``chars`` list object, after merging.
    """
    pos = 0
    while pos + 1 < len(chars):
        left, right = chars[pos], chars[pos + 1]
        if left + right == bigram:
            # Collapse the two tokens into one without changing list identity.
            chars[pos:pos + 2] = [bigram]
        pos += 1
    return chars
# Interactive driver: read a sentence and a target vocabulary size from
# standard input, learn the vocabulary and print one token per line.
usr_input = input('Podaj zdanie: ')
V = int(input('Podaj wielkosc slownika: '))  # int() raises ValueError on non-numeric input
vocab = bpe(usr_input, V)
for token in vocab:
    print(token)