challenging-america-word-ga.../notebook.ipynb
Anna Nowak 3a9168b302 tmp
2022-04-03 21:06:13 +02:00

3.9 KiB

!unxz challenging-america-word-gap-prediction/train/in.tsv.xz --keep
!unxz challenging-america-word-gap-prediction/test-A/in.tsv.xz --keep
!unxz challenging-america-word-gap-prediction/dev-0/in.tsv.xz --keep
unxz: challenging-america-word-gap-prediction/train/in.tsv.xz: No such file or directory
unxz: challenging-america-word-gap-prediction/test-A/in.tsv.xz: No such file or directory
unxz: challenging-america-word-gap-prediction/dev-0/in.tsv.xz: No such file or directory
!ls challenging-america-word-gap-prediction/train
expected.tsv  in.tsv
import itertools

import nltk
def get_texts():
    """Yield the cleaned text field (column 7) of the first 10 training rows.

    Fixes over the previous version:
    - the bare ``yield`` produced ``None`` instead of the text;
    - ``f.readline()`` returns ``""`` at EOF, never ``None``, so the old
      ``text == None`` check could not terminate the loop.
    """
    with open("challenging-america-word-gap-prediction/train/in.tsv", "r", encoding="UTF-8") as f:
        for i, line in enumerate(f):
            if i >= 10:  # keep only the first 10 rows, matching the old i > 10 counter
                break
            # Column index 6 holds the document text in this TSV layout.
            text = line.split('\t')[6]
            # Undo hyphenated line breaks, then flatten remaining newlines to spaces.
            text = text.replace("-\n", "").replace("\n", " ")
            yield text

# def get_words():
#     for text in get_texts():
#         for word in nltk.word_tokenize(text):
#             yield word

def get_labels():
    """Yield the first 10 label lines (newline included) from expected.tsv.

    Uses ``itertools.islice`` so only 10 lines are read, instead of the old
    ``f.readlines()[0:10]`` which loaded the entire file into memory first.
    """
    with open("challenging-america-word-gap-prediction/train/expected.tsv", "r", encoding="UTF-8") as f:
        yield from itertools.islice(f, 10)

# Count generator outputs; the text and label counts should agree (10 each).
texts_sum = sum(1 for _ in get_texts())
labels_sum = sum(1 for _ in get_labels())
# words_sum = sum(1 for word in get_words())
print(f"All texts: {texts_sum}")
print(f"All labels: {labels_sum}")
# print(f"All words: {words_sum}")
All texts: 10
All labels: 10
# Preview each generated text on its own line.
for sample in get_texts():
    print(sample)
None
None
None
None
None
None
None
None
None
None

Model bigramowy odwrotny (reverse bigram model)

class Model():
    """Skeleton of the reverse bigram language model (implementation pending).

    Fix: ``train`` and ``predict`` previously lacked ``self``, so any call on
    an instance (``m.train(corpus)``) would bind the argument to the implicit
    instance parameter and raise a TypeError for the real argument.
    """

    def __init__(self, vocab_size, UNK_token='<UNK>'):
        # Store configuration for use when train()/predict() are implemented.
        self.vocab_size = vocab_size
        self.UNK_token = UNK_token

    def train(self, corpus: list) -> None:
        """Fit the model on a tokenized corpus. Not implemented yet."""
        pass

    def predict(self, text: list, probs: str) -> float:
        """Score the given text. Not implemented yet; returns None for now."""
        pass