!unxz challenging-america-word-gap-prediction/train/in.tsv.xz --keep
!unxz challenging-america-word-gap-prediction/test-A/in.tsv.xz --keep
!unxz challenging-america-word-gap-prediction/dev-0/in.tsv.xz --keep
unxz: challenging-america-word-gap-prediction/train/in.tsv.xz: No such file or directory
unxz: challenging-america-word-gap-prediction/test-A/in.tsv.xz: No such file or directory
unxz: challenging-america-word-gap-prediction/dev-0/in.tsv.xz: No such file or directory
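These errors only mean the .xz archives are no longer present; the files were already decompressed, as the ls below confirms. A minimal sketch of an idempotent alternative using Python's standard lzma module (same paths assumed), safe to re-run:

import lzma
import os
import shutil

# Decompress each split's in.tsv.xz only if the archive exists
# and the extracted file does not.
for split in ("train", "dev-0", "test-A"):
    src = f"challenging-america-word-gap-prediction/{split}/in.tsv.xz"
    dst = f"challenging-america-word-gap-prediction/{split}/in.tsv"
    if os.path.exists(src) and not os.path.exists(dst):
        with lzma.open(src, "rb") as fin, open(dst, "wb") as fout:
            shutil.copyfileobj(fin, fout)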
!ls challenging-america-word-gap-prediction/train
expected.tsv in.tsv
import nltk

def get_texts():
    # Yield the text field (column 6 of each tab-separated line) of the
    # first 10 training examples, joining hyphenated line breaks and
    # flattening newlines into spaces.
    with open("challenging-america-word-gap-prediction/train/in.tsv", "r", encoding="UTF-8") as f:
        i = 0
        while True:
            i += 1
            text = f.readline()
            if text == "" or i > 10:  # readline() returns "" at EOF, never None
                break
            text = text.split('\t')[6]
            text = text.replace("-\n", "").replace("\n", " ")
            yield text
# def get_words():
#     for text in get_texts():
#         for word in nltk.word_tokenize(text):
#             yield word
def get_labels():
    # Yield the first 10 gold labels, matching the 10 texts above
    with open("challenging-america-word-gap-prediction/train/expected.tsv", "r", encoding="UTF-8") as f:
        yield from f.readlines()[0:10]
texts_sum = sum(1 for text in get_texts())
labels_sum = sum(1 for label in get_labels())
# words_sum = sum(1 for word in get_words())
print(f"All texts: {texts_sum}")
print(f"All labels: {labels_sum}")
# print(f"All words: {words_sum}")
All texts: 10
All labels: 10
for text in get_texts():
    print(text)
(the first 10 preprocessed texts are printed here)
Reverse bigram model: predicts the gap word from the token that follows it, i.e. it estimates P(word | next word) rather than P(word | previous word).
class Model():
    def __init__(self, vocab_size, UNK_token='<UNK>'):
        pass

    def train(self, corpus: list) -> None:
        pass

    def predict(self, text: list, probs: str) -> float:
        pass
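A minimal sketch of how this skeleton might be filled in, assuming the reverse-bigram reading above. The class name, the predict signature (which deviates from the skeleton for clarity), and the "word:prob ... :rest" output format are illustrative guesses, not the assignment's required interface:

from collections import defaultdict, Counter

class ReverseBigramModel:
    # Illustrative sketch: estimates P(gap_word | following_word)
    # from raw bigram counts, with no smoothing.
    def __init__(self, UNK_token='<UNK>'):
        self.UNK_token = UNK_token
        self.counts = defaultdict(Counter)  # counts[next_word][word]

    def train(self, corpus: list) -> None:
        # corpus is a list of token lists; count each (word, next_word) pair
        for tokens in corpus:
            for word, next_word in zip(tokens, tokens[1:]):
                self.counts[next_word][word] += 1

    def predict(self, next_word: str, top_k: int = 5) -> str:
        # Emit the top-k candidates with relative-frequency probabilities,
        # leaving the residual mass on the empty word (":rest").
        counter = self.counts.get(next_word)
        if not counter:
            return ":1.0"
        total = sum(counter.values())
        top = counter.most_common(top_k)
        pairs = " ".join(f"{w}:{c / total:.4f}" for w, c in top)
        rest = 1.0 - sum(c / total for _, c in top)
        return f"{pairs} :{rest:.4f}"

For example, trained on the ten texts from get_texts (assuming NLTK's punkt tokenizer data is installed):

model = ReverseBigramModel()
model.train([nltk.word_tokenize(t) for t in get_texts()])
print(model.predict("the"))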