import re
import numpy as np
from collections import defaultdict
import torch
import torch.nn as nn
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
with open("stopwords.txt", "r") as f:
    stop_words = set(f.read().splitlines())
def clean_text(text):
split = text.lower().split(" ")
# removing punctuation
clean = []
for token in split:
token = re.sub(r'[^\w\s]', '', token)
if token:
clean.append(token)
return clean
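A quick sanity check of the tokenizer (an expected result of the cleaning above, not a recorded cell output):
clean_text("Ala ma kota, kot pije mleko!")
['ala', 'ma', 'kota', 'kot', 'pije', 'mleko']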
def prepare_corpus(texts, min_count=1, min_word_len=1):
    corpus = {}
    counters = defaultdict(lambda: 0)
    idx_counter = 0
    for text in texts:
        for token in text:
            # skip tokens that are too short or are stop words
            if len(token) < min_word_len or token in stop_words:
                continue
            counters[token] += 1
            # a token enters the vocabulary once it has been seen min_count times
            if token not in corpus and counters[token] == min_count:
                corpus[token] = idx_counter
                idx_counter += 1
    return corpus
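A toy illustration of the min_count threshold (an expected result, not a recorded cell, and assuming none of these tokens are in stopwords.txt): a token only gets an index once it has been seen min_count times.
prepare_corpus([["kot", "pije", "mleko"], ["kot", "pije", "wode"]], min_count=2, min_word_len=3)
{'kot': 0, 'pije': 1}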
class WordCorpus:
def __init__(self, corpus=None, texts=None, min_count=1, min_word_len=1):
if corpus:
self.corpus = corpus
else:
self.corpus = prepare_corpus(texts, min_count, min_word_len)
def get_word_idx(self, token):
token = token.lower()
token = re.sub(r'[^\w\s]', '', token)
return self.corpus.get(token, None)
    def get_embedding(self, token, encode=False):
        # one-hot vector over the vocabulary; with encode=True, `token` is
        # already a vocabulary index instead of a raw string
        embedding = np.zeros(len(self.corpus), dtype=np.int32)
        if encode:
            token_idx = token
        else:
            token = token.lower()
            token = re.sub(r'[^\w\s]', '', token)
            if not token or token not in self.corpus:
                return embedding
            token_idx = self.corpus[token]
        embedding[token_idx] = 1
        return embedding
    def get_bow(self, text, encode=False):
        # bag-of-words count vector; both branches expect `text` to be a list of
        # vocabulary indexes, the encode=False branch just skips building the
        # intermediate one-hot vectors
        if encode:
            embeddings = [
                self.get_embedding(token, encode) for token in text
            ]
            return np.sum(embeddings, axis=0)
        else:
            bow = np.zeros(len(self.corpus), dtype=np.int32)
            for token in text:
                bow[token] += 1
            return bow
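A small illustrative example of the two encodings (not a recorded cell), using a hand-built corpus dict so it does not depend on stopwords.txt or the training data:
toy = WordCorpus(corpus={"ala": 0, "kot": 1, "mleko": 2})
toy.get_embedding("kot")              # one-hot: [0, 1, 0]
toy.get_bow([1, 1, 2], encode=True)   # counts:  [0, 2, 1]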
def load_train_data(train_path):
    texts = []
    with open(train_path, "r") as file:
        for line in file:
            _, _, _, _, text, *_ = line.split("\t")
            texts.append(clean_text(text))
    print(f"Loaded {len(texts)} texts from train_set.")
    return texts
class LanguageNeuralModel(nn.Module):
def __init__(self, corpus_size, hidden_size):
super().__init__()
self.input = nn.Linear(corpus_size, hidden_size)
self.hidden = nn.Linear(hidden_size, hidden_size)
self.output = nn.Linear(hidden_size, corpus_size)
def forward(self, x):
x = self.input(x)
x = F.relu(x)
x = self.hidden(x)
x = F.relu(x)
x = self.output(x)
return x
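A quick shape check of the forward pass with toy sizes (illustrative only, not a recorded cell): the network maps a normalized bag-of-words context vector to one raw logit per vocabulary word, and softmax is left to the loss function.
m = LanguageNeuralModel(corpus_size=1000, hidden_size=32)
m(torch.zeros(4, 1000)).shape
torch.Size([4, 1000])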
def get_random_word_with_contexts(text, context_size):
allowed_indexes = np.arange(context_size, len(text) - context_size)
if not len(allowed_indexes):
return None, None
word_idx = np.random.choice(allowed_indexes)
word = text[word_idx]
context = text[(word_idx - context_size):word_idx] + text[(word_idx + 1):(word_idx + 1 + context_size)]
return word, context
a = clean_text("Ala ma kota , kot pije mleko")
get_random_word_with_contexts(a, 2)
('kota', ['ala', 'ma', 'kot', 'pije'])
train_texts = load_train_data("drive/MyDrive/train.tsv")
Loaded 107471 texts from train_set.
corpus = WordCorpus(texts=train_texts, min_count=20, min_word_len=5)
len(corpus.corpus)
111418
def remove_words_outside_corpus_and_encode(text, corpus):
return [corpus.get_word_idx(token) for token in text if token in corpus.corpus]
train_texts = [remove_words_outside_corpus_and_encode(text, corpus) for text in train_texts]
BATCH_SIZE = 96
CONTEXT_SIZE = 15
import time
def get_batch(texts):
    X, y = [], []
    size = len(texts)
    for _ in range(BATCH_SIZE):
        word_idx = None
        # resample until we draw a text long enough for a full context window
        while word_idx is None:
            text_idx = np.random.randint(size)
            text = texts[text_idx]
            word_idx, context = get_random_word_with_contexts(text, CONTEXT_SIZE)
        bow = corpus.get_bow(context, encode=False)
        X.append(bow)
        y.append(word_idx)
    # normalize the counts by the context length so the inputs stay in [0, 1]
    return (np.array(X) / (CONTEXT_SIZE * 2)).astype(np.float32), np.array(y).astype(np.int64)
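Illustrative expected shapes for one batch (assuming BATCH_SIZE=96, the 111418-word vocabulary built above, and that enough texts are long enough for the 2×15 context window; not a recorded output):
X, y = get_batch(train_texts)
X.shape, y.shape, X.dtype, y.dtype
((96, 111418), (96,), dtype('float32'), dtype('int64'))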
model = LanguageNeuralModel(len(corpus.corpus), 250)
model = model.to(device)
model.train()
LanguageNeuralModel(
  (input): Linear(in_features=111418, out_features=250, bias=True)
  (hidden): Linear(in_features=250, out_features=250, bias=True)
  (output): Linear(in_features=250, out_features=111418, bias=True)
)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
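A reminder (not from the original notebook): nn.CrossEntropyLoss combines log-softmax and negative log-likelihood, which is why forward() returns raw logits and the targets are class indexes rather than one-hot vectors. A tiny toy call, just to show the expected shapes:
# toy tensors: 2 samples, 10 classes; inputs are raw logits, targets are class indexes
nn.CrossEntropyLoss()(torch.randn(2, 10), torch.tensor([3, 7]))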
import tqdm
running_loss = 0.0
for i in tqdm.tqdm_notebook(range(20000)):
X, y = get_batch(train_texts)
X, y = torch.from_numpy(X).to(device), torch.from_numpy(y).to(device)
optimizer.zero_grad()
outputs = model(X)
loss = criterion(outputs, y)
loss.backward()
optimizer.step()
running_loss += loss.item()
if i % 500 == 499:
torch.save(model.state_dict(), "model.pth")
print('[%d, %5d] loss: %.3f' %
(1, i + 1, running_loss / 500))
running_loss = 0.0
[1, 500] loss: 10.873
[1, 1000] loss: 10.559
[1, 1500] loss: 10.505
[1, 2000] loss: 10.437
[1, 2500] loss: 10.371
[1, 3000] loss: 10.371
[1, 3500] loss: 10.336
[1, 4000] loss: 10.338
[1, 4500] loss: 10.325
[1, 5000] loss: 10.325
[1, 5500] loss: 10.335
[1, 6000] loss: 10.366
[1, 6500] loss: 10.366
[1, 7000] loss: 10.377
[1, 7500] loss: 10.392
[1, 8000] loss: 10.422
[1, 8500] loss: 10.477
[1, 9000] loss: 10.525
[1, 9500] loss: 10.562
[1, 10000] loss: 10.593
[1, 10500] loss: 10.657
[1, 11000] loss: 10.711
[1, 11500] loss: 10.706
[1, 12000] loss: 10.781
[1, 12500] loss: 10.799
[1, 13000] loss: 10.875
[1, 13500] loss: 10.882
[1, 14000] loss: 10.921
[1, 14500] loss: 10.946
[1, 15000] loss: 10.979
[1, 15500] loss: 11.001
[1, 16000] loss: 11.032
[1, 16500] loss: 11.069
[1, 17000] loss: 11.090
[1, 17500] loss: 11.112
[1, 18000] loss: 11.119
[1, 18500] loss: 11.132
[1, 19000] loss: 11.212
[1, 19500] loss: 11.188
[1, 20000] loss: 11.213
model.eval()
LanguageNeuralModel(
  (input): Linear(in_features=111418, out_features=250, bias=True)
  (hidden): Linear(in_features=250, out_features=250, bias=True)
  (output): Linear(in_features=250, out_features=111418, bias=True)
)
sets_to_eval = ["drive/MyDrive/dev0/", "drive/MyDrive/dev1/", "drive/MyDrive/test/"]
def load_test_data(test_path, corpus):
    texts = []
    with open(test_path, "r") as file:
        for line in file:
            _, _, left, right, *_ = line.split("\t")
            texts.append(
                (
                    remove_words_outside_corpus_and_encode(clean_text(left), corpus),
                    remove_words_outside_corpus_and_encode(clean_text(right), corpus)
                )
            )
    print(f"Loaded {len(texts)} texts from {test_path}.")
    return texts
words = list(corpus.corpus)
with torch.no_grad():
    for path in sets_to_eval:
        data = load_test_data(path + "in.tsv", corpus)
        results = []
        # process the examples in fixed-size chunks so every line of in.tsv gets
        # exactly one line of predictions (the final chunk may be smaller)
        for batch_start in tqdm.tqdm_notebook(range(0, len(data), BATCH_SIZE)):
            batch = []
            for left, right in data[batch_start:batch_start + BATCH_SIZE]:
                context = left[-CONTEXT_SIZE:] + right[:CONTEXT_SIZE]
                batch.append(corpus.get_bow(context, encode=False))
            batch = (np.array(batch) / (2 * CONTEXT_SIZE)).astype(np.float32)
            X = torch.from_numpy(batch).to(device)
            out = F.softmax(model(X), dim=1).tolist()
            for probs in out:
                # 10000 most probable words with log probabilities, plus the leftover mass
                indexes = sorted(range(len(corpus.corpus)), key=lambda x: probs[x], reverse=True)
                res = ""
                prob0 = 1.
                for idx in indexes[:10000]:
                    prob0 -= probs[idx]
                    res += f"{words[idx]}:{np.log(probs[idx])} "
                res += f":{np.log(prob0)}"
                results.append(res)
        with open(path + "out.tsv", "w+") as f:
            f.write("\n".join(results))
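Each line of out.tsv holds the 10000 most probable words with their natural-log probabilities, followed by a bare ":logprob" entry for the remaining probability mass, so the exponentiated values on a line should sum to roughly 1 (the leftover-mass term can dip slightly below zero numerically, so treat this as approximate). A minimal check sketch, assuming out.tsv was written by the loop above:
def check_out_line(line):
    # sum exp(logprob) over every "word:logprob" entry plus the final ":logprob" rest-mass entry
    total = 0.0
    for entry in line.split(" "):
        _, logprob = entry.rsplit(":", 1)
        total += np.exp(float(logprob))
    return total
with open(sets_to_eval[0] + "out.tsv") as f:
    print(check_out_line(f.readline().strip()))  # should be close to 1.0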
Loaded 19986 texts from drive/MyDrive/dev0/in.tsv.
Loaded 11628 texts from drive/MyDrive/dev1/in.tsv.
Loaded 14132 texts from drive/MyDrive/test/in.tsv.