59 KiB
59 KiB
import re
import numpy as np
from collections import defaultdict
import torch
import torch.nn as nn
import torch.nn.functional as F
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Stop words are stored one per line. The file is only read, so it is opened
# read-only; this also works when the file or its directory is not writable.
with open("stopwords.txt", "r") as f:
    stop_words = f.read().split("\n")
def clean_text(text):
    """Lower-case ``text``, split it on single spaces and strip punctuation.

    Every character that is neither a word character nor whitespace is removed
    from each piece; pieces that end up empty are dropped. Whitespace other
    than the separating space (tabs, newlines) is kept inside the pieces.

    Args:
        text: Raw input string.

    Returns:
        List of cleaned tokens in their original order.
    """
    stripped = (
        re.sub(r'[^\w\s]', '', piece) for piece in text.lower().split(" ")
    )
    return [piece for piece in stripped if piece]
def prepare_corpus(texts, min_count=1, min_word_len=1):
    """Build a token -> index vocabulary from tokenised texts.

    Tokens shorter than ``min_word_len`` and tokens listed in the module-level
    ``stop_words`` are skipped without being counted. Every other token is
    counted, and it enters the vocabulary at the moment its running count
    reaches ``min_count``. Indices are assigned consecutively from 0 in the
    order tokens are admitted.

    Args:
        texts: Iterable of token sequences.
        min_count: Number of occurrences a token needs to be admitted.
        min_word_len: Minimum token length to be considered at all.

    Returns:
        Dict mapping each admitted token to its integer index.
    """
    # Snapshot the stop words into a set once: membership tests against the
    # original list are linear and would run for every token of every text.
    ignored = set(stop_words)
    corpus = {}
    counters = defaultdict(int)
    for text in texts:
        for token in text:
            if len(token) < min_word_len or token in ignored:
                continue
            counters[token] += 1
            if counters[token] == min_count and token not in corpus:
                # Tokens are never removed, so the next free index is the
                # current vocabulary size.
                corpus[token] = len(corpus)
    return corpus
# NOTE(review): nothing in the visible code reads this module-level counter
# (prepare_corpus works on its own local `counters`); it looks like a leftover
# -- confirm before removing.
counters = defaultdict(lambda: 0)
class WordCorpus:
    """Vocabulary wrapper: token -> index lookup plus one-hot / count vectors."""

    def __init__(self, corpus=None, texts=None, min_count=1, min_word_len=1):
        """Wrap an existing vocabulary or build one from ``texts``.

        Args:
            corpus: Ready-made dict mapping token to index. Used as-is when
                it is truthy (i.e. not ``None`` and not empty).
            texts: Token sequences handed to ``prepare_corpus`` when no usable
                ``corpus`` is given.
            min_count: Forwarded to ``prepare_corpus``.
            min_word_len: Forwarded to ``prepare_corpus``.
        """
        # self.corpus: dict mapping token -> integer index.
        if corpus:
            self.corpus = corpus
        else:
            self.corpus = prepare_corpus(texts, min_count, min_word_len)

    @staticmethod
    def _normalize(token):
        """Lower-case ``token`` and drop every non-word, non-space character."""
        return re.sub(r'[^\w\s]', '', token.lower())

    def get_word_idx(self, token):
        """Return the index of ``token`` after normalisation, or ``None``."""
        return self.corpus.get(self._normalize(token), None)

    def get_embedding(self, token, encode=False):
        """Return a one-hot ``int32`` vector of vocabulary length.

        Args:
            token: A vocabulary index when ``encode`` is true, otherwise a raw
                string that is normalised and looked up.
            encode: Whether ``token`` is already an index.

        Returns:
            One-hot vector; all zeros when a string token is empty after
            normalisation or is not in the vocabulary.
        """
        embedding = np.zeros(len(self.corpus), dtype=np.int32)
        if encode:
            token_idx = token
        else:
            token = self._normalize(token)
            if not token or token not in self.corpus:
                return embedding
            token_idx = self.corpus[token]
        embedding[token_idx] = 1
        return embedding

    def get_bow(self, text, encode=False):
        """Return the bag-of-words count vector for a sequence of indices.

        Both values of ``encode`` treat the items of ``text`` as vocabulary
        indices, so the flag does not change the counts; it is kept for
        interface compatibility. Counts are accumulated directly instead of
        summing one full-length one-hot vector per token, and an empty
        ``text`` yields a zero vector.

        Args:
            text: Iterable of vocabulary indices.
            encode: Accepted for compatibility; see above.

        Returns:
            ``int32`` vector of vocabulary length holding per-index counts.
        """
        bow = np.zeros(len(self.corpus), dtype=np.int32)
        for token_idx in text:
            bow[token_idx] += 1
        return bow
def load_train_data(train_path):
    """Read the training TSV and return its texts as token lists.

    The fifth tab-separated field of every line is taken as the text and
    passed through ``clean_text``. The file is opened read-only, since nothing
    is written to it.

    Args:
        train_path: Path to the tab-separated training file.

    Returns:
        List of token lists, one per line of the file.

    Raises:
        ValueError: If a line has fewer than five tab-separated fields.
    """
    texts = []
    with open(train_path, "r") as file:
        for line in file:
            _, _, _, _, text, *_ = line.split("\t")
            texts.append(clean_text(text))
    print(f"Loaded {len(texts)} texts from train_set.")
    return texts
class LanguageNeuralModel(nn.Module):
    """Three-layer feed-forward network mapping a vocabulary-sized vector to
    vocabulary-sized scores, with ReLU after the first two layers."""

    def __init__(self, corpus_size, hidden_size):
        """Create the three linear layers.

        Args:
            corpus_size: Width of both the input and the output layer.
            hidden_size: Width of the two hidden representations.
        """
        super().__init__()
        self.input = nn.Linear(in_features=corpus_size, out_features=hidden_size)
        self.hidden = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.output = nn.Linear(in_features=hidden_size, out_features=corpus_size)

    def forward(self, x):
        """Return the raw output-layer scores for ``x`` (no softmax applied)."""
        for layer in (self.input, self.hidden):
            x = F.relu(layer(x))
        return self.output(x)
def get_random_word_with_contexts(text, context_size):
    """Pick a random word that has a full context window on both sides.

    Args:
        text: Sequence of tokens (must support slicing and ``+``).
        context_size: Number of tokens taken on each side of the chosen word.

    Returns:
        ``(word, context)`` where ``context`` is the ``context_size`` tokens
        before the word followed by the ``context_size`` tokens after it, or
        ``(None, None)`` when ``text`` is too short to hold a full window.
    """
    candidates = np.arange(context_size, len(text) - context_size)
    if candidates.size == 0:
        return None, None
    centre = np.random.choice(candidates)
    before = text[centre - context_size:centre]
    after = text[centre + 1:centre + 1 + context_size]
    return text[centre], before + after
# Quick sanity check: tokenise a sample sentence, then draw a random word
# together with 2 tokens of context on each side.
a = clean_text("Ala ma kota , kot pije mleko")
get_random_word_with_contexts(a, 2)
('kot', ['ma', 'kota', 'pije', 'mleko'])
# Load the training file and tokenise the text field of every line.
train_texts = load_train_data("drive/MyDrive/train.tsv")
Loaded 107471 texts from train_set.
# Build the vocabulary: tokens shorter than 5 characters are skipped, and a
# token is admitted once it has been seen 20 times.
corpus = WordCorpus(texts=train_texts, min_count=20, min_word_len=5)
# Vocabulary size.
len(corpus.corpus)
111418
def remove_words_outside_corpus_and_encode(text, corpus):
    """Drop out-of-vocabulary tokens and map the rest to their indices.

    Args:
        text: Iterable of tokens.
        corpus: Object exposing a ``corpus`` mapping (used for the membership
            test) and a ``get_word_idx`` method (used for the lookup).

    Returns:
        List with ``corpus.get_word_idx(token)`` for every token of ``text``
        that is a key of ``corpus.corpus``, in the original order.
    """
    encoded = []
    for token in text:
        if token not in corpus.corpus:
            continue
        encoded.append(corpus.get_word_idx(token))
    return encoded
# Replace every training text by the indices of its in-vocabulary tokens.
train_texts = [remove_words_outside_corpus_and_encode(text, corpus) for text in train_texts]
# Number of samples per batch (training and evaluation).
BATCH_SIZE = 96
# Number of tokens taken on each side of the target word.
CONTEXT_SIZE = 15
import time
def get_batch(texts):
    """Sample one random training batch of (context, target word) pairs.

    Texts are drawn uniformly at random; a draw is repeated whenever the
    chosen text is too short to provide ``CONTEXT_SIZE`` tokens on both sides
    of a word. The context is turned into a count vector via the module-level
    ``corpus``.

    Args:
        texts: List of index-encoded texts.

    Returns:
        Tuple ``(X, y)``: ``X`` is a ``float32`` array of ``BATCH_SIZE`` count
        vectors divided by ``2 * CONTEXT_SIZE``; ``y`` is an ``int64`` array
        with the target word of each sample.
    """
    n_texts = len(texts)
    features, targets = [], []
    while len(targets) < BATCH_SIZE:
        candidate = texts[np.random.randint(n_texts)]
        target, context = get_random_word_with_contexts(candidate, CONTEXT_SIZE)
        if target is None:
            # Text too short for a full window: draw another one.
            continue
        features.append(corpus.get_bow(context, encode=False))
        targets.append(target)
    scaled = (np.array(features) / (CONTEXT_SIZE * 2)).astype(np.float32)
    labels = np.array(targets).astype(np.int64)
    return scaled, labels
# Network with vocabulary-sized input/output layers and 250 hidden units.
model = LanguageNeuralModel(len(corpus.corpus), 250)
model = model.to(device)
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU also loads when only the CPU is available.
model.load_state_dict(torch.load("drive/MyDrive/model.pth", map_location=device))
<All keys matched successfully>
# Put the module into training mode before the optimisation loop.
model.train()
LanguageNeuralModel( (input): Linear(in_features=111418, out_features=250, bias=True) (hidden): Linear(in_features=250, out_features=250, bias=True) (output): Linear(in_features=250, out_features=111418, bias=True) )
# Cross-entropy between the model's vocabulary-sized scores and the index of
# the target word.
criterion = nn.CrossEntropyLoss().to(device)
# RMSprop over all model parameters.
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
import tqdm
# tqdm.tqdm_notebook is deprecated; tqdm.notebook.tqdm is its replacement.
from tqdm.notebook import tqdm as notebook_tqdm

# Train for 10000 randomly sampled batches; every 500 steps save a checkpoint
# and report the mean loss over those 500 steps.
running_loss = 0.0
for i in notebook_tqdm(range(10000)):
    X, y = get_batch(train_texts)
    X, y = torch.from_numpy(X).to(device), torch.from_numpy(y).to(device)

    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    if i % 500 == 499:
        # NOTE(review): this saves to "model.pth" in the working directory,
        # while the checkpoint is loaded from "drive/MyDrive/model.pth" --
        # confirm the two paths are meant to differ.
        torch.save(model.state_dict(), "model.pth")
        print('[%d, %5d] loss: %.3f' %
              (1, i + 1, running_loss / 500))
        running_loss = 0.0
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook` This is separate from the ipykernel package so we can avoid doing imports until
HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))
[1, 500] loss: 11.095 [1, 1000] loss: 11.138 [1, 1500] loss: 11.202 [1, 2000] loss: 11.237 [1, 2500] loss: 11.209 [1, 3000] loss: 11.261 [1, 3500] loss: 11.302 [1, 4000] loss: 11.303 [1, 4500] loss: 11.283 [1, 5000] loss: 11.305 [1, 5500] loss: 11.321 [1, 6000] loss: 11.348 [1, 6500] loss: 11.335 [1, 7000] loss: 11.272 [1, 7500] loss: 11.347 [1, 8000] loss: 11.320 [1, 8500] loss: 11.301 [1, 9000] loss: 11.307 [1, 9500] loss: 11.310 [1, 10000] loss: 11.274
# Put the module into evaluation mode before generating predictions.
model.eval()
LanguageNeuralModel( (input): Linear(in_features=111418, out_features=250, bias=True) (hidden): Linear(in_features=250, out_features=250, bias=True) (output): Linear(in_features=250, out_features=111418, bias=True) )
# Dataset directories: "in.tsv" is read from each one and predictions are
# appended to "out.tsv" in the same directory.
sets_to_eval = ["drive/MyDrive/dev0/", "drive/MyDrive/dev1/", "drive/MyDrive/test/"]
def load_test_data(test_path, corpus):
    """Read an evaluation TSV and return encoded (left, right) contexts.

    The third and fourth tab-separated fields of every line are taken as the
    left and right context. Each is tokenised with ``clean_text`` and reduced
    to vocabulary indices with ``remove_words_outside_corpus_and_encode``.
    The file is opened read-only, since nothing is written to it.

    Args:
        test_path: Path to the tab-separated input file.
        corpus: Vocabulary object used for encoding.

    Returns:
        List of ``(left, right)`` tuples of index lists, one per line.

    Raises:
        ValueError: If a line has fewer than four tab-separated fields.
    """
    texts = []
    with open(test_path, "r") as file:
        for line in file:
            _, _, left, right, *_ = line.split("\t")
            texts.append(
                (
                    remove_words_outside_corpus_and_encode(clean_text(left), corpus),
                    remove_words_outside_corpus_and_encode(clean_text(right), corpus),
                )
            )
    # Report the file actually read; the previous message always said
    # "train_set", which was wrong for evaluation data.
    print(f"Loaded {len(texts)} texts from {test_path}.")
    return texts
# tqdm.tqdm_notebook is deprecated; tqdm.notebook.tqdm is its replacement.
from tqdm.notebook import tqdm as notebook_tqdm

# Index -> token lookup; relies on the vocabulary dict listing tokens in
# index order.
words = list(corpus.corpus)


def _write_predictions(contexts, out_file):
    """Predict one batch of count vectors and write one output line per row.

    Each line holds the 500 most probable words (fewer if the vocabulary is
    smaller) as ``word:logprob`` pairs separated by spaces, followed by
    ``:logprob`` of the probability mass left for all other words.
    """
    features = (np.array(contexts) / (2 * CONTEXT_SIZE)).astype(np.float32)
    scores = model(torch.from_numpy(features).to(device))
    # The scores are (batch, vocabulary); normalise over the vocabulary axis.
    probs = F.softmax(scores, dim=1)
    # Let torch select the best entries instead of sorting the whole
    # vocabulary in Python for every row. Order among exactly tied
    # probabilities is whatever torch.topk yields.
    top_k = min(500, probs.shape[1])
    top_probs, top_idxs = torch.topk(probs, top_k, dim=1)
    for row_probs, row_idxs in zip(top_probs.tolist(), top_idxs.tolist()):
        remaining = 1.
        parts = []
        for prob, idx in zip(row_probs, row_idxs):
            remaining -= prob
            parts.append(f"{words[idx]}:{np.log(prob)} ")
        # Rounding can push the leftover mass to zero or below, which would
        # put -inf/nan into the output; keep it strictly positive.
        remaining = max(remaining, np.finfo(np.float64).tiny)
        out_file.write("".join(parts) + f":{np.log(remaining)}\n")


with torch.no_grad():
    for path in sets_to_eval:
        data = load_test_data(path + "in.tsv", corpus)
        # NOTE(review): append mode is kept from the original, so re-running
        # this cell adds a second copy of the predictions -- confirm whether
        # the file should be truncated instead.
        with open(path + "out.tsv", "a") as out_file:
            batch = []
            for left, right in notebook_tqdm(data):
                context = left[-CONTEXT_SIZE:] + right[:CONTEXT_SIZE]
                batch.append(corpus.get_bow(context, encode=False))
                if len(batch) == BATCH_SIZE:
                    _write_predictions(batch, out_file)
                    batch = []
            # Flush the last, shorter batch so every input line gets a
            # prediction even when the data size is not a multiple of
            # BATCH_SIZE.
            if batch:
                _write_predictions(batch, out_file)
Loaded 19986 texts from train_set.
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:8: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
HBox(children=(FloatProgress(value=0.0, max=19986.0), HTML(value='')))
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:16: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument. app.launch_new_instance()
Loaded 11628 texts from train_set.
HBox(children=(FloatProgress(value=0.0, max=11628.0), HTML(value='')))
Loaded 14132 texts from train_set.
HBox(children=(FloatProgress(value=0.0, max=14132.0), HTML(value='')))