Compare commits
4 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
f26ab9bdbe | ||
|
81bd23dbcb | ||
|
11867437bf | ||
|
c3ce71c113 |
5272
dev-0/out.tsv
Normal file
5272
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
42
run.py
Normal file
42
run.py
Normal file
@ -0,0 +1,42 @@
|
||||
import pandas as pd
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
|
||||
|
||||
def _read_lines(path):
    """Return the lines of *path* (newlines kept) as a one-column frame."""
    with open(path, 'r', encoding='utf-8') as handle:
        return pd.DataFrame(handle.readlines(), columns=['text'])


def _write_predictions(path, labels):
    """Write one predicted label per line to *path*."""
    with open(path, 'w') as handle:
        for label in labels:
            handle.write(f'{label}\n')


# * Data loading: training texts and labels, then validation and test texts
x_train = _read_lines('train/in.tsv')
y_train = pd.read_csv('train/expected.tsv', sep='\t',
                      names=['paranormal'], encoding='utf-8')
x_dev = _read_lines('dev-0/in.tsv')
x_test = _read_lines('test-A/in.tsv')

# * Training data preparation (the vectorizer is fitted on training text only)
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, max_features=500)
x_train_vectorized = tfidf_vectorizer.fit_transform(x_train['text'].values)

# * Model training
mnb_model = MultinomialNB()
mnb_model.fit(x_train_vectorized, y_train.values.ravel())

# * Validation data: vectorize, predict, save predictions
x_dev_prepared = tfidf_vectorizer.transform(x_dev['text'].values)
predictions = mnb_model.predict(x_dev_prepared)
_write_predictions('dev-0/out.tsv', predictions)

# * Test data: vectorize, predict, save predictions
x_test_vectorized = tfidf_vectorizer.transform(x_test['text'].values)
predictions = mnb_model.predict(x_test_vectorized)
_write_predictions('test-A/out.tsv', predictions)
|
149
run_pytorch.py
Normal file
149
run_pytorch.py
Normal file
@ -0,0 +1,149 @@
|
||||
import torch
|
||||
import numpy as np
|
||||
from gensim.models import Word2Vec
|
||||
import lzma
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class ScepticNetwork(torch.nn.Module):
    """Feed-forward binary classifier with a single hidden layer.

    The input of width ``features`` is projected to 500 units, passed
    through ReLU, projected to a single unit and squashed with a sigmoid,
    so every output value lies in (0, 1).
    """

    def __init__(self, features=100):
        """Create the two linear layers; ``features`` is the input width."""
        super().__init__()
        self.lin_1 = torch.nn.Linear(features, 500)
        self.lin_2 = torch.nn.Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for the batch ``x``."""
        hidden = torch.relu(self.lin_1(x))
        return torch.sigmoid(self.lin_2(hidden))
|
||||
|
||||
|
||||
|
||||
def evaluate(model, X, Y, criterion, batch_size):
    """Compute the mean loss and accuracy of ``model`` over ``(X, Y)``.

    The data is processed in consecutive batches of ``batch_size`` rows.
    The model is switched to evaluation mode and is left in that mode.

    Args:
        model: callable torch module mapping a float32 batch to predictions.
        X: features; must support slicing and ``.astype`` (e.g. NumPy array).
        Y: labels; must expose ``.shape`` and support slicing and ``.astype``.
        criterion: callable ``(predictions, targets) -> scalar loss tensor``.
        batch_size: number of rows per batch.

    Returns:
        Tuple ``(mean_loss, accuracy)``, both averaged over all items.

    Raises:
        ZeroDivisionError: if ``Y`` is empty.
    """
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.eval()
    # No gradients are needed for scoring; disabling autograd avoids building
    # a computation graph for every evaluation batch.
    with torch.no_grad():
        for i in range(0, Y.shape[0], batch_size):
            X_tens = torch.tensor(X[i:i + batch_size].astype(np.float32))
            Y_tens = torch.tensor(
                Y[i:i + batch_size].astype(np.float32)).reshape(-1, 1)
            Y_predictions = model(X_tens)
            # A prediction counts as class 1 when it is above 0.5.
            acc_score += torch.sum((Y_predictions > 0.5) == Y_tens).item()
            items_total += Y_tens.shape[0]

            loss = criterion(Y_predictions, Y_tens)

            # Weight by batch size so a short final batch is not over-counted.
            loss_score += loss.item() * Y_tens.shape[0]
    return (loss_score / items_total), (acc_score / items_total)
|
||||
|
||||
|
||||
def train(model,
          x_train,
          y_train,
          optimizer,
          criterion=torch.nn.BCELoss(),
          epochs=5,
          batch_size=256):
    """Train ``model`` with mini-batches and report metrics after each epoch.

    Batches are taken in order (no shuffling). After every epoch the loss
    and accuracy on the whole training set are computed with ``evaluate``
    and printed.

    Args:
        model: torch module to optimise.
        x_train: features; must support slicing and ``.astype``.
        y_train: labels; must support ``len``, slicing and ``.astype``.
        optimizer: torch optimizer already bound to ``model``'s parameters.
        criterion: loss callable ``(predictions, targets) -> scalar tensor``.
        epochs: number of passes over the training data.
        batch_size: number of rows per mini-batch.
    """
    for epoch in range(epochs):
        # evaluate() leaves the model in eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()
        for i in range(0, len(y_train), batch_size):
            X_tens = torch.tensor(x_train[i:i + batch_size].astype(np.float32))
            Y_tens = torch.tensor(y_train[i:i + batch_size].astype(
                np.float32)).reshape(-1, 1)

            optimizer.zero_grad()
            loss = criterion(model(X_tens), Y_tens)
            loss.backward()
            optimizer.step()

        print(f'Epoch {epoch+1}/{epochs}')
        loss, accuracy = evaluate(model, x_train, y_train, criterion,
                                  batch_size)
        print(f'Train set\nloss = {loss}, accuracy = {accuracy}')
|
||||
|
||||
|
||||
def flatten(t):
    """Flatten a list of lists into one list of integer strings.

    Each item is truncated with ``int`` and then converted with ``str``,
    e.g. ``[[1.0], [0.0]]`` becomes ``['1', '0']``.
    """
    labels = []
    for sublist in t:
        labels.extend(str(int(item)) for item in sublist)
    return labels
|
||||
|
||||
|
||||
def predict(model, data):
    """Return the rounded model outputs for ``data`` as a flat list of strings.

    ``data`` is converted to a float32 tensor, passed through ``model``
    without gradient tracking, rounded, and flattened with ``flatten``.
    """
    inputs = torch.tensor(data.astype(np.float32))
    with torch.no_grad():
        rounded = model(inputs).round()
    return flatten(rounded.tolist())
|
||||
|
||||
|
||||
# Input files. The first entry is read as training data in the __main__
# section; predictions are made for every remaining entry.
PATHS = ['train/in.tsv', 'dev-0/in.tsv', 'test-A/in.tsv']
|
||||
|
||||
|
||||
def read_data(path, train=True):
    """Read a UTF-8 text file and return one entry per line.

    Every line is stripped of surrounding whitespace. When ``train`` is
    truthy each line is additionally split on whitespace into a list of
    tokens; otherwise the stripped line is returned as a plain string.
    Progress messages are printed before and after reading.
    """
    print(f"I am reading the data from {path}...")
    with open(path, 'r', encoding='utf-8') as handle:
        stripped = [raw.strip() for raw in handle]
    data = [entry.split() for entry in stripped] if train else stripped
    print("Data loaded")
    return data
|
||||
|
||||
|
||||
def save_predictions(path, preds):
    """Write ``preds`` one per line to ``out.pt.tsv`` next to the input set.

    The output directory is the first ``/``-separated component of
    ``path`` (e.g. ``dev-0/in.tsv`` -> ``dev-0/out.pt.tsv``).
    """
    top_dir = path.split('/')[0]
    new_path = f"{top_dir}/out.pt.tsv"
    print(f"Saving predictions to {new_path}")
    with open(new_path, 'w') as sink:
        sink.writelines(f'{line}\n' for line in preds)
|
||||
|
||||
|
||||
def vectorize_data(data, vectorizer):
    """Turn tokenised documents into fixed-size vectors by averaging.

    Each document is represented by the mean of its word vectors. Words
    missing from ``vectorizer.wv`` contribute a zero vector, and a document
    with no tokens at all is represented by a zero vector (averaging an
    empty list would otherwise yield NaN and a ragged result).

    Args:
        data: iterable of documents, each an iterable of word tokens.
        vectorizer: object whose ``wv`` attribute supports ``word in wv``
            and ``wv[word]``. The vector width is read from
            ``wv.vector_size`` when present and falls back to 100.

    Returns:
        NumPy array with one row per document.
    """
    wv = vectorizer.wv
    dim = getattr(wv, 'vector_size', 100)
    zero = np.zeros(dim, dtype=float)
    result = []
    for doc in data:
        vectors = [wv[word] if word in wv else zero for word in doc]
        # np.array() below copies, so sharing ``zero`` between rows is safe.
        result.append(np.mean(vectors, axis=0) if vectors else zero)
    return np.array(result)
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':

    # * Load training data
    data = read_data(PATHS[0])
    # Documents differ in token count, so the array has to be built with
    # dtype=object; without it NumPy >= 1.24 raises ValueError on the
    # ragged nested list.
    x_train = np.array(data, dtype=object)
    y_train = np.array(read_data('train/expected.tsv', False))
    print(
        f"X_data: {x_train[:5]} {type(x_train)}, y_data: {y_train[:5]} {type(y_train)}\nx shape:{x_train.shape}\ty shape: {y_train.shape}"
    )

    # * Vectorize data (word vectors are trained on the training texts only)
    w2v = Word2Vec(x_train, vector_size=100, min_count=2)
    x_train_vec = vectorize_data(x_train, w2v)

    # * Loading & training model
    model = ScepticNetwork()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.15)
    print("Now I will train the model...")
    train(model, x_train_vec, y_train, epochs=50, optimizer=optimizer)
    print("Training completed!\n\n")

    # * Making predictions for every non-training input file
    for path in PATHS[1:]:
        X = vectorize_data(read_data(path), w2v)
        print(f"I will make predictions for {path}")
        predictions = predict(model, X)
        print(f'Saving predictions for {path}')
        save_predictions(path, predictions)
|
5152
test-A/out.tsv
Normal file
5152
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user