Compare commits
2 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
93adc7c664 | ||
|
eb6976bcd9 |
5272
dev-0/out.tsv
Normal file
5272
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
51
solution.py
Normal file
51
solution.py
Normal file
@ -0,0 +1,51 @@
|
||||
import numpy
|
||||
import lzma
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn import preprocessing
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
|
||||
|
||||
# Training data: xz-compressed input documents and plain-text labels.
TRAIN_IN = "./train/in.tsv.xz"
TRAIN_EXPECTED = "./train/expected.tsv"

# Evaluation split directories; predict() reads '<dir>/in.tsv.xz' and
# save_result() writes '<dir>/out.tsv'.
DEV_0 = "dev-0"
TEST_A = "test-A"
|
||||
|
||||
|
||||
def open_file(path, encoding="utf-8"):
    """Return all lines of the text file at ``path``.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform locale.

    Returns:
        A list of lines, each keeping its trailing newline.
    """
    with open(path, encoding=encoding) as handle:
        return handle.readlines()
|
||||
|
||||
|
||||
def open_xz(path, encoding="utf-8"):
    """Return all lines of the xz-compressed text file at ``path``.

    Args:
        path: Location of the ``.xz`` file to read.
        encoding: Text encoding of the decompressed content. Defaults to
            UTF-8 so the result does not depend on the platform locale.

    Returns:
        A list of decoded lines, each keeping its trailing newline.
    """
    with lzma.open(path, 'rt', encoding=encoding) as handle:
        return handle.readlines()
|
||||
|
||||
|
||||
def get_model(train_in, train_expected):
    """Fit a TF-IDF + multinomial naive Bayes pipeline on the training data.

    Args:
        train_in: Training documents passed to the pipeline's ``fit``.
        train_expected: Labels for ``train_in``; they are integer-encoded
            with a ``LabelEncoder`` before fitting.

    Returns:
        The fitted pipeline.

    Note:
        The label encoder is not returned, so the pipeline predicts the
        encoded class indices rather than the original label values.
    """
    encoded_labels = preprocessing.LabelEncoder().fit_transform(train_expected)
    classifier = make_pipeline(TfidfVectorizer(), MultinomialNB())
    return classifier.fit(train_in, encoded_labels)
|
||||
|
||||
|
||||
def predict(train_test_in_path, train_in_path, train_expected_path):
    """Train a model and predict labels for one evaluation split.

    Args:
        train_test_in_path: Directory of the split to predict; its
            ``in.tsv.xz`` file is read.
        train_in_path: Path to the xz-compressed training documents.
        train_expected_path: Path to the training labels.

    Returns:
        The predictions produced by the fitted model for the split.
    """
    # Read every input before fitting so a missing file fails early.
    documents = open_xz(train_in_path)
    labels = open_file(train_expected_path)
    unseen = open_xz(train_test_in_path + '/in.tsv.xz')
    return get_model(documents, labels).predict(unseen)
|
||||
|
||||
|
||||
def save_result(path, prediction):
    """Write ``prediction`` as integers, one per line, to ``<path>/out.tsv``.

    Args:
        path: Directory that receives the ``out.tsv`` file.
        prediction: Array-like of predicted values.
    """
    target = path + "/out.tsv"
    numpy.savetxt(target, prediction, fmt='%d')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # The pipeline is deterministic, so fit it once and reuse it for every
    # split instead of retraining inside predict() for each one.
    model = get_model(open_xz(TRAIN_IN), open_file(TRAIN_EXPECTED))

    for split_dir in (DEV_0, TEST_A):
        split_prediction = model.predict(open_xz(split_dir + '/in.tsv.xz'))
        save_result(split_dir, split_prediction)
|
134
solution_2.py
Normal file
134
solution_2.py
Normal file
@ -0,0 +1,134 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import torch
|
||||
import csv
|
||||
from nltk.tokenize import word_tokenize
|
||||
from gensim.models import Word2Vec
|
||||
import gensim.downloader
|
||||
|
||||
# Column names assigned to the header-less TSV files.
CONTENT = 'content'
ID = 'id'
LABEL = 'label'
col_names = [CONTENT, ID, LABEL]

# Input locations.
TRAIN_IN_PATH = 'train/in.tsv.xz'
TRAIN_EXP_PATH = 'train/expected.tsv'
DEV_PATH = 'dev-0/in.tsv.xz'
TEST_PATH = 'test-A/in.tsv.xz'

# Output locations for the predictions.
DEV_OUT_PATH = './dev-0/out.tsv'
TEST_OUT_PATH = './test-A/out.tsv'

# Training and network hyper-parameters.
BATCH_SIZE = 10
INPUT_SIZE = 300  # presumably the dimensionality of the embeddings loaded below
HIDDEN_SIZE = 600
NUM_CLASSES = 1  # passed as the output width of the network

# Pretrained embeddings, fetched through gensim's downloader at import time.
word2vec = gensim.downloader.load('word2vec-google-news-300')
|
||||
|
||||
|
||||
class NeuralNetwork(torch.nn.Module):
    """Feed-forward network: linear -> ReLU -> linear -> sigmoid."""

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the two linear layers.

        Args:
            input_size: Number of input features.
            hidden_size: Width of the hidden layer.
            num_classes: Width of the output layer.
        """
        super().__init__()
        self.l1 = torch.nn.Linear(input_size, hidden_size)
        self.l2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the sigmoid activations of the output layer for ``x``."""
        hidden = torch.relu(self.l1(x))
        return torch.sigmoid(self.l2(hidden))
|
||||
|
||||
|
||||
def load_set(path, col_n):
    """Load a header-less, tab-separated file into a DataFrame.

    Quoting is disabled so quote characters in the text are kept literally.
    Malformed rows are skipped with a warning instead of aborting the load.

    Args:
        path: Location of the (optionally compressed) TSV file.
        col_n: Column names to assign to the loaded table.

    Returns:
        A ``pandas.DataFrame`` with the columns named by ``col_n``.
    """
    options = dict(quoting=csv.QUOTE_NONE, header=None, names=col_n)
    try:
        # ``error_bad_lines`` was removed in pandas 2.0; ``on_bad_lines='warn'``
        # is its replacement for error_bad_lines=False with warnings enabled.
        return pd.read_table(path, on_bad_lines='warn', **options)
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines`` yet.
        return pd.read_table(path, error_bad_lines=False, **options)
|
||||
|
||||
|
||||
def to_lower(t_set, header):
    """Return the ``header`` column of ``t_set`` converted to lower case."""
    return t_set[header].str.lower()
|
||||
|
||||
|
||||
def tokenize(t_set):
    """Tokenize every text in ``t_set`` with NLTK's ``word_tokenize``.

    Args:
        t_set: Iterable of strings.

    Returns:
        A list holding one token list per input text, in input order.
    """
    return list(map(word_tokenize, t_set))
|
||||
|
||||
|
||||
def word_2_vec(t_set, w2v):
    """Embed each tokenized document as the mean of its known word vectors.

    Args:
        t_set: Iterable of token lists, one per document.
        w2v: Mapping-like embedding lookup supporting ``word in w2v`` and
            ``w2v[word]``. If it exposes ``vector_size``, that value sets the
            size of the fallback vector; otherwise 300 is used.

    Returns:
        A list with one 1-D vector per document. Documents without any known
        token get an all-zero vector.
    """
    # Take the dimension from the embeddings so the zero fallback always has
    # the same length as the averaged vectors.
    dim = getattr(w2v, 'vector_size', 300)

    doc_vectors = []
    for content in t_set:
        known = [w2v[word] for word in content if word in w2v]
        if known:
            doc_vectors.append(np.mean(known, axis=0))
        else:
            doc_vectors.append(np.zeros(dim))
    return doc_vectors
|
||||
|
||||
|
||||
def calc_prediction(x_t_set, batch_len, t_model):
    """Run ``t_model`` over ``x_t_set`` in batches and threshold at 0.5.

    Args:
        x_t_set: Sequence of equally sized feature vectors.
        batch_len: Number of vectors fed to the model per call.
        t_model: Callable mapping a float tensor batch to scores.

    Returns:
        A list with one entry per input vector, each being the nested list
        form of ``score > 0.5`` for that row (e.g. ``[True]`` for a model
        with a single output).
    """
    pred = []
    for start in range(0, len(x_t_set), batch_len):
        # Stack the batch into one float32 array first: building a tensor
        # directly from a list of ndarrays is very slow.
        batch = np.asarray(x_t_set[start:start + batch_len], dtype=np.float32)
        out = t_model(torch.tensor(batch))
        # extend() keeps accumulation linear; ``pred = pred + ...`` re-copies
        # the whole list on every batch.
        pred.extend((out > 0.5).tolist())
    return pred
|
||||
|
||||
|
||||
def predict(p_model, batch_len, x_t_test):
    """Predict with ``p_model`` in evaluation mode and without gradients.

    Args:
        p_model: Trained torch module; it is switched to eval mode.
        batch_len: Batch size forwarded to ``calc_prediction``.
        x_t_test: Feature vectors to classify.

    Returns:
        The list of thresholded predictions from ``calc_prediction``.
    """
    p_model.eval()
    with torch.no_grad():
        return calc_prediction(x_t_test, batch_len, p_model)
|
||||
|
||||
|
||||
def train_model(model_to_train, y_t_train, x_t_train, batch_size=None, epochs=6, lr=0.01):
    """Train ``model_to_train`` with mini-batch SGD and binary cross-entropy.

    Batches are taken in input order (no shuffling).

    Args:
        model_to_train: Torch module whose output has one column per sample
            with values in [0, 1], as ``BCELoss`` requires.
        y_t_train: Labels, one per sample (pandas Series, array or list).
        x_t_train: Sequence of equally sized feature vectors.
        batch_size: Samples per optimisation step. Defaults to the
            module-level ``BATCH_SIZE``.
        epochs: Number of passes over the training data.
        lr: Learning rate of the SGD optimiser.

    Returns:
        ``model_to_train`` itself, updated in place.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    # Convert the labels once, as an (N, 1) float32 column matching the model
    # output, instead of re-converting a slice for every batch.
    labels = torch.tensor(np.asarray(y_t_train, dtype=np.float32)).reshape(-1, 1)

    cri = torch.nn.BCELoss()
    opt = torch.optim.SGD(model_to_train.parameters(), lr=lr)

    model_to_train.train()
    for _ in range(epochs):
        for start in range(0, labels.shape[0], batch_size):
            stop = start + batch_size
            # Stack the batch into one array first: building a tensor from a
            # list of ndarrays is very slow.
            t_x = torch.tensor(np.asarray(x_t_train[start:stop], dtype=np.float32))

            out = model_to_train(t_x)
            loss = cri(out, labels[start:stop])

            opt.zero_grad()
            loss.backward()
            opt.step()
    return model_to_train
|
||||
|
||||
|
||||
# --- Load the raw tables -------------------------------------------------
t_set_features = load_set(TRAIN_IN_PATH, col_names[:2])
t_set_labels = load_set(TRAIN_EXP_PATH, col_names[2:])
dev_set = load_set(DEV_PATH, col_names[:2])
test_set = load_set(TEST_PATH, col_names[:2])

# --- Lower-case the text and pick the label column -----------------------
x_train = to_lower(t_set_features, CONTENT)
y_train = t_set_labels[LABEL]
x_dev = to_lower(dev_set, CONTENT)
x_test = to_lower(test_set, CONTENT)

# --- Tokenize, then average the word vectors of each document ------------
x_train, x_dev, x_test = tokenize(x_train), tokenize(x_dev), tokenize(x_test)
x_train = word_2_vec(x_train, word2vec)
x_dev = word_2_vec(x_dev, word2vec)
x_test = word_2_vec(x_test, word2vec)

# --- Train the classifier ------------------------------------------------
model = NeuralNetwork(INPUT_SIZE, HIDDEN_SIZE, NUM_CLASSES)
trained_model = train_model(model, y_train, x_train)

# --- Predict both evaluation splits --------------------------------------
dev_prediction = predict(trained_model, 10, x_dev)
test_prediction = predict(trained_model, 10, x_test)

# predict() has already put the model in eval mode; this keeps it there.
trained_model.eval()

# --- Write the predictions as integers, one per line ---------------------
dev_prediction = np.asarray(dev_prediction, dtype=np.int32)
test_prediction = np.asarray(test_prediction, dtype=np.int32)
dev_prediction.tofile(DEV_OUT_PATH, sep='\n')
test_prediction.tofile(TEST_OUT_PATH, sep='\n')
|
5152
test-A/out.tsv
Normal file
5152
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user