Compare commits
3 Commits
Author | SHA1 | Date | |
---|---|---|---|
20a4e1e709 | |||
12d7a869ff | |||
43d80423a4 |
5452
dev-0/out.tsv
Normal file
5452
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
165
rlrozwiazanie.py
Normal file
165
rlrozwiazanie.py
Normal file
@ -0,0 +1,165 @@
|
|||||||
|
import gzip
|
||||||
|
from sklearn import metrics
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gensim.models import KeyedVectors
|
||||||
|
|
||||||
|
import re
|
||||||
|
import torch
|
||||||
|
from torch.utils.data import TensorDataset, DataLoader
|
||||||
|
|
||||||
|
def get_str_cleaned(str_dirty):
    """Normalize raw text before word-vector lookup.

    The text is lower-cased, ASCII punctuation is removed, and runs of
    spaces are collapsed into a single space. Collapsing is done *after*
    punctuation removal so that stripping a free-standing mark (as in
    ``"a - b"``) cannot leave a double space, and therefore an empty token,
    behind.

    Args:
        str_dirty: The raw input string.

    Returns:
        The cleaned string. Leading/trailing spaces are kept, so the result
        is empty only when the input contains no characters other than
        punctuation.
    """
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'
    # A single translate() pass replaces one str.replace() call per character.
    without_punctuation = str_dirty.lower().translate(
        str.maketrans('', '', punctuation)
    )
    return re.sub(' +', ' ', without_punctuation)
|
||||||
|
|
||||||
|
# Read the gzipped training set. On every line the text before the first tab
# is the label; the rest of the line is the document text.
train_X = []
train_y = []
with gzip.open('train/train.tsv.gz','r') as fin:
    for raw_line in fin:
        record = raw_line.decode('UTF-8').replace("\n", "")
        label, _, remainder = record.partition("\t")
        # Any further tabs are dropped, gluing the remaining fields together.
        cleared = get_str_cleaned(remainder.replace("\t", ""))

        # Skip samples whose text is empty after cleaning.
        if not cleared:
            continue
        train_y.append(int(label))
        train_X.append(cleared)

# Single-column frame holding the cleaned training texts.
train_X_data = pd.DataFrame(train_X)
|
||||||
|
# The corpora can be downloaded from:
|
||||||
|
#http://dsmodels.nlp.ipipan.waw.pl/dsmodels/nkjp+wiki-forms-all-100-cbow-hs.txt.gz
|
||||||
|
#http://dsmodels.nlp.ipipan.waw.pl/dsmodels/wiki-forms-all-100-skipg-ns.txt.gz
|
||||||
|
#w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/nkjp+wiki-forms-all-100-cbow-hs.txt.gz', binary=False)
|
||||||
|
# Load pre-trained word vectors stored in word2vec text format (binary=False).
# NOTE(review): the file name suggests 100-dimensional skip-gram vectors - confirm.
w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
|
||||||
|
#w2v.save("word2vec.wordvectors")
|
||||||
|
#w2v = KeyedVectors.load("word2vec.wordvectors")
|
||||||
|
|
||||||
|
def document_vector(doc):
    """Return the mean word vector of a document.

    The document is split on the space character and only tokens present in
    the module-level ``w2v`` vocabulary contribute to the mean.

    Args:
        doc: Cleaned document text.

    Returns:
        A 1-D array with the element-wise mean of the known tokens' vectors,
        or an all-zero vector of length ``w2v.vector_size`` when ``doc`` is
        not a string or contains no known token.
    """
    # The fallbacks are spelled out explicitly rather than relying on a bare
    # ``except:``, which would also hide unrelated failures (for instance a
    # model that failed to load) behind a zero vector.
    if not isinstance(doc, str):
        return np.zeros(w2v.vector_size)
    known_words = [word for word in doc.split(' ') if word in w2v]
    if not known_words:
        return np.zeros(w2v.vector_size)
    return np.mean(w2v[known_words], axis=0)
|
||||||
|
|
||||||
|
# Replace every cleaned training text with its averaged word vector.
train_X_data = train_X_data[train_X_data.columns[0]].apply(document_vector)

# Read the dev inputs and their expected labels. The encoding is stated
# explicitly so the text is decoded the same way as the UTF-8 training data,
# independent of the platform's default locale.
with open('dev-0/in.tsv', 'r', encoding='utf-8') as dev_in_file:
    dev_X = [get_str_cleaned(line.rstrip('\n')) for line in dev_in_file]

with open('dev-0/expected.tsv', 'r', encoding='utf-8') as dev_expected_file:
    dev_y = [int(line.rstrip('\n')) for line in dev_expected_file]

# Vectorise the dev texts exactly like the training texts.
dev_X_data = pd.DataFrame(dev_X)
dev_X_data = dev_X_data[dev_X_data.columns[0]].apply(document_vector)
|
||||||
|
|
||||||
|
|
||||||
|
class LogisticRegressionModel(torch.nn.Module):
    """Binary logistic regression: one linear layer followed by a sigmoid.

    Args:
        input_dim: Number of input features. Defaults to 100, so existing
            no-argument construction keeps the original layer size.
    """

    def __init__(self, input_dim=100):
        super().__init__()
        self.fc = torch.nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return sigmoid probabilities for ``x``.

        Args:
            x: Tensor whose last dimension has ``input_dim`` features.

        Returns:
            Tensor of values in (0, 1) with a last dimension of size 1.
        """
        return torch.sigmoid(self.fc(x))
|
||||||
|
|
||||||
|
lr_model = LogisticRegressionModel()

# Binary cross-entropy on the model's sigmoid output, optimised with plain SGD.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(lr_model.parameters(), lr = 0.1)

# Stack the per-document vectors into one 2-D array before building the
# tensor; handing torch.tensor a pandas Series of arrays makes it convert the
# data element by element.
train_x_tensor = torch.tensor(np.stack(list(train_X_data))).float()
train_y_tensor = torch.tensor(train_y).float()

train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
# No batch_size is given, so the DataLoader default of one sample per batch applies.
train_loader = DataLoader(dataset=train_dataset)

dev_x_tensor = torch.tensor(np.stack(list(dev_X_data))).float()
dev_y_tensor = torch.tensor(dev_y).float()

dev_dataset = TensorDataset(dev_x_tensor, dev_y_tensor)
# One sample per batch here as well; the evaluation loop reads each
# prediction with .item(), which needs a single-element output.
dev_loader = DataLoader(dataset=dev_dataset)

n_epochs = 2
|
||||||
|
|
||||||
|
def make_train_step(model, loss_fn, optimizer):
    """Build a closure that performs one optimisation step on a batch.

    Args:
        model: The module being trained.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            loss tensor.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        A function taking ``(x, y)`` that trains on that batch and returns
        the batch loss as a Python float.
    """

    def _run_batch(x, y):
        # Make sure the model is in training mode on every call.
        model.train()
        # Add an axis at position 1 to the targets before they are compared
        # with the model output.
        target = y.unsqueeze(1)
        batch_loss = loss_fn(model(x), target)
        batch_loss.backward()
        optimizer.step()
        # Clear the gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        return batch_loss.item()

    return _run_batch
|
||||||
|
|
||||||
|
train_step = make_train_step(lr_model, criterion, optimizer)
training_losses = []
validation_losses = []

for epoch in range(n_epochs):
    # Rebuilt every epoch, so after the loop it holds the last epoch's
    # dev predictions.
    y_pred = []
    losses = [train_step(x_batch, y_batch) for x_batch, y_batch in train_loader]
    training_loss = np.mean(losses)
    training_losses.append(training_loss)

    # Evaluation: switch to eval mode once per epoch instead of once per
    # batch (train_step switches back to training mode on its next call).
    lr_model.eval()
    with torch.no_grad():
        val_losses = []
        for x_val, y_val in dev_loader:
            yhat = lr_model(x_val)
            # .item() needs a single-element output, i.e. one sample per batch.
            y_pred.append(int(yhat.item() > 0.5))
            val_loss = criterion(yhat, y_val.unsqueeze(1))
            val_losses.append(val_loss.item())
        validation_loss = np.mean(val_losses)
        validation_losses.append(validation_loss)

    print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t Validation loss: {validation_loss:.3f}")

score1 = metrics.accuracy_score(dev_y, y_pred)
print("accuracy: %0.5f" % score1)

# The context manager closes the file even if writing fails; writelines gets
# an iterable of lines rather than one string per call.
with open('dev-0/out.tsv', "w") as out_file:
    out_file.writelines("{}\n".format(i) for i in y_pred)
|
||||||
|
|
||||||
|
|
||||||
|
# Read and clean the test-A inputs. The encoding is stated explicitly so the
# text is decoded the same way as the UTF-8 training data.
with open('test-A/in.tsv', 'r', encoding='utf-8') as test_in_file:
    test_X = [get_str_cleaned(line.rstrip('\n')) for line in test_in_file]

test_X_data = pd.DataFrame(test_X)
test_X_data = test_X_data[test_X_data.columns[0]].apply(document_vector)
# Stack the per-document vectors into one 2-D array before building the tensor.
test_x_tensor = torch.tensor(np.stack(list(test_X_data))).float()

val_y_pred = []
lr_model.eval()
with torch.no_grad():
    # Iterating the 2-D tensor yields one document vector at a time.
    for x_val in test_x_tensor:
        yhat = lr_model(x_val)
        val_y_pred.append(int(yhat.item() > 0.5))

# The context manager closes the file even if writing fails.
with open('test-A/out.tsv', "w") as out_file:
    out_file.writelines("{}\n".format(i) for i in val_y_pred)
|
67
rozwiazanie.py
Normal file
67
rozwiazanie.py
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
#import numpy as np
|
||||||
|
import gzip
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
|
||||||
|
from sklearn import metrics
|
||||||
|
|
||||||
|
#df = pd.read_csv('sport-text-classification-ball-ISI-public/train/train.tsv.gz', compression='gzip', header=None, sep='\t', error_bad_lines=False)
|
||||||
|
# Read the gzipped training set. On every line the text before the first tab
# is the label (kept as a string); the rest of the line is the document text.
train_X = []
train_y = []
with gzip.open('train/train.tsv.gz','r') as fin:
    for raw_line in fin:
        record = raw_line.decode('UTF-8').replace("\n", "")
        label, _, remainder = record.partition("\t")
        train_y.append(label)
        # Any further tabs are dropped, gluing the remaining fields together.
        train_X.append(remainder.replace("\t", ""))
|
||||||
|
|
||||||
|
# Read the dev inputs and their expected labels (labels stay strings, like
# the training labels). The encoding is stated explicitly so the text is
# decoded the same way as the UTF-8 training data, independent of the
# platform's default locale.
with open('dev-0/in.tsv', 'r', encoding='utf-8') as test_in_file:
    test_X = [line.rstrip('\n') for line in test_in_file]

with open('dev-0/expected.tsv', 'r', encoding='utf-8') as test_expected_file:
    test_y = [line.rstrip('\n') for line in test_expected_file]
|
||||||
|
|
||||||
|
# Fit TF-IDF features on the training texts (tokens are lower-cased).
vectorizer = TfidfVectorizer(lowercase = True)
X_train_tf = vectorizer.fit_transform(train_X)
print("n_samples: %d, n_features: %d" % X_train_tf.shape)

# Transform the dev texts with the already-fitted vectorizer.
X_test_tf = vectorizer.transform(test_X)
print("n_samples: %d, n_features: %d" % X_test_tf.shape)

# Train a multinomial naive Bayes classifier on the training features.
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_y)

# Predicted labels for the dev set.
y_pred = naive_bayes_classifier.predict(X_test_tf)
|
||||||
|
|
||||||
|
score1 = metrics.accuracy_score(test_y, y_pred)
print("accuracy: %0.3f" % score1)

# No target_names are passed: the report then names each row after the actual
# label value. classification_report orders classes by sorted label, so a
# hand-written list such as ['1', '0'] attaches the names to the wrong rows.
print(metrics.classification_report(test_y, y_pred))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, y_pred))

print('------------------------------')
|
||||||
|
|
||||||
|
# Write one predicted label per line. The context manager closes the file
# even if writing fails; writelines gets an iterable of lines rather than one
# string per call.
with open('dev-0/out.tsv', "w") as out_file:
    out_file.writelines("{}\n".format(i) for i in y_pred)
|
||||||
|
|
||||||
|
# Read the test-A inputs. The encoding is stated explicitly so the text is
# decoded the same way as the UTF-8 training data.
with open('test-A/in.tsv', 'r', encoding='utf-8') as test_in_file:
    val_X = [line.rstrip('\n') for line in test_in_file]

# Transform with the already-fitted vectorizer, then predict.
X_val_tf = vectorizer.transform(val_X)
print("n_samples: %d, n_features: %d" % X_val_tf.shape)

val_y_pred = naive_bayes_classifier.predict(X_val_tf)

# Write one predicted label per line; the context manager closes the file
# even if writing fails.
with open('test-A/out.tsv', "w") as out_file:
    out_file.writelines("{}\n".format(i) for i in val_y_pred)
|
5447
test-A/out.tsv
Normal file
5447
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user