Compare commits
No commits in common. "master" and "master" have entirely different histories.
8
.idea/.gitignore
vendored
8
.idea/.gitignore
vendored
@ -1,8 +0,0 @@
|
|||||||
# Default ignored files
|
|
||||||
/shelf/
|
|
||||||
/workspace.xml
|
|
||||||
# Datasource local storage ignored files
|
|
||||||
/../../../../../:\Projects\PycharmProjects\paranormal-or-skeptic-ISI-public\.idea/dataSources/
|
|
||||||
/dataSources.local.xml
|
|
||||||
# Editor-based HTTP Client requests
|
|
||||||
/httpRequests/
|
|
@ -1,6 +0,0 @@
|
|||||||
<component name="InspectionProjectProfileManager">
|
|
||||||
<settings>
|
|
||||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
|
||||||
<version value="1.0" />
|
|
||||||
</settings>
|
|
||||||
</component>
|
|
@ -1,4 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (base)" project-jdk-type="Python SDK" />
|
|
||||||
</project>
|
|
@ -1,8 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="ProjectModuleManager">
|
|
||||||
<modules>
|
|
||||||
<module fileurl="file://$PROJECT_DIR$/.idea/paranormal-or-skeptic-ISI-public.iml" filepath="$PROJECT_DIR$/.idea/paranormal-or-skeptic-ISI-public.iml" />
|
|
||||||
</modules>
|
|
||||||
</component>
|
|
||||||
</project>
|
|
@ -1,8 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<module type="PYTHON_MODULE" version="4">
|
|
||||||
<component name="NewModuleRootManager">
|
|
||||||
<content url="file://$MODULE_DIR$" />
|
|
||||||
<orderEntry type="jdk" jdkName="Python 3.8 (base)" jdkType="Python SDK" />
|
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
|
||||||
</component>
|
|
||||||
</module>
|
|
@ -1,6 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="VcsDirectoryMappings">
|
|
||||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
|
||||||
</component>
|
|
||||||
</project>
|
|
132
classificator.py
132
classificator.py
@ -1,132 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import torch
|
|
||||||
import gensim
|
|
||||||
import gensim.downloader as api
|
|
||||||
from sklearn.feature_extraction.text import HashingVectorizer
|
|
||||||
from sklearn.metrics import accuracy_score
|
|
||||||
|
|
||||||
|
|
||||||
def load_train_data(nrows=30000, data_dir="train"):
    """Load the training texts together with their labels.

    Reads ``<data_dir>/in.tsv.xz`` as a tab-separated file whose two columns
    are named ``text`` and ``id`` (the ``id`` column is dropped), and
    ``<data_dir>/expected.tsv`` as a single ``label`` column.

    Args:
        nrows: Maximum number of rows read from each of the two files.
            ``None`` reads the files completely.
        data_dir: Directory that contains ``in.tsv.xz`` and ``expected.tsv``.

    Returns:
        A ``(data, labels)`` tuple: ``data`` is a DataFrame with a single
        ``text`` column, ``labels`` is a NumPy array with the label values.
    """
    data = pd.read_csv(
        f"{data_dir}/in.tsv.xz", sep='\t', names=['text', 'id'], nrows=nrows
    ).drop(columns=['id'])
    # The same row cap is applied to both files so that row i of the texts
    # and row i of the labels are read from the same position.
    labels_df = pd.read_csv(
        f"{data_dir}/expected.tsv", sep='\t', names=['label'], nrows=nrows
    )
    return data, labels_df['label'].values
|
|
||||||
|
|
||||||
|
|
||||||
def load_test_data(data_dir="test-A"):
    """Load the unlabeled test texts.

    Reads ``<data_dir>/in.tsv.xz`` as a tab-separated file whose two columns
    are named ``text`` and ``id``; the ``id`` column is dropped.

    Args:
        data_dir: Directory that contains ``in.tsv.xz``.

    Returns:
        A DataFrame with a single ``text`` column.
    """
    data = pd.read_csv(f"{data_dir}/in.tsv.xz", sep='\t', names=['text', 'id'])
    return data.drop(columns=['id'])
|
|
||||||
|
|
||||||
|
|
||||||
def load_dev_data(data_dir="dev-0"):
    """Load the development texts together with their labels.

    Reads ``<data_dir>/in.tsv.xz`` as a tab-separated file whose two columns
    are named ``text`` and ``id`` (the ``id`` column is dropped), and
    ``<data_dir>/expected.tsv`` as a single ``label`` column.

    Args:
        data_dir: Directory that contains ``in.tsv.xz`` and ``expected.tsv``.

    Returns:
        A ``(data, labels)`` tuple: ``data`` is a DataFrame with a single
        ``text`` column, ``labels`` is a NumPy array with the label values.
    """
    data = pd.read_csv(
        f"{data_dir}/in.tsv.xz", sep='\t', names=['text', 'id']
    ).drop(columns=['id'])
    labels_df = pd.read_csv(f"{data_dir}/expected.tsv", sep='\t', names=['label'])
    return data, labels_df['label'].values
|
|
||||||
|
|
||||||
|
|
||||||
class NeuralNetworkModel(torch.nn.Module):
    """Feed-forward network with one hidden layer and a sigmoid output.

    The forward pass is ``Linear -> ReLU -> Linear -> sigmoid``, producing one
    value in the open interval (0, 1) per input row.

    Args:
        features: Size of each input vector.
        hidden_size: Number of units in the hidden layer (500 by default).
    """

    def __init__(self, features, hidden_size=500):
        super().__init__()
        # Attribute names are kept as fc1/fc2 so that the keys of the
        # module's state_dict stay the same.
        self.fc1 = torch.nn.Linear(features, hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the sigmoid-activated output for the batch ``x``."""
        hidden = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))
|
|
||||||
|
|
||||||
|
|
||||||
# def tokenize(doc):
|
|
||||||
# doc_splited = doc.split(" ")
|
|
||||||
# doc_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in doc_splited]
|
|
||||||
# doc_tokenized_str = ""
|
|
||||||
# print(doc_tokenized)
|
|
||||||
# for word in doc_tokenized:
|
|
||||||
# doc_tokenized_str += word[0]
|
|
||||||
# doc_tokenized += " "
|
|
||||||
# print(doc_tokenized_str)
|
|
||||||
# return doc_tokenized_str
|
|
||||||
|
|
||||||
|
|
||||||
# def document_vector(doc):
|
|
||||||
# """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
|
|
||||||
# doc = [word for word in doc if word in w2v.key_to_index]
|
|
||||||
# return np.mean(w2v[doc], axis=0)
|
|
||||||
|
|
||||||
|
|
||||||
def train_model(model, X, Y, batch_size=5, epoch_amount=5,
                criterion=None, optimizer=None):
    """Train ``model`` on ``(X, Y)`` with mini-batch gradient descent.

    Args:
        model: ``torch.nn.Module`` whose output has one column per row and is
            compared against 0.5 to obtain the predicted class.
        X: Feature rows; either a sparse matrix exposing ``toarray()`` or a
            dense array-like. Must support row slicing.
        Y: Array-like of labels, one per row of ``X``.
        batch_size: Number of rows per optimisation step.
        epoch_amount: Number of passes over the data.
        criterion: Loss callable ``criterion(predictions, targets)``.
            Defaults to ``torch.nn.BCELoss()``.
        optimizer: Optimiser bound to the parameters of ``model``.
            Defaults to ``torch.optim.SGD(model.parameters(), lr=0.1)``.

    Returns:
        ``(mean_loss, accuracy)`` of the last epoch, both measured on the
        batches as they were seen during training. ``(0.0, 0.0)`` is returned
        when no epoch ran or there were no rows.
    """
    # Taking these as parameters removes the dependency on names that only
    # exist when the module runs as a script.
    if criterion is None:
        criterion = torch.nn.BCELoss()
    if optimizer is None:
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    Y = np.asarray(Y)
    mean_loss, accuracy = 0.0, 0.0

    for epoch in range(epoch_amount):
        loss_score = 0.0
        acc_score = 0
        items_total = 0
        model.train()
        for i in range(0, Y.shape[0], batch_size):
            X_step = X[i:i + batch_size]
            if hasattr(X_step, "toarray"):
                # Sparse batch: only this slice is densified.
                X_step = X_step.toarray()
            X_step = torch.tensor(np.asarray(X_step, dtype=np.float32))
            Y_step = torch.tensor(
                np.asarray(Y[i:i + batch_size], dtype=np.float32)
            ).reshape(-1, 1)

            Y_predictions = model(X_step)
            # Accuracy is taken from the predictions made before this
            # batch's parameter update.
            hits = (Y_predictions > 0.5).float() == Y_step
            acc_score += torch.sum(hits).item()
            items_total += Y_step.shape[0]

            optimizer.zero_grad()
            loss = criterion(Y_predictions, Y_step)
            loss.backward()
            optimizer.step()
            # Weight by batch length so a shorter final batch does not skew
            # the epoch mean.
            loss_score += loss.item() * Y_step.shape[0]

        if items_total:
            mean_loss = loss_score / items_total
            accuracy = acc_score / items_total
        print("epoch: ", epoch+1, "/", epoch_amount)

    return mean_loss, accuracy
|
|
||||||
|
|
||||||
|
|
||||||
def test_model(model, X):
    """Predict a 0/1 class for every row of ``X``.

    Args:
        model: ``torch.nn.Module`` producing one value per input row.
        X: Feature rows; either a sparse matrix exposing ``toarray()`` or a
            dense array-like.

    Returns:
        A list of Python ints, 1 where the model output is greater than 0.5
        and 0 otherwise, in row order.
    """
    model.eval()
    if hasattr(X, "toarray"):
        X = X.toarray()
    X = torch.tensor(np.asarray(X, dtype=np.float32))
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        Y_raw = model(X)
    return (Y_raw.reshape(-1) > 0.5).to(torch.int64).tolist()


# The name starts with "test_", so pytest would otherwise try to collect this
# helper as a test case whenever it is imported into a test module.
test_model.__test__ = False
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Experiment settings: hashed feature count, batch size, epoch count.
    FEATURES = 20000
    BATCH = 5
    EPOCHES = 5

    # Load the training split.
    print("Loading data...")
    train_frame, train_labels = load_train_data()

    # Turn the raw texts into hashed feature vectors.
    print("Vectorizing text data...")
    vectorizer = HashingVectorizer(n_features=FEATURES)
    train_matrix = vectorizer.fit_transform(train_frame['text'].values)

    # Build and train the network.
    print("Training model...")
    nn_model = NeuralNetworkModel(FEATURES)
    # Keep these two module-level names exactly as they are: train_model may
    # rely on finding `criterion` and `optimizer` at module scope.
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.1)
    train_model(nn_model, train_matrix, train_labels,
                batch_size=BATCH, epoch_amount=EPOCHES)

    # Predict on the dev split and write the predictions. The dev labels are
    # loaded but not scored here.
    print("Testing model...")
    dev_frame, _dev_labels = load_dev_data()
    dev_predictions = test_model(
        nn_model, vectorizer.transform(dev_frame['text'].values)
    )
    np.savetxt("dev-0/out.tsv", dev_predictions, fmt='%i', delimiter="\t")

    # Predict on the test split and write the predictions.
    test_frame = load_test_data()
    test_predictions = test_model(
        nn_model, vectorizer.transform(test_frame['text'].values)
    )
    np.savetxt("test-A/out.tsv", test_predictions, fmt='%i', delimiter="\t")
|
|
5272
dev-0/out.tsv
5272
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
5152
test-A/out.tsv
5152
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user