Compare commits


No commits in common. "master" and "master" have entirely different histories.

18 changed files with 0 additions and 910987 deletions

View File

@@ -1,6 +0,0 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -1,69 +0,0 @@
'''
Author: Dominik Strzałko
Date: 05.08.2021
Task: naive Bayes 2 with an off-the-shelf library (Skeptic vs paranormal subreddits)
Results from geval:
Likelihood 0.0000
Accuracy   0.7367
F1.0       0.4367
Precision  0.8997
Recall     0.2883
'''
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


def open_tsv(tsv):
    '''
    Read a TSV file as a list of text lines.
    Takes the path to a .tsv file,
    e.g. X = open_tsv("train/expected.tsv")
    '''
    with open(tsv) as f:
        return f.readlines()


def Create_model(X_tsv, Y_tsv):
    '''
    Build the machine-learning model.
    Takes the X_train and Y_train sets as .tsv files,
    e.g. model = Create_model("train/in.tsv", "train/expected.tsv")
    '''
    X = open_tsv(X_tsv)
    Y = open_tsv(Y_tsv)
    Y = LabelEncoder().fit_transform(Y)
    pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())
    return pipeline.fit(X, Y)


def predict(model, X_tsv, file_name):
    '''
    Predict labels for the X set using the trained model. The third argument
    is the name of the prediction file to be saved to disk,
    e.g. predict(model, "dev-0/in.tsv", "dev-0/out.tsv")
    '''
    X = open_tsv(X_tsv)
    prediction = model.predict(X)
    np.savetxt(file_name, prediction, fmt='%d')


def main():
    model = Create_model("train/in.tsv", "train/expected.tsv")
    predict(model, "dev-0/in.tsv", "dev-0/out.tsv")
    predict(model, "test-A/in.tsv", "test-A/out.tsv")


if __name__ == '__main__':
    main()
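
A minimal sketch (not part of the repository) of how the geval scores above could be cross-checked locally with scikit-learn. It assumes a dev-0/expected.tsv file with one gold label per line, aligned with the generated dev-0/out.tsv, and that LabelEncoder maps the labels in the same order as during training.

# Hypothetical local check of the dev-0 scores (sketch only).
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder

with open("dev-0/expected.tsv") as f:
    gold = LabelEncoder().fit_transform(f.readlines())  # assumed to match the training label order
pred = np.loadtxt("dev-0/out.tsv", dtype=int)

print("Accuracy ", accuracy_score(gold, pred))
print("Precision", precision_score(gold, pred))
print("Recall   ", recall_score(gold, pred))
print("F1.0     ", f1_score(gold, pred))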

112
LogReg.py
View File

@@ -1,112 +0,0 @@
import pandas as pd
import numpy as np
import torch
from nltk.tokenize import word_tokenize
import gensim.downloader as api

# Load X and Y for Train and X for Dev and Test
X_train = pd.read_table('train/in.tsv', sep='\t', error_bad_lines=False, quoting=3, header=None, names=['content', 'id'], usecols=['content'])
y_train = pd.read_table('train/expected.tsv', sep='\t', error_bad_lines=False, quoting=3, header=None, names=['label'])
X_dev = pd.read_table('dev-0/in.tsv', sep='\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])
X_test = pd.read_table('test-A/in.tsv', sep='\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])

# Lowercase the datasets
# https://www.datacamp.com/community/tutorials/case-conversion-python
X_train = X_train.content.str.lower()
X_dev = X_dev.content.str.lower()
X_test = X_test.content.str.lower()

y_train = y_train['label']  # DataFrame to Series

# Tokenize the datasets
# https://www.nltk.org/_modules/nltk/tokenize.html
X_train = [word_tokenize(doc) for doc in X_train]
X_dev = [word_tokenize(doc) for doc in X_dev]
X_test = [word_tokenize(doc) for doc in X_test]

# word2vec, following Mr Jakub's advice
# https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html
# https://www.kaggle.com/kstathou/word-embeddings-logistic-regression
w2v = api.load('word2vec-google-news-300')

def document_vector(doc):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    return np.mean([w2v[w] for w in doc if w in w2v] or [np.zeros(300)], axis=0)

X_train = [document_vector(doc) for doc in X_train]
X_dev = [document_vector(doc) for doc in X_dev]
X_test = [document_vector(doc) for doc in X_test]

# Neural network from lab 8
# https://git.wmi.amu.edu.pl/filipg/aitech-eks-pub/src/branch/master/cw/08_regresja_logistyczna.ipynb
class NeuralNetwork(torch.nn.Module):
    def __init__(self, hidden_size):
        super(NeuralNetwork, self).__init__()
        self.l1 = torch.nn.Linear(300, hidden_size)  # word2vec-google-news-300 always yields 300-dimensional inputs
        self.l2 = torch.nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = self.l1(x)
        x = torch.relu(x)
        x = self.l2(x)
        x = torch.sigmoid(x)
        return x

model = NeuralNetwork(600)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

batch_size = 15

# Model training from lab 8
# https://git.wmi.amu.edu.pl/filipg/aitech-eks-pub/src/branch/master/cw/08_regresja_logistyczna.ipynb
for epoch in range(5):
    model.train()
    for i in range(0, y_train.shape[0], batch_size):
        X = X_train[i:i+batch_size]
        X = torch.tensor(X)
        y = y_train[i:i+batch_size]
        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)

        outputs = model(X.float())
        loss = criterion(outputs, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

y_dev = []
y_test = []

# Predictions
# model.eval() notifies all layers that the model is in eval mode
model.eval()

# torch.no_grad() deactivates the autograd engine, which reduces memory usage and speeds up inference
with torch.no_grad():
    for i in range(0, len(X_dev), batch_size):
        X = X_dev[i:i+batch_size]
        X = torch.tensor(X)

        outputs = model(X.float())

        y = (outputs > 0.5)
        y_dev.extend(y)

    for i in range(0, len(X_test), batch_size):
        X = X_test[i:i+batch_size]
        X = torch.tensor(X)

        outputs = model(X.float())

        y = (outputs > 0.5)
        y_test.extend(y)

# Generate the output files
y_dev = np.asarray(y_dev, dtype=np.int32).reshape(-1)    # flatten (n, 1) predictions into a 1-D label column
y_test = np.asarray(y_test, dtype=np.int32).reshape(-1)

y_dev_df = pd.DataFrame({'label': y_dev})
y_test_df = pd.DataFrame({'label': y_test})

y_dev_df.to_csv(r'dev-0/out.tsv', sep='\t', index=False, header=False)
y_test_df.to_csv(r'test-A/out.tsv', sep='\t', index=False, header=False)
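
For reference, a toy sketch (not in the repository) of the same mean-of-word-vectors idea used by document_vector above, with a tiny hand-made vocabulary instead of the large word2vec-google-news-300 model, so the zero-vector fallback for all-out-of-vocabulary documents is easy to see.

# Toy illustration of averaging word vectors into a document vector (sketch only).
import numpy as np

toy_vocab = {                        # hypothetical 3-dimensional "embeddings"
    "ghost": np.array([1.0, 0.0, 0.0]),
    "photo": np.array([0.0, 1.0, 0.0]),
}

def toy_document_vector(doc, dim=3):
    # Average the vectors of in-vocabulary words; fall back to zeros if none are known.
    vecs = [toy_vocab[w] for w in doc if w in toy_vocab]
    return np.mean(vecs or [np.zeros(dim)], axis=0)

print(toy_document_vector(["ghost", "photo", "unknownword"]))  # [0.5 0.5 0. ]
print(toy_document_vector(["nothing", "in", "vocabulary"]))    # [0. 0. 0.]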

View File

@@ -1,215 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import torch\n",
"from nltk.tokenize import word_tokenize\n",
"import gensim.downloader as api"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"# Wczytanie X i Y do Train oraz X do Dev i Test\n",
"X_train = pd.read_table('train/in.tsv', sep='\\t', error_bad_lines=False, quoting=3, header=None, names=['content', 'id'], usecols=['content'])\n",
"y_train = pd.read_table('train/expected.tsv', sep='\\t', error_bad_lines=False, quoting=3, header=None, names=['label'])\n",
"X_dev = pd.read_table('dev-0/in.tsv', sep='\\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])\n",
"X_test = pd.read_table('test-A/in.tsv', sep='\\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"# lowercase-ing zbiorów\n",
"# https://www.datacamp.com/community/tutorials/case-conversion-python\n",
"X_train = X_train.content.str.lower()\n",
"X_dev = X_dev.content.str.lower()\n",
"X_test = X_test.content.str.lower()\n",
"\n",
"y_train = y_train['label'] #Df do Series?"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# tokenizacja zbiorów\n",
"#https://www.nltk.org/_modules/nltk/tokenize.html\n",
"X_train = [word_tokenize(doc) for doc in X_train]\n",
"X_dev = [word_tokenize(doc) for doc in X_dev]\n",
"X_test = [word_tokenize(doc) for doc in X_test]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"# word2vec zgodnie z poradą Pana Jakuba\n",
"# https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html\n",
"# https://www.kaggle.com/kstathou/word-embeddings-logistic-regression\n",
"w2v = api.load('word2vec-google-news-300')\n",
"\n",
"def document_vector(doc):\n",
" \"\"\"Create document vectors by averaging word vectors. Remove out-of-vocabulary words.\"\"\"\n",
" return np.mean([w2v[w] for w in doc if w in w2v] or [np.zeros(300)], axis=0)\n",
"\n",
"X_train = [document_vector(doc) for doc in X_train]\n",
"X_dev = [document_vector(doc) for doc in X_dev]\n",
"X_test = [document_vector(doc) for doc in X_test]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Sieć neuronowa z ćwiczeń 8\n",
"#https://git.wmi.amu.edu.pl/filipg/aitech-eks-pub/src/branch/master/cw/08_regresja_logistyczna.ipynb\n",
"class NeuralNetwork(torch.nn.Module): \n",
" def __init__(self, hidden_size):\n",
" super(NeuralNetwork, self).__init__()\n",
" self.l1 = torch.nn.Linear(300, hidden_size) #Korzystamy z word2vec-google-news-300 który ma zawsze na wejściu wymiar 300\n",
" self.l2 = torch.nn.Linear(hidden_size, 1)\n",
"\n",
" def forward(self, x):\n",
" x = self.l1(x)\n",
" x = torch.relu(x)\n",
" x = self.l2(x)\n",
" x = torch.sigmoid(x)\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"model = NeuralNetwork(600)\n",
"\n",
"criterion = torch.nn.BCELoss()\n",
"optimizer = torch.optim.SGD(model.parameters(), lr = 0.1)\n",
"\n",
"batch_size = 15"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"# Trening modelu z ćwiczeń 8\n",
"#https://git.wmi.amu.edu.pl/filipg/aitech-eks-pub/src/branch/master/cw/08_regresja_logistyczna.ipynb\n",
"for epoch in range(5):\n",
" model.train()\n",
" for i in range(0, y_train.shape[0], batch_size):\n",
" X = X_train[i:i+batch_size]\n",
" X = torch.tensor(X)\n",
" y = y_train[i:i+batch_size]\n",
" y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1,1)\n",
"\n",
" outputs = model(X.float())\n",
" loss = criterion(outputs, y)\n",
"\n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" optimizer.step()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"y_dev = []\n",
"y_test = []\n",
"\n",
"#model.eval() will notify all your layers that you are in eval mode\n",
"model.eval()\n",
"\n",
"#torch.no_grad() impacts the autograd engine and deactivate it. It will reduce memory usage and speed up\n",
"with torch.no_grad():\n",
" for i in range(0, len(X_dev), batch_size):\n",
" X = X_dev[i:i+batch_size]\n",
" X = torch.tensor(X)\n",
" \n",
" outputs = model(X.float())\n",
" \n",
" y = (outputs > 0.5)\n",
" y_dev.extend(y)\n",
"\n",
" for i in range(0, len(X_test), batch_size):\n",
" X = X_test[i:i+batch_size]\n",
" X = torch.tensor(X)\n",
"\n",
" outputs = model(X.float())\n",
"\n",
" y = (outputs > 0.5)\n",
" y_test.extend(y)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"y_dev = np.asarray(y_dev, dtype=np.int32)\n",
"y_test = np.asarray(y_test, dtype=np.int32)\n",
"\n",
"y_dev_df = pd.DataFrame({'label':y_dev})\n",
"y_test_df = pd.DataFrame({'label':y_test})\n",
"\n",
"y_dev_df.to_csv(r'dev-0/out.tsv', sep='\\t', index=False, header=False)\n",
"y_test_df.to_csv(r'test-A/out.tsv', sep='\\t', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -11,29 +11,3 @@ Sources
-------
Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
Results from geval (Using Naive Bayes)
-------
Likelihood 0.0000
Accuracy 0.7367
F1.0 0.4367
Precision 0.8997
Recall 0.2883
Results from geval (Using Log Reg (NN in PyTorch))
-------
Likelihood 0.0000
Accuracy 0.7561
F1.0 0.6152
Precision 0.6965
Recall 0.5509
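
As a quick sanity check (a sketch, not part of the repository): F1.0 is the harmonic mean of precision and recall, so both reported F1.0 values can be recomputed from the rows above.

def f1(precision, recall):
    # Harmonic mean of precision and recall.
    return 2 * precision * recall / (precision + recall)

print(round(f1(0.8997, 0.2883), 4))  # 0.4367 -> matches the Naive Bayes row
print(round(f1(0.6965, 0.5509), 4))  # 0.6152 -> matches the logistic-regression row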

View File

@ -1,6 +0,0 @@
Results from geval:
Likelihood 0.0000
Accuracy 0.7367
F1.0 0.4367
Precision 0.8997
Recall 0.2883

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

BIN
geval

Binary file not shown.

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

289579
train/in.tsv

File diff suppressed because one or more lines are too long