Compare commits

...

12 Commits

Author            SHA1        Message                     Date
Dominik           aa41d7394e  Update 'README.md'          2021-05-22 19:00:33 +02:00
Dominik Strzałko  b658cdb2ed  new README                  2021-05-22 18:57:41 +02:00
Dominik Strzałko  6dbb5168eb  ".py" version of LogReg     2021-05-22 18:52:56 +02:00
Dominik Strzałko  265216824e  Test 3 Outputs              2021-05-22 17:07:56 +02:00
Dominik Strzałko  c68b2d0d1a  Test 2 Outputs              2021-05-22 16:01:18 +02:00
                  68a99a2c2d  Test Outputs                2021-05-22 15:05:39 +02:00
                  8614bc1e2f  Update 'README.md'          2021-05-08 23:50:07 +02:00
Dominik           df889206ae  Added "Wyniki_z_geval.txt"  2021-05-08 23:47:18 +02:00
Dominik Strzałko  71104493a9  Final Version (+ geval)     2021-05-08 23:02:56 +02:00
Dominik           48a3c4eace  Final Version               2021-05-08 23:00:32 +02:00
Dominik Strzałko  f9172f10a0  Pre-final Version           2021-05-08 22:45:55 +02:00
                  3aefd799a6  Demo Version                2021-05-08 19:02:05 +02:00
18 changed files with 910987 additions and 0 deletions


@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 4
}

69
Bayes.py Normal file

@@ -0,0 +1,69 @@
'''
Author: Dominik Strzałko
Date: 05.08.2021
Task: naive Bayes 2 with an off-the-shelf library (Skeptic vs paranormal subreddits)

Results from geval:
Likelihood   0.0000
Accuracy     0.7367
F1.0         0.4367
Precision    0.8997
Recall       0.2883
'''
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


def open_tsv(tsv):
    '''
    Loads a .tsv file as a list of text lines.
    Takes the path to the .tsv file as input,
    e.g. X = open_tsv("train/expected.tsv")
    '''
    with open(tsv) as f:
        return f.readlines()


def Create_model(X_tsv, Y_tsv):
    '''
    Builds the machine learning model.
    Takes the X_train and Y_train sets as .tsv file paths,
    e.g. model = Create_model("train/in.tsv", "train/expected.tsv")
    '''
    X = open_tsv(X_tsv)
    Y = open_tsv(Y_tsv)
    Y = LabelEncoder().fit_transform(Y)
    pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())
    return pipeline.fit(X, Y)


def predict(model, X_tsv, file_name):
    '''
    Generates predictions from the model and an X set. The third argument
    is the name of the prediction file to save to disk,
    e.g. predict(model, "dev-0/in.tsv", "dev-0/out.tsv")
    '''
    X = open_tsv(X_tsv)
    prediction = model.predict(X)
    np.savetxt(file_name, prediction, fmt='%d')


def main():
    model = Create_model("train/in.tsv", "train/expected.tsv")
    predict(model, "dev-0/in.tsv", "dev-0/out.tsv")
    predict(model, "test-A/in.tsv", "test-A/out.tsv")


if __name__ == '__main__':
    main()
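
For a quick local sanity check of the saved predictions without running the bundled geval binary, a minimal sketch using sklearn's metrics instead; it assumes a dev-0/expected.tsv gold-label file, which is the usual geval challenge layout but is not shown in this diff:

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder

# Assumed path: dev-0/expected.tsv holds the gold labels for dev-0/in.tsv.
# LabelEncoder sorts labels, so the encoding matches the one used in training
# as long as both files use the same two label strings.
with open("dev-0/expected.tsv") as f:
    gold = LabelEncoder().fit_transform(f.readlines())
pred = np.loadtxt("dev-0/out.tsv", dtype=int)  # written by predict() above

p, r, f1, _ = precision_recall_fscore_support(gold, pred, average="binary")
print(f"Accuracy  {accuracy_score(gold, pred):.4f}")
print(f"Precision {p:.4f}  Recall {r:.4f}  F1 {f1:.4f}")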

112
LogReg.py Normal file

@@ -0,0 +1,112 @@
import pandas as pd
import numpy as np
import torch
from nltk.tokenize import word_tokenize
import gensim.downloader as api

# Load X and Y for Train, and X for Dev and Test
X_train = pd.read_table('train/in.tsv', sep='\t', error_bad_lines=False, quoting=3, header=None, names=['content', 'id'], usecols=['content'])
y_train = pd.read_table('train/expected.tsv', sep='\t', error_bad_lines=False, quoting=3, header=None, names=['label'])
X_dev = pd.read_table('dev-0/in.tsv', sep='\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])
X_test = pd.read_table('test-A/in.tsv', sep='\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])

# lowercasing the datasets
# https://www.datacamp.com/community/tutorials/case-conversion-python
X_train = X_train.content.str.lower()
X_dev = X_dev.content.str.lower()
X_test = X_test.content.str.lower()

y_train = y_train['label']  # DataFrame to Series

# tokenizing the datasets
# https://www.nltk.org/_modules/nltk/tokenize.html
X_train = [word_tokenize(doc) for doc in X_train]
X_dev = [word_tokenize(doc) for doc in X_dev]
X_test = [word_tokenize(doc) for doc in X_test]

# word2vec, as advised by Jakub
# https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html
# https://www.kaggle.com/kstathou/word-embeddings-logistic-regression
w2v = api.load('word2vec-google-news-300')

def document_vector(doc):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    return np.mean([w2v[w] for w in doc if w in w2v] or [np.zeros(300)], axis=0)

X_train = [document_vector(doc) for doc in X_train]
X_dev = [document_vector(doc) for doc in X_dev]
X_test = [document_vector(doc) for doc in X_test]

# Neural network from exercise set 8
# https://git.wmi.amu.edu.pl/filipg/aitech-eks-pub/src/branch/master/cw/08_regresja_logistyczna.ipynb
class NeuralNetwork(torch.nn.Module):
    def __init__(self, hidden_size):
        super(NeuralNetwork, self).__init__()
        self.l1 = torch.nn.Linear(300, hidden_size)  # word2vec-google-news-300 always yields 300-dimensional inputs
        self.l2 = torch.nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = self.l1(x)
        x = torch.relu(x)
        x = self.l2(x)
        x = torch.sigmoid(x)
        return x

model = NeuralNetwork(600)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

batch_size = 15

# Model training from exercise set 8
# https://git.wmi.amu.edu.pl/filipg/aitech-eks-pub/src/branch/master/cw/08_regresja_logistyczna.ipynb
for epoch in range(5):
    model.train()
    for i in range(0, y_train.shape[0], batch_size):
        X = X_train[i:i+batch_size]
        X = torch.tensor(X)
        y = y_train[i:i+batch_size]
        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)

        outputs = model(X.float())
        loss = criterion(outputs, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

y_dev = []
y_test = []

# Predictions
# model.eval() notifies all layers that we are in eval mode
model.eval()

# torch.no_grad() deactivates the autograd engine, reducing memory usage and speeding things up
with torch.no_grad():
    for i in range(0, len(X_dev), batch_size):
        X = X_dev[i:i+batch_size]
        X = torch.tensor(X)

        outputs = model(X.float())

        y = (outputs > 0.5)
        y_dev.extend(y)

    for i in range(0, len(X_test), batch_size):
        X = X_test[i:i+batch_size]
        X = torch.tensor(X)

        outputs = model(X.float())

        y = (outputs > 0.5)
        y_test.extend(y)

# Generate the output files
# reshape(-1) flattens the (n, 1) column of batch outputs so pandas gets a 1-D array
y_dev = np.asarray(y_dev, dtype=np.int32).reshape(-1)
y_test = np.asarray(y_test, dtype=np.int32).reshape(-1)

y_dev_df = pd.DataFrame({'label': y_dev})
y_test_df = pd.DataFrame({'label': y_test})

y_dev_df.to_csv(r'dev-0/out.tsv', sep='\t', index=False, header=False)
y_test_df.to_csv(r'test-A/out.tsv', sep='\t', index=False, header=False)
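
As an aside (not part of the original script), the averaging in document_vector is easy to check on a toy vocabulary; a self-contained sketch with made-up 3-dimensional vectors standing in for the 300-dimensional Google News embeddings:

import numpy as np

toy_w2v = {  # hypothetical embeddings, 3-d instead of 300-d
    'ghost': np.array([1.0, 0.0, 0.0]),
    'story': np.array([0.0, 1.0, 0.0]),
}

def toy_document_vector(doc):
    # Same logic as document_vector above: average the in-vocabulary word
    # vectors; the `or` falls back to a zero vector when every word is OOV.
    return np.mean([toy_w2v[w] for w in doc if w in toy_w2v] or [np.zeros(3)], axis=0)

print(toy_document_vector(['ghost', 'story', 'unknown']))  # [0.5 0.5 0. ]
print(toy_document_vector(['unknown']))                    # [0. 0. 0.]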

215
LogReg_Test.ipynb Normal file

@@ -0,0 +1,215 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import torch\n",
"from nltk.tokenize import word_tokenize\n",
"import gensim.downloader as api"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"# Wczytanie X i Y do Train oraz X do Dev i Test\n",
"X_train = pd.read_table('train/in.tsv', sep='\\t', error_bad_lines=False, quoting=3, header=None, names=['content', 'id'], usecols=['content'])\n",
"y_train = pd.read_table('train/expected.tsv', sep='\\t', error_bad_lines=False, quoting=3, header=None, names=['label'])\n",
"X_dev = pd.read_table('dev-0/in.tsv', sep='\\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])\n",
"X_test = pd.read_table('test-A/in.tsv', sep='\\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"# lowercase-ing zbiorów\n",
"# https://www.datacamp.com/community/tutorials/case-conversion-python\n",
"X_train = X_train.content.str.lower()\n",
"X_dev = X_dev.content.str.lower()\n",
"X_test = X_test.content.str.lower()\n",
"\n",
"y_train = y_train['label'] #Df do Series?"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# tokenizacja zbiorów\n",
"#https://www.nltk.org/_modules/nltk/tokenize.html\n",
"X_train = [word_tokenize(doc) for doc in X_train]\n",
"X_dev = [word_tokenize(doc) for doc in X_dev]\n",
"X_test = [word_tokenize(doc) for doc in X_test]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"# word2vec zgodnie z poradą Pana Jakuba\n",
"# https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html\n",
"# https://www.kaggle.com/kstathou/word-embeddings-logistic-regression\n",
"w2v = api.load('word2vec-google-news-300')\n",
"\n",
"def document_vector(doc):\n",
" \"\"\"Create document vectors by averaging word vectors. Remove out-of-vocabulary words.\"\"\"\n",
" return np.mean([w2v[w] for w in doc if w in w2v] or [np.zeros(300)], axis=0)\n",
"\n",
"X_train = [document_vector(doc) for doc in X_train]\n",
"X_dev = [document_vector(doc) for doc in X_dev]\n",
"X_test = [document_vector(doc) for doc in X_test]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Sieć neuronowa z ćwiczeń 8\n",
"#https://git.wmi.amu.edu.pl/filipg/aitech-eks-pub/src/branch/master/cw/08_regresja_logistyczna.ipynb\n",
"class NeuralNetwork(torch.nn.Module): \n",
" def __init__(self, hidden_size):\n",
" super(NeuralNetwork, self).__init__()\n",
" self.l1 = torch.nn.Linear(300, hidden_size) #Korzystamy z word2vec-google-news-300 który ma zawsze na wejściu wymiar 300\n",
" self.l2 = torch.nn.Linear(hidden_size, 1)\n",
"\n",
" def forward(self, x):\n",
" x = self.l1(x)\n",
" x = torch.relu(x)\n",
" x = self.l2(x)\n",
" x = torch.sigmoid(x)\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"model = NeuralNetwork(600)\n",
"\n",
"criterion = torch.nn.BCELoss()\n",
"optimizer = torch.optim.SGD(model.parameters(), lr = 0.1)\n",
"\n",
"batch_size = 15"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"# Trening modelu z ćwiczeń 8\n",
"#https://git.wmi.amu.edu.pl/filipg/aitech-eks-pub/src/branch/master/cw/08_regresja_logistyczna.ipynb\n",
"for epoch in range(5):\n",
" model.train()\n",
" for i in range(0, y_train.shape[0], batch_size):\n",
" X = X_train[i:i+batch_size]\n",
" X = torch.tensor(X)\n",
" y = y_train[i:i+batch_size]\n",
" y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1,1)\n",
"\n",
" outputs = model(X.float())\n",
" loss = criterion(outputs, y)\n",
"\n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" optimizer.step()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"y_dev = []\n",
"y_test = []\n",
"\n",
"#model.eval() will notify all your layers that you are in eval mode\n",
"model.eval()\n",
"\n",
"#torch.no_grad() impacts the autograd engine and deactivate it. It will reduce memory usage and speed up\n",
"with torch.no_grad():\n",
" for i in range(0, len(X_dev), batch_size):\n",
" X = X_dev[i:i+batch_size]\n",
" X = torch.tensor(X)\n",
" \n",
" outputs = model(X.float())\n",
" \n",
" y = (outputs > 0.5)\n",
" y_dev.extend(y)\n",
"\n",
" for i in range(0, len(X_test), batch_size):\n",
" X = X_test[i:i+batch_size]\n",
" X = torch.tensor(X)\n",
"\n",
" outputs = model(X.float())\n",
"\n",
" y = (outputs > 0.5)\n",
" y_test.extend(y)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"y_dev = np.asarray(y_dev, dtype=np.int32)\n",
"y_test = np.asarray(y_test, dtype=np.int32)\n",
"\n",
"y_dev_df = pd.DataFrame({'label':y_dev})\n",
"y_test_df = pd.DataFrame({'label':y_test})\n",
"\n",
"y_dev_df.to_csv(r'dev-0/out.tsv', sep='\\t', index=False, header=False)\n",
"y_test_df.to_csv(r'test-A/out.tsv', sep='\\t', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -11,3 +11,29 @@ Sources
-------
Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.

Results from geval (Using Naive Bayes)
-------
Likelihood   0.0000
Accuracy     0.7367
F1.0         0.4367
Precision    0.8997
Recall       0.2883

Results from geval (Using Log Reg (NN from Pytorch))
-------
Likelihood   0.0000
Accuracy     0.7561
F1.0         0.6152
Precision    0.6965
Recall       0.5509
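
As a quick consistency check (not in the original README), both F1.0 rows match what the listed precision and recall imply, since F1 is the harmonic mean of the two:

# F1 = 2*p*r / (p + r), the harmonic mean of precision and recall
for name, p, r in [("Naive Bayes", 0.8997, 0.2883), ("LogReg (NN)", 0.6965, 0.5509)]:
    print(f"{name}: F1 = {2 * p * r / (p + r):.4f}")  # 0.4367 and 0.6152, matching geval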

6
Wyniki_z_geval.txt Normal file

@@ -0,0 +1,6 @@
Results from geval:
Likelihood 0.0000
Accuracy 0.7367
F1.0 0.4367
Precision 0.8997
Recall 0.2883

File diff suppressed because it is too large

File diff suppressed because it is too large

5272
dev-0/Bayes/out.tsv Normal file

File diff suppressed because it is too large

5272
dev-0/in.tsv Normal file

File diff suppressed because one or more lines are too long

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large

BIN
geval Executable file

Binary file not shown.

5152
test-A/Bayes/out.tsv Normal file

File diff suppressed because it is too large

5152
test-A/in.tsv Normal file

File diff suppressed because one or more lines are too long

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

289579
train/in.tsv Normal file

File diff suppressed because one or more lines are too long