Compare commits
No commits in common. "master" and "master" have entirely different histories.
@@ -1,6 +0,0 @@
{
 "cells": [],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}
Bayes.py (69 lines)
@@ -1,69 +0,0 @@
'''
Author: Dominik Strzałko
Date: 05.08.2021
Task: naive bayes2, using an off-the-shelf library (Skeptic vs paranormal subreddits)

Results from geval:
Likelihood  0.0000
Accuracy    0.7367
F1.0        0.4367
Precision   0.8997
Recall      0.2883
'''
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


def open_tsv(tsv):
    '''
    Load a .tsv file as a list of text lines.

    Takes the path to a .tsv file,
    e.g. X = open_tsv("train/expected.tsv")
    '''
    with open(tsv) as f:
        return f.readlines()


def Create_model(X_tsv, Y_tsv):
    '''
    Build the machine-learning model.

    Takes the X_train and Y_train sets as .tsv files,
    e.g. model = Create_model("train/in.tsv", "train/expected.tsv")
    '''
    X = open_tsv(X_tsv)
    Y = open_tsv(Y_tsv)

    Y = LabelEncoder().fit_transform(Y)
    pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())

    return pipeline.fit(X, Y)


def predict(model, X_tsv, file_name):
    '''
    Predict labels for the X set using the given model. The third argument
    is the name of the prediction file to write to disk,
    e.g. predict(model, "dev-0/in.tsv", "dev-0/out.tsv")
    '''
    X = open_tsv(X_tsv)

    prediction = model.predict(X)
    np.savetxt(file_name, prediction, fmt='%d')


def main():
    model = Create_model("train/in.tsv", "train/expected.tsv")

    predict(model, "dev-0/in.tsv", "dev-0/out.tsv")
    predict(model, "test-A/in.tsv", "test-A/out.tsv")


if __name__ == '__main__':
    main()
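As a quick local sanity check of the Naive Bayes pipeline (a minimal sketch, not part of the repo; it assumes Bayes.py is importable and that dev-0/expected.tsv holds the gold labels, which is the usual geval layout):

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from Bayes import open_tsv, Create_model

model = Create_model("train/in.tsv", "train/expected.tsv")
# Encode the gold labels the same way Create_model does; LabelEncoder sorts
# its classes, so the mapping is reproducible for an identical label set.
y_true = LabelEncoder().fit_transform(open_tsv("dev-0/expected.tsv"))
y_pred = model.predict(open_tsv("dev-0/in.tsv"))
print(accuracy_score(y_true, y_pred))  # should land near the reported 0.7367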
LogReg.py (112 lines)
@@ -1,112 +0,0 @@
import pandas as pd
import numpy as np
import torch
from nltk.tokenize import word_tokenize
import gensim.downloader as api

# Load X and y for train, and X for dev and test
# (error_bad_lines was replaced by on_bad_lines='skip' in pandas >= 1.3)
X_train = pd.read_table('train/in.tsv', sep='\t', error_bad_lines=False, quoting=3, header=None, names=['content', 'id'], usecols=['content'])
y_train = pd.read_table('train/expected.tsv', sep='\t', error_bad_lines=False, quoting=3, header=None, names=['label'])
X_dev = pd.read_table('dev-0/in.tsv', sep='\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])
X_test = pd.read_table('test-A/in.tsv', sep='\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])

# Lowercase the datasets
# https://www.datacamp.com/community/tutorials/case-conversion-python
X_train = X_train.content.str.lower()
X_dev = X_dev.content.str.lower()
X_test = X_test.content.str.lower()

y_train = y_train['label']  # DataFrame to Series

# Tokenize the datasets
# https://www.nltk.org/_modules/nltk/tokenize.html
X_train = [word_tokenize(doc) for doc in X_train]
X_dev = [word_tokenize(doc) for doc in X_dev]
X_test = [word_tokenize(doc) for doc in X_test]

# word2vec, as suggested by Mr. Jakub
# https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html
# https://www.kaggle.com/kstathou/word-embeddings-logistic-regression
w2v = api.load('word2vec-google-news-300')

def document_vector(doc):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    return np.mean([w2v[w] for w in doc if w in w2v] or [np.zeros(300)], axis=0)

X_train = [document_vector(doc) for doc in X_train]
X_dev = [document_vector(doc) for doc in X_dev]
X_test = [document_vector(doc) for doc in X_test]


# Neural network from lab 8
# https://git.wmi.amu.edu.pl/filipg/aitech-eks-pub/src/branch/master/cw/08_regresja_logistyczna.ipynb
class NeuralNetwork(torch.nn.Module):
    def __init__(self, hidden_size):
        super(NeuralNetwork, self).__init__()
        self.l1 = torch.nn.Linear(300, hidden_size)  # word2vec-google-news-300 always yields 300-dimensional inputs
        self.l2 = torch.nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = self.l1(x)
        x = torch.relu(x)
        x = self.l2(x)
        x = torch.sigmoid(x)
        return x

model = NeuralNetwork(600)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
batch_size = 15

# Model training, from lab 8
# https://git.wmi.amu.edu.pl/filipg/aitech-eks-pub/src/branch/master/cw/08_regresja_logistyczna.ipynb
for epoch in range(5):
    model.train()
    for i in range(0, y_train.shape[0], batch_size):
        X = X_train[i:i+batch_size]
        X = torch.tensor(np.array(X))  # stack the batch into one array first: a list-of-arrays conversion is slow
        y = y_train[i:i+batch_size]
        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)

        outputs = model(X.float())
        loss = criterion(outputs, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

y_dev = []
y_test = []

# Predictions
# model.eval() notifies all layers that we are in eval mode
model.eval()

# torch.no_grad() deactivates the autograd engine; it reduces memory usage and speeds things up
with torch.no_grad():
    for i in range(0, len(X_dev), batch_size):
        X = X_dev[i:i+batch_size]
        X = torch.tensor(np.array(X))

        outputs = model(X.float())

        y = (outputs > 0.5)
        y_dev.extend(y)

    for i in range(0, len(X_test), batch_size):
        X = X_test[i:i+batch_size]
        X = torch.tensor(np.array(X))

        outputs = model(X.float())

        y = (outputs > 0.5)
        y_test.extend(y)


# Write the output files
# reshape(-1) flattens the (N, 1) prediction columns so pandas gets 1-D data
y_dev = np.asarray(y_dev, dtype=np.int32).reshape(-1)
y_test = np.asarray(y_test, dtype=np.int32).reshape(-1)

y_dev_df = pd.DataFrame({'label': y_dev})
y_test_df = pd.DataFrame({'label': y_test})

y_dev_df.to_csv(r'dev-0/out.tsv', sep='\t', index=False, header=False)
y_test_df.to_csv(r'test-A/out.tsv', sep='\t', index=False, header=False)
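The heart of LogReg.py is the averaged-word-vector document embedding. A self-contained toy illustration of the same idea (hypothetical 3-dimensional vectors stand in for word2vec-google-news-300):

import numpy as np

# Tiny stand-in vocabulary; real runs use the 300-dimensional GoogleNews vectors.
w2v = {"ghost": np.array([1.0, 0.0, 0.0]),
       "sighting": np.array([0.0, 1.0, 0.0])}

def document_vector(doc, dim=3):
    """Average the vectors of in-vocabulary words; all-OOV documents map to zeros."""
    return np.mean([w2v[w] for w in doc if w in w2v] or [np.zeros(dim)], axis=0)

print(document_vector(["ghost", "sighting", "unseen"]))  # [0.5 0.5 0. ]
print(document_vector(["unseen"]))                       # [0. 0. 0.]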
@@ -1,215 +0,0 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import torch\n",
    "from nltk.tokenize import word_tokenize\n",
    "import gensim.downloader as api"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load X and y for train, and X for dev and test\n",
    "X_train = pd.read_table('train/in.tsv', sep='\\t', error_bad_lines=False, quoting=3, header=None, names=['content', 'id'], usecols=['content'])\n",
    "y_train = pd.read_table('train/expected.tsv', sep='\\t', error_bad_lines=False, quoting=3, header=None, names=['label'])\n",
    "X_dev = pd.read_table('dev-0/in.tsv', sep='\\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])\n",
    "X_test = pd.read_table('test-A/in.tsv', sep='\\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lowercase the datasets\n",
    "# https://www.datacamp.com/community/tutorials/case-conversion-python\n",
    "X_train = X_train.content.str.lower()\n",
    "X_dev = X_dev.content.str.lower()\n",
    "X_test = X_test.content.str.lower()\n",
    "\n",
    "y_train = y_train['label']  # DataFrame to Series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize the datasets\n",
    "# https://www.nltk.org/_modules/nltk/tokenize.html\n",
    "X_train = [word_tokenize(doc) for doc in X_train]\n",
    "X_dev = [word_tokenize(doc) for doc in X_dev]\n",
    "X_test = [word_tokenize(doc) for doc in X_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# word2vec, as suggested by Mr. Jakub\n",
    "# https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html\n",
    "# https://www.kaggle.com/kstathou/word-embeddings-logistic-regression\n",
    "w2v = api.load('word2vec-google-news-300')\n",
    "\n",
    "def document_vector(doc):\n",
    "    \"\"\"Create document vectors by averaging word vectors. Remove out-of-vocabulary words.\"\"\"\n",
    "    return np.mean([w2v[w] for w in doc if w in w2v] or [np.zeros(300)], axis=0)\n",
    "\n",
    "X_train = [document_vector(doc) for doc in X_train]\n",
    "X_dev = [document_vector(doc) for doc in X_dev]\n",
    "X_test = [document_vector(doc) for doc in X_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Neural network from lab 8\n",
    "# https://git.wmi.amu.edu.pl/filipg/aitech-eks-pub/src/branch/master/cw/08_regresja_logistyczna.ipynb\n",
    "class NeuralNetwork(torch.nn.Module):\n",
    "    def __init__(self, hidden_size):\n",
    "        super(NeuralNetwork, self).__init__()\n",
    "        self.l1 = torch.nn.Linear(300, hidden_size)  # word2vec-google-news-300 always yields 300-dimensional inputs\n",
    "        self.l2 = torch.nn.Linear(hidden_size, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.l1(x)\n",
    "        x = torch.relu(x)\n",
    "        x = self.l2(x)\n",
    "        x = torch.sigmoid(x)\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = NeuralNetwork(600)\n",
    "\n",
    "criterion = torch.nn.BCELoss()\n",
    "optimizer = torch.optim.SGD(model.parameters(), lr=0.1)\n",
    "\n",
    "batch_size = 15"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model training, from lab 8\n",
    "# https://git.wmi.amu.edu.pl/filipg/aitech-eks-pub/src/branch/master/cw/08_regresja_logistyczna.ipynb\n",
    "for epoch in range(5):\n",
    "    model.train()\n",
    "    for i in range(0, y_train.shape[0], batch_size):\n",
    "        X = X_train[i:i+batch_size]\n",
    "        X = torch.tensor(np.array(X))\n",
    "        y = y_train[i:i+batch_size]\n",
    "        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1,1)\n",
    "\n",
    "        outputs = model(X.float())\n",
    "        loss = criterion(outputs, y)\n",
    "\n",
    "        optimizer.zero_grad()\n",
    "        loss.backward()\n",
    "        optimizer.step()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_dev = []\n",
    "y_test = []\n",
    "\n",
    "# model.eval() notifies all layers that we are in eval mode\n",
    "model.eval()\n",
    "\n",
    "# torch.no_grad() deactivates the autograd engine; it reduces memory usage and speeds things up\n",
    "with torch.no_grad():\n",
    "    for i in range(0, len(X_dev), batch_size):\n",
    "        X = X_dev[i:i+batch_size]\n",
    "        X = torch.tensor(np.array(X))\n",
    "        \n",
    "        outputs = model(X.float())\n",
    "        \n",
    "        y = (outputs > 0.5)\n",
    "        y_dev.extend(y)\n",
    "\n",
    "    for i in range(0, len(X_test), batch_size):\n",
    "        X = X_test[i:i+batch_size]\n",
    "        X = torch.tensor(np.array(X))\n",
    "\n",
    "        outputs = model(X.float())\n",
    "\n",
    "        y = (outputs > 0.5)\n",
    "        y_test.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reshape(-1) flattens the (N, 1) prediction columns so pandas gets 1-D data\n",
    "y_dev = np.asarray(y_dev, dtype=np.int32).reshape(-1)\n",
    "y_test = np.asarray(y_test, dtype=np.int32).reshape(-1)\n",
    "\n",
    "y_dev_df = pd.DataFrame({'label':y_dev})\n",
    "y_test_df = pd.DataFrame({'label':y_test})\n",
    "\n",
    "y_dev_df.to_csv(r'dev-0/out.tsv', sep='\\t', index=False, header=False)\n",
    "y_test_df.to_csv(r'test-A/out.tsv', sep='\\t', index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
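The notebook above mirrors LogReg.py cell for cell. A minimal sketch for exporting its cells back to a plain script with nbconvert; note that the compare view does not show the notebook's filename, so "LogReg.ipynb" here is a placeholder:

from nbconvert import ScriptExporter

# Placeholder filename: the compare view does not show the notebook's real name.
source, _resources = ScriptExporter().from_filename("LogReg.ipynb")
with open("LogReg_exported.py", "w") as f:
    f.write(source)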
README.md (26 lines)
@@ -11,29 +11,3 @@ Sources
-------

Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.

Results from geval (Using Naive Bayes)
-------

Likelihood  0.0000
Accuracy    0.7367
F1.0        0.4367
Precision   0.8997
Recall      0.2883

Results from geval (Using Log Reg (NN from PyTorch))
-------

Likelihood  0.0000
Accuracy    0.7561
F1.0        0.6152
Precision   0.6965
Recall      0.5509
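As a cross-check on the tables above: F1.0 is the harmonic mean of precision and recall, and both reported result sets are internally consistent (a small verification snippet, not part of the repo):

# F1 = 2PR / (P + R), checked against the README's numbers.
for p, r in [(0.8997, 0.2883),   # Naive Bayes  -> 0.4367
             (0.6965, 0.5509)]:  # Log Reg (NN) -> 0.6152
    print(round(2 * p * r / (p + r), 4))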
@@ -1,6 +0,0 @@
Results from geval:
Likelihood  0.0000
Accuracy    0.7367
F1.0        0.4367
Precision   0.8997
Recall      0.2883
File diff suppressed because it is too large.
File diff suppressed because it is too large.
dev-0/Bayes/out.tsv (5272 lines): file diff suppressed because it is too large.
dev-0/in.tsv (5272 lines): file diff suppressed because one or more lines are too long.
dev-0/out.tsv (5272 lines): file diff suppressed because it is too large.
test-A/Bayes/out.tsv (5152 lines): file diff suppressed because it is too large.
test-A/in.tsv (5152 lines): file diff suppressed because one or more lines are too long.
test-A/out.tsv (5152 lines): file diff suppressed because it is too large.
File diff suppressed because it is too large.
File diff suppressed because one or more lines are too long.
train/in.tsv (289579 lines): file diff suppressed because one or more lines are too long.