Compare commits

master..master

No commits in common. "master" and "master" have entirely different histories.

10 changed files with 0 additions and 311135 deletions

View File: Naiwny_bayes.ipynb

@@ -1,139 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"import sklearn\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.preprocessing import LabelEncoder "
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"def getInput(path):\n",
" with open(path,encoding='utf-8') as f:\n",
" return f.readlines()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/c/Users/mkoci/Desktop/naiwny_bayes\n"
]
}
],
"source": [
"!pwd"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"train_in=getInput('./train/in.tsv')\n",
"train_expected=getInput('./train/expected.tsv')\n",
"test_in=getInput('./test-A/in.tsv')\n",
"dev_in=getInput('./dev-0/in.tsv')\n",
"dev_expected=getInput('./dev-0/expected.tsv')"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"pipeline = make_pipeline(TfidfVectorizer(),MultinomialNB())\n",
"encTransform = LabelEncoder().fit_transform(train_expected)\n",
"model = pipeline.fit(train_in, encTransform)\n",
"dev_predicted = model.predict(dev_in)\n",
"test_predicted = model.predict(test_in)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"with open('./dev-0/out.tsv', \"w\") as result:\n",
" for out in dev_predicted:\n",
" result.write(str(out) + '\\n')\n",
"with open('./test-A/out.tsv', \"w\") as result:\n",
" for out in test_predicted:\n",
" result.write(str(out) + '\\n') "
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NbConvertApp] Converting notebook Naiwny_bayes.ipynb to script\n",
"[NbConvertApp] Writing 1337 bytes to Naiwny_bayes.py\n"
]
}
],
"source": [
"!jupyter nbconvert --to script Naiwny_bayes.ipynb"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File: Naiwny_bayes.py

@@ -1,76 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# In[46]:
import sklearn
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
# In[47]:
def getInput(path):
    with open(path, encoding='utf-8') as f:
        return f.readlines()
# In[48]:
get_ipython().system('pwd')
# In[49]:
train_in = getInput('./train/in.tsv')
train_expected = getInput('./train/expected.tsv')
test_in = getInput('./test-A/in.tsv')
dev_in = getInput('./dev-0/in.tsv')
dev_expected = getInput('./dev-0/expected.tsv')
# In[50]:
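# Train a TF-IDF + Multinomial Naive Bayes pipeline and predict the dev and test splits.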
pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())
encTransform = LabelEncoder().fit_transform(train_expected)
model = pipeline.fit(train_in, encTransform)
dev_predicted = model.predict(dev_in)
test_predicted = model.predict(test_in)
# In[ ]:
# In[54]:
with open('./dev-0/out.tsv', "w") as result:
    for out in dev_predicted:
        result.write(str(out) + '\n')
with open('./test-A/out.tsv', "w") as result:
    for out in test_predicted:
        result.write(str(out) + '\n')
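# dev_expected is read above but never used; a minimal scoring sketch, assuming the dev
# split carries the same two label strings as training, so a fresh LabelEncoder maps
# them to the same integers:
dev_labels = LabelEncoder().fit_transform(dev_expected)
print('dev accuracy:', np.mean(dev_predicted == dev_labels))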
# In[55]:
get_ipython().system('jupyter nbconvert --to script Naiwny_bayes.ipynb')
# In[ ]:

View File

@@ -1,155 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def getInput(path):\n",
" with open(path,encoding='utf-8') as f:\n",
" return f.readlines()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import gensim.downloader as gensim\n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"from nltk.tokenize import word_tokenize"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"word2vec = gensim.load('word2vec-google-news-300')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# train_in=getInput('./train/in.tsv')\n",
"# train_expected=getInput('./train/expected.tsv')\n",
"# test_in=getInput('./test-A/in.tsv')\n",
"# dev_in=getInput('./dev-0/in.tsv')\n",
"# dev_expected=getInput('./dev-0/expected.tsv')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"class NeuralNetworkModel(torch.nn.Module):\n",
" def __init__(self):\n",
" super(NeuralNetworkModel, self).__init__()\n",
" self.l01 = torch.nn.Linear(300, 300)\n",
" self.l02 = torch.nn.Linear(300, 1)\n",
"\n",
" def forward(self, x):\n",
" x = self.l01(x)\n",
" x = torch.relu(x)\n",
" x = self.l02(x)\n",
" x = torch.sigmoid(x)\n",
" return x\n",
"\n",
"def d2v(doc):\n",
" return np.mean([word2vec[word] for word in doc if word in word2vec] or [np.zeros(300)], axis=0)\n",
"x_train = pd.read_table('train/in.tsv.xz', compression='xz', sep='\\t', header=None, error_bad_lines=False, quoting=3)\n",
"x_train = x_train[0].str.lower()\n",
"x_dev = pd.read_table('dev-0/in.tsv.xz', compression='xz', sep='\\t', header=None, quoting=3)\n",
"x_dev = x_dev[0].str.lower()\n",
"x_test = pd.read_table('test-A/in.tsv.xz', compression='xz', sep='\\t', header=None, quoting=3)\n",
"x_test = x_test[0].str.lower()\n",
"y_train = pd.read_table('train/expected.tsv', sep='\\t', header=None, quoting=3)\n",
"y_train = y_train[0]\n",
"x_train = [word_tokenize(x) for x in x_train]\n",
"x_dev = [word_tokenize(x) for x in x_dev]\n",
"x_test = [word_tokenize(x) for x in x_test]\n",
"x_train = [d2v(doc) for doc in x_train]\n",
"x_dev = [d2v(doc) for doc in x_dev]\n",
"x_test = [d2v(doc) for doc in x_test]\n",
"model = NeuralNetworkModel()\n",
"BATCH_SIZE = 10\n",
"criterion = torch.nn.BCELoss()\n",
"optimizer = torch.optim.Adam(model.parameters())\n",
"for epoch in range(BATCH_SIZE):\n",
" model.train()\n",
" for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
" X = x_train[i:i + BATCH_SIZE]\n",
" X = torch.tensor(X)\n",
" y = y_train[i:i + BATCH_SIZE]\n",
" y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)\n",
" optimizer.zero_grad()\n",
" outputs = model(X.float())\n",
" loss = criterion(outputs, y)\n",
" loss.backward()\n",
" optimizer.step()\n",
"y_dev = []\n",
"y_test = []\n",
"model.eval()\n",
"with torch.no_grad():\n",
" for i in range(0, len(x_dev), BATCH_SIZE):\n",
" X = x_dev[i:i + BATCH_SIZE]\n",
" X = torch.tensor(X)\n",
" outputs = model(X.float())\n",
" y = (outputs > 0.5)\n",
" y_dev.extend(y)\n",
"\n",
" for i in range(0, len(x_test), BATCH_SIZE):\n",
" X = x_test[i:i + BATCH_SIZE]\n",
" X = torch.tensor(X)\n",
" outputs = model(X.float())\n",
" y = (outputs >= 0.5)\n",
" y_test.extend(y)\n",
"\n",
"y_dev = np.asarray(y_dev, dtype=np.int32)\n",
"Y_dev = pd.DataFrame({'label': y_dev})\n",
"y_test = np.asarray(y_test, dtype=np.int32)\n",
"Y_test = pd.DataFrame({'label': y_test})\n",
"Y_dev.to_csv(r'dev-0/out.tsv', sep='\\t', index=False, header=False)\n",
"Y_test.to_csv(r'test-A/out.tsv', sep='\\t', index=False, header=False)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -1,116 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# In[2]:
def getInput(path):
    with open(path, encoding='utf-8') as f:
        return f.readlines()
# In[6]:
import gensim.downloader as gensim
import numpy as np
import pandas as pd
import torch
from nltk.tokenize import word_tokenize
# In[8]:
word2vec = gensim.load('word2vec-google-news-300')
# In[5]:
# train_in=getInput('./train/in.tsv')
# train_expected=getInput('./train/expected.tsv')
# test_in=getInput('./test-A/in.tsv')
# dev_in=getInput('./dev-0/in.tsv')
# dev_expected=getInput('./dev-0/expected.tsv')
# In[14]:
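# 300-d mean-word2vec document vectors -> 300-unit ReLU layer -> single sigmoid output.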
class NeuralNetworkModel(torch.nn.Module):
    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.l01 = torch.nn.Linear(300, 300)
        self.l02 = torch.nn.Linear(300, 1)

    def forward(self, x):
        x = self.l01(x)
        x = torch.relu(x)
        x = self.l02(x)
        x = torch.sigmoid(x)
        return x
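# d2v: average the word2vec vectors of a document's in-vocabulary tokens,
# falling back to a zero vector when no token is in the vocabulary.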
def d2v(doc):
    return np.mean([word2vec[word] for word in doc if word in word2vec] or [np.zeros(300)], axis=0)
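# NB: error_bad_lines=False was deprecated in pandas 1.3 and removed in 2.0;
# newer pandas spells it on_bad_lines='skip'.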
x_train = pd.read_table('train/in.tsv.xz', compression='xz', sep='\t', header=None, error_bad_lines=False, quoting=3)
x_train = x_train[0].str.lower()
x_dev = pd.read_table('dev-0/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
x_dev = x_dev[0].str.lower()
x_test = pd.read_table('test-A/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
x_test = x_test[0].str.lower()
y_train = pd.read_table('train/expected.tsv', sep='\t', header=None, quoting=3)
y_train = y_train[0]
x_train = [word_tokenize(x) for x in x_train]
x_dev = [word_tokenize(x) for x in x_dev]
x_test = [word_tokenize(x) for x in x_test]
x_train = [d2v(doc) for doc in x_train]
x_dev = [d2v(doc) for doc in x_dev]
x_test = [d2v(doc) for doc in x_test]
model = NeuralNetworkModel()
BATCH_SIZE = 10
EPOCHS = 10  # the original looped for range(BATCH_SIZE) epochs; a separate name is clearer
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())
for epoch in range(EPOCHS):
    model.train()
    for i in range(0, y_train.shape[0], BATCH_SIZE):
        X = x_train[i:i + BATCH_SIZE]
        X = torch.tensor(np.array(X))  # stack first to avoid the slow list-of-arrays path
        y = y_train[i:i + BATCH_SIZE]
        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)
        optimizer.zero_grad()
        outputs = model(X.float())
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
y_dev = []
y_test = []
model.eval()
with torch.no_grad():
    for i in range(0, len(x_dev), BATCH_SIZE):
        X = x_dev[i:i + BATCH_SIZE]
        X = torch.tensor(np.array(X))
        outputs = model(X.float())
        y = (outputs > 0.5)
        y_dev.extend(y)

    for i in range(0, len(x_test), BATCH_SIZE):
        X = x_test[i:i + BATCH_SIZE]
        X = torch.tensor(np.array(X))
        outputs = model(X.float())
        y = (outputs > 0.5)  # the original mixed > and >= between the two splits
        y_test.extend(y)

y_dev = np.asarray(y_dev, dtype=np.int32).reshape(-1)  # flatten (N, 1) so pandas gets a 1-D column
Y_dev = pd.DataFrame({'label': y_dev})
y_test = np.asarray(y_test, dtype=np.int32).reshape(-1)
Y_test = pd.DataFrame({'label': y_test})
Y_dev.to_csv(r'dev-0/out.tsv', sep='\t', index=False, header=False)
Y_test.to_csv(r'test-A/out.tsv', sep='\t', index=False, header=False)
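# A quick dev-split check; an added sketch, assuming dev-0/expected.tsv holds the gold 0/1 labels:
y_true = pd.read_table('dev-0/expected.tsv', sep='\t', header=None, quoting=3)[0].to_numpy()
print('dev accuracy:', (y_dev == y_true).mean())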
# In[ ]:

View File

@@ -1,222 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import csv
# In[2]:
get_ipython().system('pip install gensim')
# In[17]:
import nltk
nltk.download('punkt')
# In[9]:
get_ipython().system('pip install nltk')
# In[3]:
get_ipython().system('pip install torch')
# In[4]:
import gensim.downloader
import numpy as np
import pandas as pd
import torch
# In[5]:
import torch.nn as nn
from nltk import word_tokenize
# In[13]:
header_names = ["content", "id", "label"]
# In[23]:
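# FF: a feed-forward binary classifier, input -> hidden (ReLU) -> hidden (ReLU) -> one sigmoid unit.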
class FF(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FF, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)  # fc2 was defined but never applied in the original forward()
        out = self.relu2(out)
        out = self.fc3(out)
        return torch.sigmoid(out)
train_set_labels = pd.read_table(
    "train/expected.tsv",
    error_bad_lines=False,
    quoting=csv.QUOTE_NONE,
    header=None,
    names=header_names[2:],
)
train_set_features = pd.read_table(
    "train/in.tsv.xz",
    error_bad_lines=False,
    quoting=csv.QUOTE_NONE,
    header=None,
    names=header_names[:2],
)
test_set = pd.read_table(
    "test-A/in.tsv.xz",
    error_bad_lines=False,
    header=None,
    quoting=csv.QUOTE_NONE,
    names=header_names[:2],
)
dev_set = pd.read_table(
    "dev-0/in.tsv.xz",
    error_bad_lines=False,
    header=None,
    quoting=csv.QUOTE_NONE,
    names=header_names[:2],
)
X_train = train_set_features["content"].str.lower()
y_train = train_set_labels["label"]
X_dev = dev_set["content"].str.lower()
X_test = test_set["content"].str.lower()
X_train = [word_tokenize(content) for content in X_train]
X_dev = [word_tokenize(content) for content in X_dev]
X_test = [word_tokenize(content) for content in X_test]
word2vec = gensim.downloader.load("word2vec-google-news-300")
# In[24]:
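# Collapse each tokenized document to the mean of its word2vec vectors (zeros when nothing matches).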
X_train = [
    np.mean(
        [word2vec[word] for word in content if word in word2vec] or [np.zeros(300)],
        axis=0,
    )
    for content in X_train
]
X_dev = [
    np.mean(
        [word2vec[word] for word in content if word in word2vec] or [np.zeros(300)],
        axis=0,
    )
    for content in X_dev
]
X_test = [
    np.mean(
        [word2vec[word] for word in content if word in word2vec] or [np.zeros(300)],
        axis=0,
    )
    for content in X_test
]
hidden_layer = 650
epochs = 15
batch_size = 10
# In[27]:
output_dim = 1
input_dim = 300
model = FF(input_dim, hidden_layer, output_dim)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.BCELoss()
# In[28]:
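# Mini-batch training: sigmoid outputs scored with BCE loss, plain SGD updates.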
for epoch in range(epochs):
    model.train()
    for i in range(0, y_train.shape[0], batch_size):
        X = X_train[i : i + batch_size]
        X = torch.tensor(np.array(X))  # stack first to avoid the slow list-of-arrays path
        y = y_train[i : i + batch_size]
        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)
        outputs = model(X.float())
        loss = criterion(outputs, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
test_prediction = []
dev_prediction = []
model.eval()
with torch.no_grad():
    for i in range(0, len(X_test), batch_size):
        X = X_test[i : i + batch_size]
        X = torch.tensor(np.array(X))
        outputs = model(X.float())
        prediction = outputs > 0.5
        test_prediction += prediction.tolist()
    for i in range(0, len(X_dev), batch_size):
        X = X_dev[i : i + batch_size]
        X = torch.tensor(np.array(X))
        outputs = model(X.float())
        prediction = outputs > 0.5
        dev_prediction += prediction.tolist()
test_prediction = np.asarray(test_prediction, dtype=np.int32)
dev_prediction = np.asarray(dev_prediction, dtype=np.int32)
test_prediction.tofile("./test-A/out.tsv", sep="\n")
dev_prediction.tofile("./dev-0/out.tsv", sep="\n")
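# ndarray.tofile(sep='\n') writes one 0/1 label per line, matching the one-label-per-line
# layout of expected.tsv, though unlike to_csv it omits the trailing newline.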
# In[ ]:
# In[ ]:
# In[ ]:

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

train/in.tsv: 289579 lines deleted

File diff suppressed because one or more lines are too long