NN with on value

This commit is contained in:
Mikołaj Pokrywka 2022-04-23 13:52:09 +02:00
parent a8b9ffb939
commit d36302317c
21 changed files with 283 additions and 28 deletions

1
.gitignore vendored
View File

@ -7,3 +7,4 @@ data_train.csv
data.csv
data_not_shuf.csv
data_not_cutted.csv
venv

8
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View File

@ -0,0 +1,34 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="7">
<item index="0" class="java.lang.String" itemvalue="pl-core-news-sm" />
<item index="1" class="java.lang.String" itemvalue="en-core-web-sm" />
<item index="2" class="java.lang.String" itemvalue="livocat-core" />
<item index="3" class="java.lang.String" itemvalue="tqdm" />
<item index="4" class="java.lang.String" itemvalue="spacy" />
<item index="5" class="java.lang.String" itemvalue="streamlit" />
<item index="6" class="java.lang.String" itemvalue="requests" />
</list>
</value>
</option>
</inspection_tool>
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N802" />
</list>
</option>
</inspection_tool>
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredIdentifiers">
<list>
<option value="translation_handler.fairseq_translation.FairseqTransferer" />
</list>
</option>
</inspection_tool>
</profile>
</component>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

8
.idea/ium_444463.iml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.8 (ium_444463)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

4
.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (ium_444463)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/ium_444463.iml" filepath="$PROJECT_DIR$/.idea/ium_444463.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

113
main.py Normal file
View File

@ -0,0 +1,113 @@
import pandas as pd
import numpy as np
import scipy
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import kaggle
from sklearn.feature_extraction.text import TfidfVectorizer
from torch import nn
from torch import optim
import matplotlib.pyplot as plt
def _evaluate(model, criterion, features, labels):
    """Return ``(loss, accuracy)`` of *model* on one data split as floats.

    The model is switched to eval mode and gradients are disabled for the
    duration of the call; the previous train/eval mode is restored afterwards.
    Plain Python floats are returned so the values can be stored in lists and
    handed to matplotlib without keeping tensors alive.
    """
    was_training = model.training
    model.eval()
    with torch.no_grad():
        log_probs = model(features)
        loss = criterion(log_probs, labels).item()
        # argmax over log-probabilities picks the same class as the
        # probabilities would, so exponentiating first is unnecessary.
        predictions = log_probs.argmax(dim=1)
        accuracy = (predictions == labels).float().mean().item()
    model.train(was_training)
    return loss, accuracy


def _plot_history(train_losses, dev_losses, dev_accuracies):
    """Show loss curves (left) and dev-set accuracy (right) per epoch."""
    plt.figure(figsize=(12, 5))
    plt.subplot(121)
    plt.xlabel('epochs')
    plt.ylabel('negative log likelihood loss')
    plt.plot(train_losses, label='Training loss')
    plt.plot(dev_losses, label='Validation loss')
    plt.legend(frameon=False)
    plt.subplot(122)
    plt.xlabel('epochs')
    plt.ylabel('test accuracy')
    plt.plot(dev_accuracies)
    plt.show()


def main():
    """Train a small feed-forward classifier on job-posting titles.

    Reads ``fake_job_postings.csv`` from the working directory, turns the
    ``title`` column into TF-IDF features, trains a one-hidden-layer network
    to predict the ``fraudulent`` label, prints per-epoch metrics measured on
    the dev split and finally plots the training history.
    """
    # The dataset can be fetched once with the Kaggle API:
    #   kaggle.api.authenticate()
    #   kaggle.api.dataset_download_files(
    #       'shivamb/real-or-fake-fake-jobposting-prediction', path='.', unzip=True)
    data = pd.read_csv('fake_job_postings.csv', engine='python')
    data = data.replace(np.nan, '', regex=True)

    # 3000 rows are held out and then halved into dev and test parts.
    data_train, data_held_out = train_test_split(data, test_size=3000, random_state=1)
    # NOTE: the test half is produced to keep the dev split unchanged, but this
    # script never evaluates on it.
    data_dev, _data_test = train_test_split(data_held_out, test_size=1500, random_state=1)

    # Fit the vocabulary on the training titles only, then reuse it for dev.
    vectorizer = TfidfVectorizer()
    train_matrix = vectorizer.fit_transform(data_train["title"])
    dev_matrix = vectorizer.transform(data_dev["title"])

    # .toarray() gives a plain ndarray (todense() would give np.matrix).
    x_train = torch.tensor(train_matrix.toarray(), dtype=torch.float32)
    x_dev = torch.tensor(dev_matrix.toarray(), dtype=torch.float32)
    # NLLLoss expects integer (int64) class indices as targets.
    y_train = torch.tensor(data_train["fraudulent"].to_numpy(dtype=np.int64))
    y_dev = torch.tensor(data_dev["fraudulent"].to_numpy(dtype=np.int64))

    # The output layer must have one unit per *target class*; labels are class
    # indices starting at 0, so the largest label + 1 is the class count.
    # (Sizing it by the number of distinct titles, as before, created
    # thousands of outputs that could never be a correct answer.)
    n_classes = int(y_train.max()) + 1
    model = nn.Sequential(
        nn.Linear(x_train.shape[1], 64),
        nn.ReLU(),
        nn.Linear(64, n_classes),
        nn.LogSoftmax(dim=1))

    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.002)

    train_losses = []
    test_losses = []
    test_accuracies = []
    epochs = 5
    for e in range(epochs):
        # Full-batch training step.
        optimizer.zero_grad()
        output = model(x_train)
        loss = criterion(output, y_train)
        loss.backward()
        optimizer.step()
        train_loss = loss.item()
        train_losses.append(train_loss)

        # Metrics below are computed on the dev split.
        test_loss, test_accuracy = _evaluate(model, criterion, x_dev, y_dev)
        test_losses.append(test_loss)
        test_accuracies.append(test_accuracy)

        print(f"Epoch: {e + 1}/{epochs}.. ",
              f"Training Loss: {train_loss:.3f}.. ",
              f"Test Loss: {test_loss:.3f}.. ",
              f"Test Accuracy: {test_accuracy:.3f}")

    _plot_history(train_losses, test_losses, test_accuracies)
    print('Succes')


if __name__ == "__main__":
    main()

View File

@ -1,3 +1,7 @@
pandas
numpy
kaggle
torch
matplotlib
scikit-learn
scipy

View File

@ -4,7 +4,11 @@
"cell_type": "code",
"execution_count": 28,
"id": "5e2107a5",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"#Skrypt do ściagnięcia zbiory danych\n"
@ -14,7 +18,11 @@
"cell_type": "code",
"execution_count": 29,
"id": "bcc889e5",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@ -29,14 +37,14 @@
"Requirement already satisfied: python-slugify in /home/students/s444463/.local/lib/python3.8/site-packages (from kaggle) (6.1.1)\n",
"Requirement already satisfied: python-dateutil in /usr/lib/python3/dist-packages (from kaggle) (2.7.3)\n",
"Requirement already satisfied: text-unidecode>=1.3 in /home/students/s444463/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)\n",
"\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
"\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n",
"Requirement already satisfied: pandas in /usr/lib/python3/dist-packages (0.25.3)\n",
"\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
"\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n",
"Requirement already satisfied: numpy in /usr/lib/python3/dist-packages (1.17.4)\n",
"\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
"\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n"
]
}
],
@ -50,7 +58,11 @@
"cell_type": "code",
"execution_count": 30,
"id": "02a4034f",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@ -70,7 +82,11 @@
"cell_type": "code",
"execution_count": 31,
"id": "5035aef0",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@ -88,7 +104,11 @@
"cell_type": "code",
"execution_count": 32,
"id": "14344d2f",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@ -105,8 +125,8 @@
"Requirement already satisfied: kiwisolver>=1.0.1 in /home/students/s444463/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.2)\n",
"Requirement already satisfied: python-dateutil>=2.7 in /usr/lib/python3/dist-packages (from matplotlib>=2.2->seaborn) (2.7.3)\n",
"Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.14.0)\n",
"\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
"\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n"
]
}
],
@ -118,7 +138,11 @@
"cell_type": "code",
"execution_count": 33,
"id": "0f5ebfab",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
@ -534,7 +558,11 @@
"cell_type": "code",
"execution_count": 34,
"id": "edbf49da",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@ -553,7 +581,11 @@
"cell_type": "code",
"execution_count": 35,
"id": "e60b3f32",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
@ -585,7 +617,11 @@
"cell_type": "code",
"execution_count": 36,
"id": "ddb2fc38",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
@ -1001,7 +1037,11 @@
"cell_type": "code",
"execution_count": 37,
"id": "c5ac75f5",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
@ -1373,7 +1413,11 @@
"cell_type": "code",
"execution_count": 38,
"id": "4b0e77a4",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
@ -1399,7 +1443,11 @@
"cell_type": "code",
"execution_count": 39,
"id": "5a1d8ec7",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@ -1411,8 +1459,8 @@
"Requirement already satisfied: threadpoolctl>=2.0.0 in /home/students/s444463/.local/lib/python3.8/site-packages (from scikit-learn) (3.1.0)\n",
"Requirement already satisfied: joblib>=0.11 in /usr/lib/python3/dist-packages (from scikit-learn) (0.14.0)\n",
"Requirement already satisfied: scipy>=1.1.0 in /usr/lib/python3/dist-packages (from scikit-learn) (1.3.3)\n",
"\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
"\u001B[33mWARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
@ -1425,7 +1473,11 @@
"cell_type": "code",
"execution_count": 40,
"id": "50813795",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
@ -1461,7 +1513,11 @@
"cell_type": "code",
"execution_count": 41,
"id": "ea3c9f2e",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@ -1483,7 +1539,11 @@
"cell_type": "code",
"execution_count": 42,
"id": "b20cc27a",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {

View File

@ -0,0 +1,3 @@
pandas
numpy
kaggle