model 50 epochs
This commit is contained in:
parent d36302317c
commit b167cca6a4
.gitignore (vendored): 3 changed lines
@@ -7,4 +7,5 @@ data_train.csv
 data.csv
 data_not_shuf.csv
 data_not_cutted.csv
-venv
+venv
+.~lock.fake_job_postings.csv#
main.py: 78 changed lines
@@ -10,20 +10,29 @@ from torch import nn
 from torch import optim
+import matplotlib.pyplot as plt
+
+
+def convert_text_to_model_form(text):
+    a = vectorizer.transform([text])
+    b = torch.tensor(scipy.sparse.csr_matrix.todense(a)).float()
+    return b
+
 
 if __name__ == "__main__":
-    # kaggle.api.authenticate()
-    # kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='.',
-    #                                   unzip=True)
+    kaggle.api.authenticate()
+    kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='.',
+                                      unzip=True)
 
     data = pd.read_csv('fake_job_postings.csv', engine='python')
-    data = data.replace(np.nan, '', regex=True)
+    # data = data.replace(np.nan, '', regex=True)
     data = data[["company_profile", "fraudulent"]]
+    data = data.dropna()
 
     data_train, data_test = train_test_split(data, test_size=3000, random_state=1)
     data_dev, data_test = train_test_split(data_test, test_size=1500, random_state=1)
 
-    x_train = data_train["title"]
-    x_dev = data_dev["title"]
-    x_test = data_test["title"]
+    x_train = data_train["company_profile"]
+    x_dev = data_dev["company_profile"]
+    x_test = data_test["company_profile"]
 
     y_train = data_train["fraudulent"]
     y_dev = data_dev["fraudulent"]
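Note: the new convert_text_to_model_form helper references the global vectorizer, which is only fitted later inside the __main__ block, so it can only be called once the script has run. A minimal usage sketch, assuming the fitted vectorizer and trained model from this script; the example posting text is invented:

    # Hypothetical single-posting prediction, usable only after `vectorizer` is fitted
    # and `model` is trained below.
    sample = "We are a fast-growing startup hiring remote data-entry clerks."  # made-up example
    x = convert_text_to_model_form(sample)                  # dense float tensor, shape (1, vocab_size)
    log_ps = model(x)                                        # LogSoftmax output: log-probabilities
    pred = torch.exp(log_ps).topk(1, dim=1).indices.item()   # 1 = fraudulent in this dataset's labeling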
@@ -31,27 +40,32 @@ if __name__ == "__main__":
 
     x_train = np.array(x_train)
     x_dev = np.array(x_dev)
     x_test = np.array(x_test)
 
     y_train = np.array(y_train)
     y_dev = np.array(y_dev)
     y_test = np.array(y_test)
 
     vectorizer = TfidfVectorizer()
 
     x_train = vectorizer.fit_transform(x_train)
     x_dev = vectorizer.transform(x_dev)
     x_test = vectorizer.transform(x_test)
 
     x_train = torch.tensor(scipy.sparse.csr_matrix.todense(x_train)).float()
     x_dev = torch.tensor(scipy.sparse.csr_matrix.todense(x_dev)).float()
     x_test = torch.tensor(scipy.sparse.csr_matrix.todense(x_test)).float()
 
     y_train = torch.tensor(y_train)
     y_dev = torch.tensor(y_dev)
     y_test = torch.tensor(y_test)
 
     from torch import nn
 
     model = nn.Sequential(
         nn.Linear(x_train.shape[1], 64),
         nn.ReLU(),
-        nn.Linear(64, data_train["title"].nunique()),
+        nn.Linear(64, data_train["fraudulent"].nunique()),
         nn.LogSoftmax(dim=1))
 
     # Define the loss
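The hunk stops at the "# Define the loss" comment, so the actual loss and optimizer are not visible in this diff. With a LogSoftmax output layer, the standard pairing is NLLLoss; the sketch below shows that typical setup, where the optimizer choice and learning rate are assumptions, not taken from main.py:

    criterion = nn.NLLLoss()   # expects log-probabilities plus integer class labels (0/1 here)
    optimizer = optim.Adam(model.parameters(), lr=0.003)   # assumed optimizer and lr, not shown in the diff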
@@ -65,7 +79,7 @@ if __name__ == "__main__":
     test_losses = []
     test_accuracies = []
 
-    epochs = 5
+    epochs = 50
     for e in range(epochs):
         optimizer.zero_grad()
 
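Only the top of the training loop is visible here. From the pieces shown elsewhere in the diff (one optimizer.zero_grad() per epoch, the train/test loss and accuracy lists, and the per-epoch print), it looks like a full-batch loop; the sketch below is written under those assumptions and is not necessarily the exact body in main.py:

    for e in range(epochs):
        optimizer.zero_grad()
        log_ps = model(x_train)               # full-batch forward pass over the dense TF-IDF matrix
        loss = criterion(log_ps, y_train)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
        # evaluation on x_test / y_test would follow here, appending to test_losses and test_accuracies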
@@ -97,6 +111,50 @@ if __name__ == "__main__":
               f"Test Loss: {test_loss:.3f}.. ",
               f"Test Accuracy: {test_accuracy:.3f}")
 
+    TP = []
+    TF = []
+
+    FP = []
+    FN = []
+    log_ps = model(x_test)
+    ps = torch.exp(log_ps)
+    top_p, top_class = ps.topk(1, dim=1)
+    descr = np.array(data_test["company_profile"])
+    for i, (x, y) in enumerate(zip(np.array(top_class), np.array(y_test.view(*top_class.shape)))):
+        d = descr[i]
+        if x == y:
+            if x:
+                TP.append(d)
+            else:
+                TF.append(d)
+        else:
+            if x:
+                FP.append(d)
+            else:
+                FN.append(d)
+    f_score = len(TP) / (len(TP) + 0.5 * (len(FP) + len(FN)))
+    f = open("model_resutls.txt", "a")
+
+    f.write(f"F-SCORE = {f_score}\n")
+    f.write(f"TP = {len(TP)}\n")
+    f.write(f"TF = {len(TF)}\n")
+    f.write(f"FP = {len(FP)}\n")
+    f.write(f"FN = {len(FN)}\n")
+
+    f.write(f"TP descriptions:")
+    for i in TP:
+        f.write(i+'\n')
+    f.write(f"TF descriptions:")
+    for i in TF:
+        f.write(i+"\n")
+    f.write(f"FP descriptions:")
+    for i in FP:
+        f.write(i+"\n")
+    f.write(f"FN descriptions:")
+    for i in FN:
+        f.write(i+"\n")
+    f.close()
+
     plt.figure(figsize=(12, 5))
     ax = plt.subplot(121)
     plt.xlabel('epochs')
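The hand-rolled f_score added above is the usual F1 score, F1 = TP / (TP + 0.5 * (FP + FN)), with the fraudulent class (1) as positive. As a sanity check it can be compared against scikit-learn's implementation; the import and assertion below are an illustration, not part of this commit:

    from sklearn.metrics import f1_score

    y_pred = top_class.view(-1).numpy()   # predicted labels taken from the topk indices
    y_true = y_test.numpy()               # ground-truth labels for the test split
    assert abs(f1_score(y_true, y_pred) - f_score) < 1e-6   # same quantity, computed two ways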
@@ -109,5 +167,3 @@ if __name__ == "__main__":
     plt.ylabel('test accuracy')
     plt.plot(test_accuracies)
     plt.show()
-
-    print('Succes')
model_resutls.txt (new normal file): 3010 lines
File diff suppressed because it is too large