# (Stray file-size label "8.0 KiB" left over from the web file viewer; not part of the program.)
import numpy as np
import gensim
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
def predict_year(x, path_out, model):
    """Predict a value for every sample in ``x`` and write one per line.

    Args:
        x: Samples in whatever form ``model.predict`` accepts.
        path_out: Path of the output text file; it is created or truncated.
        model: Any object exposing ``predict(x)`` that returns an iterable
            of predictions.
    """
    results = model.predict(x)
    # An explicit encoding keeps the output independent of the platform's
    # default locale encoding.
    with open(path_out, 'wt', encoding='utf-8') as file:
        file.writelines(str(r) + '\n' for r in results)
def read_file(filename, column=0):
    """Return one tab-separated column of a UTF-8 text file.

    Args:
        filename: Path of the file to read.
        column: Zero-based index of the tab-separated field to extract from
            every line. Defaults to the first field.

    Returns:
        A list with the selected field of every line, stripped of
        surrounding whitespace (including the trailing newline).

    Raises:
        IndexError: If a line has fewer than ``column + 1`` fields.
    """
    with open(filename, 'r', encoding="utf-8") as file:
        return [line.split("\t")[column].strip() for line in file]
# Load the training texts and their labels. Both files are read as
# headerless, tab-separated tables.
train = pd.read_csv('train/in.tsv', sep='\t', header=None, encoding='utf8')
train_y = pd.read_csv('train/expected.tsv', sep='\t', header=None, encoding='utf8')

# Keep only the first 10000 rows and use the first column of each table.
train_y = train_y[0:10000][0]
train = train[0:10000]

# Tokenize every document into a list of lowercase tokens.
train_x = [gensim.utils.simple_preprocess(x) for x in train[0]]

from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# vector_size must match the input size of the classifier (FEATURES).
model = Word2Vec(sentences=train_x, vector_size=100, window=5, min_count=1, workers=4)
words = set(model.wv.index_to_key)


def _document_vector(tokens):
    """Average the word vectors of ``tokens`` into one fixed-size vector."""
    vectors = [model.wv[token] for token in tokens if token in words]
    if not vectors:
        # A document without any known token still needs a feature row.
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)


# One row per document. Stacking the raw per-token matrices instead yields a
# ragged object array (NumPy warns about it) that cannot be cast to float32
# or fed to a fixed-width linear layer.
train_x_vec = np.array([_document_vector(x) for x in train_x], dtype=np.float32)
# Default number of input features per sample.
FEATURES = 100


class NeuralNetworkModel(torch.nn.Module):
    """Feed-forward binary classifier with a single hidden layer.

    The network is ``Linear -> ReLU -> Linear -> sigmoid`` and produces one
    value in (0, 1) per input row.

    Args:
        input_size: Number of features per sample. Defaults to ``FEATURES``.
        hidden_size: Width of the hidden layer. Defaults to 500.
    """

    def __init__(self, input_size=FEATURES, hidden_size=500):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the sigmoid output for a batch ``x`` of feature rows."""
        hidden = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))
# Objects shared by the first training run and by get_loss_acc().
BATCH_SIZE = 40  # number of samples per mini-batch

nn_model = NeuralNetworkModel()

# Binary cross-entropy loss on the sigmoid output, optimized with plain SGD.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.1)
def get_loss_acc(model, data_x, data_y, batch_size=None, loss_fn=None):
    """Evaluate ``model`` on a data set and return ``(mean_loss, accuracy)``.

    The data is processed in mini-batches; the loss of every batch is
    weighted by its size so the result is the per-sample mean. A prediction
    counts as positive when the model output is greater than 0.5.

    Args:
        model: A ``torch.nn.Module``; it is switched to evaluation mode.
        data_x: Sliceable collection of feature rows convertible to float32.
        data_y: Sliceable collection of labels (array or pandas Series).
        batch_size: Mini-batch size. Defaults to the module-level
            ``BATCH_SIZE``.
        loss_fn: Loss callable ``loss_fn(predictions, targets)``. Defaults to
            the module-level ``criterion``.

    Returns:
        Tuple ``(loss, accuracy)`` averaged over all samples.

    Raises:
        ZeroDivisionError: If ``data_y`` is empty.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if loss_fn is None:
        loss_fn = criterion

    loss_score = 0.0
    acc_score = 0
    items_total = 0
    model.eval()
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for start in range(0, len(data_y), batch_size):
            stop = start + batch_size
            # Go through NumPy explicitly: building a tensor straight from a
            # pandas Series slice is fragile once its index no longer
            # starts at 0.
            X = torch.tensor(np.asarray(data_x[start:stop], dtype=np.float32))
            Y = torch.tensor(
                np.asarray(data_y[start:stop], dtype=np.float32)
            ).reshape(-1, 1)
            Y_predictions = model(X)
            batch_items = Y.shape[0]
            acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
            items_total += batch_items
            loss_score += loss_fn(Y_predictions, Y).item() * batch_items
    return (loss_score / items_total), (acc_score / items_total)
# Train nn_model for 5 epochs with mini-batch SGD and report the loss and
# accuracy on the training data after every epoch.
for epoch in range(5):
    # Running statistics over the batches of the current epoch.
    loss_score = 0
    acc_score = 0
    items_total = 0
    nn_model.train()
    # Iterate over every sample; the last batch may be shorter.
    for i in range(0, train_y.shape[0], BATCH_SIZE):
        # Convert through NumPy so that pandas slices whose index does not
        # start at 0 are handled as well.
        X = torch.tensor(np.asarray(train_x_vec[i:i + BATCH_SIZE], dtype=np.float32))
        Y = torch.tensor(
            np.asarray(train_y[i:i + BATCH_SIZE], dtype=np.float32)
        ).reshape(-1, 1)
        Y_predictions = nn_model(X)
        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
        items_total += Y.shape[0]
        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()
        loss_score += loss.item() * Y.shape[0]
    # print() works both in a notebook and in a plain script.
    print(epoch)
    # Evaluate the classifier being trained (not the Word2Vec model) on the
    # document vectors.
    print(get_loss_acc(nn_model, train_x_vec, train_y))
# Train the model.
# NOTE(review): x_train and y_train are not defined in the visible part of
# this file - confirm where they come from. y_train is assumed to be a pandas
# Series because .to_numpy() is called on its slices.
model = NeuralNetworkModel()
BATCH_SIZE = 5
# The number of epochs is its own setting; it was previously read from
# BATCH_SIZE, which only happened to have the same value.
EPOCHS = 5
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
for epoch in range(EPOCHS):
    model.train()
    for i in range(0, y_train.shape[0], BATCH_SIZE):
        X = x_train[i:i + BATCH_SIZE]
        X = torch.tensor(X)
        y = y_train[i:i + BATCH_SIZE]
        # Column vector of float labels, as BCELoss expects the same shape
        # as the model output.
        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)
        optimizer.zero_grad()
        outputs = model(X.float())
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
# Predict the results.
# NOTE(review): x_dev and x_test are not defined in the visible part of this
# file - confirm where they come from.
def _predict_labels(net, features, batch_size):
    """Return the thresholded outputs of ``net`` for ``features``.

    The data is processed in slices of ``batch_size`` rows. Every output
    greater than 0.5 becomes ``True``. The result is a list with one
    single-element list per sample, mirroring the (n, 1) output of the net.
    """
    labels = []
    net.eval()
    with torch.no_grad():
        for i in range(0, len(features), batch_size):
            batch = torch.tensor(features[i:i + batch_size])
            outputs = net(batch.float())
            labels += (outputs > 0.5).tolist()
    return labels


# Both sets go through the same helper, so the test predictions are computed
# from the test data with the same threshold as the dev predictions.
y_dev = _predict_labels(model, x_dev, BATCH_SIZE)
y_test = _predict_labels(model, x_test, BATCH_SIZE)