import pandas as pd
from gensim.utils import simple_preprocess
from gensim.parsing.porter import PorterStemmer
from gensim import corpora
import torch
import torch.nn as nn
import torch.optim as optim


class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()
        # Linear function 1: input_dim --> hidden_dim
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # Non-linearity 1
        self.relu1 = nn.ReLU()
        # Linear function 2: hidden_dim --> hidden_dim
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        # Non-linearity 2
        self.relu2 = nn.ReLU()
        # Linear function 3 (readout): hidden_dim --> output_dim
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Linear function 1
        out = self.fc1(x)
        # Non-linearity 1
        out = self.relu1(out)
        # Linear function 2
        out = self.fc2(out)
        # Non-linearity 2
        out = self.relu2(out)
        # Linear function 3 (readout); return raw logits,
        # since nn.CrossEntropyLoss applies log-softmax internally
        return self.fc3(out)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_expected = pd.read_csv('train/expected.tsv', header=None, sep='\t')
train_df = pd.read_csv('train/in.tsv', header=None, sep='\t')
# test_df = pd.read_csv('test-A/in.tsv', header=None, sep='\t')
test_df = pd.read_csv('dev-0/in.tsv', header=None, sep='\t')
y_train = pd.DataFrame(train_expected[0])

# Tokenize (lowercase, strip accents) and stem both datasets
train_df[0] = [simple_preprocess(text, deacc=True) for text in train_df[0]]
porter_stemmer = PorterStemmer()
train_df['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens]
                              for tokens in train_df[0]]
test_df[0] = [simple_preprocess(text, deacc=True) for text in test_df[0]]
test_df['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens]
                             for tokens in test_df[0]]
x_test = pd.DataFrame(test_df['stemmed_tokens'])
x_train = pd.DataFrame(train_df['stemmed_tokens'])


def make_dict(top_data_df_small, padding=True):
    if padding:
        print("Dictionary with padding token added")
        review_dict = corpora.Dictionary([['pad']])
        review_dict.add_documents(top_data_df_small['stemmed_tokens'])
    else:
        print("Dictionary without padding")
        review_dict = corpora.Dictionary(top_data_df_small['stemmed_tokens'])
    return review_dict


# Make the dictionary without padding for the basic models
review_dict = make_dict(train_df, padding=False)

VOCAB_SIZE = len(review_dict)
NUM_LABELS = 2


# Function to make the bag-of-words vector used as input to the network
def make_bow_vector(review_dict, sentence):
    vec = torch.zeros(VOCAB_SIZE, dtype=torch.float32, device=device)
    for word in sentence:
        # Skip tokens that never occurred in the training data;
        # otherwise unseen test tokens raise a KeyError
        token_id = review_dict.token2id.get(word)
        if token_id is not None:
            vec[token_id] += 1
    return vec.view(1, -1)


input_dim = VOCAB_SIZE
hidden_dim = 10
output_dim = NUM_LABELS
num_epochs = 2

ff_nn_bow_model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
ff_nn_bow_model.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(ff_nn_bow_model.parameters(), lr=0.001)
losses = []


def make_target(label):
    # Labels are already 0/1; wrap each in a 1-element LongTensor
    # as expected by nn.CrossEntropyLoss
    return torch.tensor([label], dtype=torch.long, device=device)


# Start training
for epoch in range(num_epochs):
    print(f"Epoch number: {epoch}")
    train_loss = 0
    for index, row in x_train.iterrows():
        # Clear the accumulated gradients
        optimizer.zero_grad()
        # Make the bag-of-words vector for the stemmed tokens
        bow_vec = make_bow_vector(review_dict, row['stemmed_tokens'])
        # Forward pass to get the output logits
        logits = ff_nn_bow_model(bow_vec)
        # Get the target label
        target = make_target(y_train[0][index])
        # Calculate loss: logits --> cross-entropy loss
        loss = loss_function(logits, target)
        # Accumulate the loss over the epoch
        train_loss += loss.item()
        # Get gradients w.r.t. parameters
        loss.backward()
        # Update parameters
        optimizer.step()
    losses.append(train_loss)
    print(f"Epoch {epoch + 1} completed, total training loss: {train_loss:.4f}")

# Predict on the test set (argmax over logits equals argmax over softmax)
bow_ff_nn_predictions = []
with torch.no_grad():
    for index, row in x_test.iterrows():
        bow_vec = make_bow_vector(review_dict, row['stemmed_tokens'])
        logits = ff_nn_bow_model(bow_vec)
        bow_ff_nn_predictions.append(torch.argmax(logits, dim=1).cpu().numpy()[0])
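
# The predictions above are only collected in memory. A minimal sketch for
# persisting and sanity-checking them, assuming the usual convention for this
# train/dev-0/test-A directory layout: one predicted label per line written to
# dev-0/out.tsv, with gold labels (if present) in dev-0/expected.tsv. Both
# file names are assumptions, not something the script above establishes.
with open('dev-0/out.tsv', 'w') as out_file:
    for prediction in bow_ff_nn_predictions:
        out_file.write(f"{prediction}\n")

# Optional accuracy check against the assumed dev-0/expected.tsv
try:
    dev_expected = pd.read_csv('dev-0/expected.tsv', header=None, sep='\t')
    accuracy = (dev_expected[0].to_numpy() == bow_ff_nn_predictions).mean()
    print(f"dev-0 accuracy: {accuracy:.4f}")
except FileNotFoundError:
    print("dev-0/expected.tsv not found; skipping accuracy check")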