"""Ball-sport text classification: data loading, cleaning and embedding setup.

Builds averaged word-vector representations of documents (gensim
KeyedVectors) and prepares PyTorch datasets for a logistic-regression
classifier trained further down in this script.
"""
import gzip
import re

import gensim
import numpy as np
import pandas as pd
import torch
from gensim.models import KeyedVectors, Phrases, Word2Vec, phrases
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from torch.utils.data import DataLoader, Dataset, TensorDataset

# Dimensionality of the pretrained word vectors; the zero-vector fallback
# in document_vector must match it.  NOTE(review): assumes the loaded
# KeyedVectors file is 100-dimensional — confirm against the model file.
EMBEDDING_DIM = 100

# Punctuation stripped from raw text; translation table built once.
_PUNCTUATION = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
_PUNCT_TABLE = str.maketrans('', '', _PUNCTUATION)
_MULTI_SPACE = re.compile(' +')


def get_str_cleaned(str_dirty):
    """Lowercase *str_dirty*, collapse runs of spaces and strip punctuation."""
    new_str = _MULTI_SPACE.sub(' ', str_dirty.lower())
    return new_str.translate(_PUNCT_TABLE)


# Training data: gzipped TSV, column 0 = label (int), remaining columns = text.
# Rows whose text cleans down to the empty string are dropped.
train_X = []
train_y = []
with gzip.open('train/train.tsv.gz', 'r') as fin:
    for line in fin:
        sline = line.decode('UTF-8').replace("\n", "").split("\t")
        cleared = get_str_cleaned(''.join(sline[1:]))
        if len(cleared) > 0:
            train_y.append(int(sline[0]))
            train_X.append(cleared)

train_X_data = pd.DataFrame(train_X)

# Pretrained word embeddings (saved gensim KeyedVectors).
w2v = KeyedVectors.load("word2vec2.wordvectors")


def document_vector(doc):
    """Embed a document as the mean of its in-vocabulary word vectors.

    Out-of-vocabulary words are skipped.  A document with no known words
    maps to the zero vector (the previous bare-except version let
    np.mean run on an empty selection, yielding a NaN vector).
    """
    known = [word for word in doc.split(' ') if word in w2v]
    if not known:
        return np.zeros(EMBEDDING_DIM)
    return np.mean(w2v[known], axis=0)


train_X_data = train_X_data[train_X_data.columns[0]].apply(document_vector)

# Dev split: one cleaned document per line, labels in a parallel file.
dev_X = []
with open('dev-0/in.tsv', 'r') as dev_in_file:
    for line in dev_in_file:
        dev_X.append(get_str_cleaned(line.rstrip('\n')))

dev_y = []
with open('dev-0/expected.tsv', 'r') as dev_expected_file:
    for line in dev_expected_file:
        dev_y.append(int(line.rstrip('\n')))

dev_X_data = pd.DataFrame(dev_X)
dev_X_data = dev_X_data[dev_X_data.columns[0]].apply(document_vector)


class LogisticRegressionModel(torch.nn.Module):
    """Logistic regression: one linear layer + sigmoid over doc embeddings."""

    def __init__(self):
        super(LogisticRegressionModel, self).__init__()
        self.fc = torch.nn.Linear(EMBEDDING_DIM, 1)

    def forward(self, x):
        return torch.sigmoid(self.fc(x))


lr_model = LogisticRegressionModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(lr_model.parameters(), lr=0.1)

# Stack the per-document vectors into one (n_docs, EMBEDDING_DIM) ndarray
# before handing them to torch — torch.tensor on an object-dtype Series of
# arrays is not reliably supported.
train_x_tensor = torch.tensor(np.stack(train_X_data.values)).float()
train_y_tensor = torch.tensor(train_y).float()
train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
train_loader = DataLoader(dataset=train_dataset)

dev_x_tensor = torch.tensor(np.stack(dev_X_data.values)).float()
dev_y_tensor = torch.tensor(dev_y).float()
dev_dataset = TensorDataset(dev_x_tensor, dev_y_tensor)
dev_loader = DataLoader(dataset=dev_dataset)

n_epochs = 2
def make_train_step(model, loss_fn, optimizer):
    """Return a closure that performs one SGD step and reports the batch loss.

    The closure puts *model* in train mode, runs a forward pass, computes
    *loss_fn* against the (batch, 1)-shaped targets, backpropagates, steps
    the optimizer and clears the gradients.
    """
    def train_step(x, y):
        model.train()
        yhat = model(x)
        loss = loss_fn(yhat, y.unsqueeze(1))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        return loss.item()
    return train_step


train_step = make_train_step(lr_model, criterion, optimizer)

training_losses = []
validation_losses = []
for epoch in range(n_epochs):
    y_pred = []  # dev predictions, rebuilt each epoch (last epoch is written out)
    losses = []
    for x_batch, y_batch in train_loader:
        losses.append(train_step(x_batch, y_batch))
    training_loss = np.mean(losses)
    training_losses.append(training_loss)

    # Evaluation on the dev split.  eval() is hoisted out of the loop —
    # it only flips the module's mode flag and need not be repeated per batch.
    lr_model.eval()
    with torch.no_grad():
        val_losses = []
        for x_val, y_val in dev_loader:
            yhat = lr_model(x_val)
            # DataLoader batch size is 1, so yhat holds a single probability.
            y_pred.append(int(yhat.item() > 0.5))
            val_losses.append(criterion(yhat, y_val.unsqueeze(1)).item())
        validation_loss = np.mean(val_losses)
        validation_losses.append(validation_loss)

    print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t Validation loss: {validation_loss:.3f}")

score1 = metrics.accuracy_score(dev_y, y_pred)
print("accuracy: %0.5f" % score1)

# Dev predictions, one 0/1 label per line.
with open('dev-0/out.tsv', "w") as out_file:
    for pred in y_pred:
        out_file.write("{}\n".format(pred))

# Test split: clean, embed, predict with the trained model, write labels.
test_X = []
with open('test-A/in.tsv', 'r') as test_in_file:
    for line in test_in_file:
        test_X.append(get_str_cleaned(line.rstrip('\n')))

test_X_data = pd.DataFrame(test_X)
test_X_data = test_X_data[test_X_data.columns[0]].apply(document_vector)
# Stack into a real ndarray first; torch.tensor on an object-dtype Series
# of arrays is not reliably supported.
test_x_tensor = torch.tensor(np.stack(test_X_data.values)).float()

val_y_pred = []
lr_model.eval()
with torch.no_grad():
    for x_val in test_x_tensor:
        yhat = lr_model(x_val)
        val_y_pred.append(int(yhat.item() > 0.5))

with open('test-A/out.tsv', "w") as out_file:
    for pred in val_y_pred:
        out_file.write("{}\n".format(pred))