From 20a4e1e70907d8a6cc0c4f7d37f3f694bb97cf08 Mon Sep 17 00:00:00 2001
From: Jan Nowak
Date: Tue, 25 May 2021 20:45:06 +0200
Subject: [PATCH] Code cleanup. Solved the logistic regression task.

---
 dev-0/out.tsv      |  2 +-
 mlrlrozwiazanie.py | 18 ----------------
 mrlrozwiazanie.py  | 26 ----------------------
 rlrozwiazanie.py   | 54 +++++++++-------------------------------------
 4 files changed, 11 insertions(+), 89 deletions(-)
 delete mode 100644 mlrlrozwiazanie.py
 delete mode 100644 mrlrozwiazanie.py

diff --git a/dev-0/out.tsv b/dev-0/out.tsv
index ee6c023..b3669cb 100644
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
@@ -948,7 +948,7 @@
 0
 0
 1
-1
+0
 1
 1
 0
diff --git a/mlrlrozwiazanie.py b/mlrlrozwiazanie.py
deleted file mode 100644
index 39562c9..0000000
--- a/mlrlrozwiazanie.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import gzip
-import gensim
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-import matplotlib.gridspec as gridspec
-from sklearn.preprocessing import LabelEncoder
-from sklearn.linear_model import LogisticRegression
-from gensim.models import Word2Vec
-
-w2v = gensim.models.Word2Vec(vector_size=100)
-w2v = Word2Vec.load("w2v.model")
-#w2v.wv.save_word2vec_format('world.txt', binary=False)
-
-#w2v.wv.load_word2vec_format('../../../ncexclude/nkjp+wiki-forms-all-100-cbow-hs.txt.gz', binary=False)
-
-#print(w2v.wv.most_similar(['gol']))
-print(w2v.wv.index_to_key)
\ No newline at end of file
diff --git a/mrlrozwiazanie.py b/mrlrozwiazanie.py
deleted file mode 100644
index 867e17e..0000000
--- a/mrlrozwiazanie.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import gzip
-import gensim
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-import matplotlib.gridspec as gridspec
-from sklearn.preprocessing import LabelEncoder
-from sklearn.linear_model import LogisticRegression
-from gensim.models import Word2Vec, KeyedVectors
-
-# train_X = []
-# train_y = []
-# with gzip.open('train/train.tsv.gz','r') as fin:
-#     for line in fin:
-#         sline = line.decode('UTF-8').replace("\n", "").split("\t")
-#         train_y.append(sline[0])
-#         train_X.append(''.join(sline[1:]))
-
-# w2v = gensim.models.Word2Vec(list(train_X), vector_size=100, window=10, min_count=2, epochs=5, workers=2)
-
-#w2v = gensim.models.Word2Vec(vector_size=100)
-#w2v.wv.load_word2vec_format('../../../ncexclude/nkjp+wiki-forms-all-100-cbow-hs.txt.gz', binary=False)
-#w2v.wv.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
-
-w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
-w2v.save("word2vec2.wordvectors")
\ No newline at end of file
diff --git a/rlrozwiazanie.py b/rlrozwiazanie.py
index 0b2ca02..c437a0a 100644
--- a/rlrozwiazanie.py
+++ b/rlrozwiazanie.py
@@ -1,18 +1,13 @@
-#import numpy as np
 import gzip
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn import metrics
 import pandas as pd
 import numpy as np
 
-import gensim
-from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors
+from gensim.models import KeyedVectors
 
-from sklearn.linear_model import LogisticRegression
 import re
 import torch
-from torch.utils.data import Dataset, TensorDataset, DataLoader
+from torch.utils.data import TensorDataset, DataLoader
 
 def get_str_cleaned(str_dirty):
     punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'
@@ -22,7 +17,6 @@ def get_str_cleaned(str_dirty):
         new_str = new_str.replace(char,'')
     return new_str
 
-#df = pd.read_csv('sport-text-classification-ball-ISI-public/train/train.tsv.gz', compression='gzip', header=None, sep='\t', error_bad_lines=False)
 train_X = []
 train_y = []
 with gzip.open('train/train.tsv.gz','r') as fin:
@@ -35,31 +29,23 @@ with gzip.open('train/train.tsv.gz','r') as fin:
         train_X.append(cleared)
 
 train_X_data = pd.DataFrame(train_X)
-#w2v = gensim.models.Word2Vec(vector_size=100)
-# #w2v.wv.load_word2vec_format('../../../ncexclude/nkjp+wiki-forms-all-100-cbow-hs.txt.gz', binary=False)
-#w2v.wv.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
-#w2v = Word2Vec.load("w2v.model")
-#w2v.wv.init_sims()
-#w2v.wv.load("word2vec.wordvectors")
-#w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
-w2v = KeyedVectors.load("word2vec2.wordvectors")
-#print(list(w2v.index_to_key))
+#The corpora can be downloaded from:
+#http://dsmodels.nlp.ipipan.waw.pl/dsmodels/nkjp+wiki-forms-all-100-cbow-hs.txt.gz
+#http://dsmodels.nlp.ipipan.waw.pl/dsmodels/wiki-forms-all-100-skipg-ns.txt.gz
+#w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/nkjp+wiki-forms-all-100-cbow-hs.txt.gz', binary=False)
+w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
+#w2v.save("word2vec.wordvectors")
+#w2v = KeyedVectors.load("word2vec.wordvectors")
+
 def document_vector(doc):
-    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
-    #print(doc)
-    #doc = [word for word in doc if word in w2v.index_to_key]
     try:
         doc2 = []
         doc = doc.split(' ')
         for word in doc:
-            #print(get_str_cleaned(word))
-            #print(word)
-            #print(w2v.wv.index_to_key)
             if word in w2v:
                 doc2.append(word)
         return np.mean(w2v[doc2], axis=0)
     except:
-        print(doc)
         return np.zeros(100)
 
 train_X_data = train_X_data[train_X_data.columns[0]].apply(document_vector)
@@ -78,8 +64,6 @@ with open('dev-0/expected.tsv','r') as dev_expected_file:
 dev_X_data = pd.DataFrame(dev_X)
 dev_X_data = dev_X_data[dev_X_data.columns[0]].apply(document_vector)
 
-# X_train_vec = list(train_X_data['doc_vector'])
-# X_dev_vec = list(dev_X_data['doc_vector'])
 
 
 class LogisticRegressionModel(torch.nn.Module):
@@ -111,24 +95,6 @@ dev_loader = DataLoader(dataset=dev_dataset)
 
 n_epochs = 2
 
-# loss_score = 0
-# acc_score = 0
-# items_total = 0
-# for x_batch, y_batch in train_loader:
-#     lr_model.train()
-#     # Makes predictions
-#     yhat = lr_model(x_batch)
-#     # Computes loss
-#     loss = criterion(yhat, y_batch.unsqueeze(1))
-#     # Computes gradients
-#     loss.backward()
-#     # Updates parameters and zeroes gradients
-#     optimizer.step()
-#     optimizer.zero_grad()
-
-#     loss_score += loss.item() * yhat.shape[0]
-# print(loss_score)
-
 def make_train_step(model, loss_fn, optimizer):
     def train_step(x, y):
         model.train()
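
Note on the document_vector function this patch keeps: it embeds each text as the mean of the word vectors of its in-vocabulary words, falling back to a zero vector when nothing is known. Below is a minimal, self-contained sketch of that averaging step. The three-word toy vocabulary and random vectors are stand-ins for the real wiki-forms-all-100-skipg-ns embeddings, and an explicit empty-list guard is used in place of the bare except in the patched file.

import numpy as np
from gensim.models import KeyedVectors

VECTOR_SIZE = 100

# Build a tiny in-memory KeyedVectors instead of loading the real corpus.
rng = np.random.default_rng(0)
toy_words = ['mecz', 'gol', 'bramka']
w2v = KeyedVectors(vector_size=VECTOR_SIZE)
w2v.add_vectors(toy_words,
                rng.normal(size=(len(toy_words), VECTOR_SIZE)).astype(np.float32))

def document_vector(doc):
    """Average the vectors of in-vocabulary words; zeros if none are known."""
    known = [word for word in doc.split(' ') if word in w2v]
    if not known:                                  # avoid np.mean over an empty list
        return np.zeros(VECTOR_SIZE, dtype=np.float32)
    return np.mean(w2v[known], axis=0)             # (VECTOR_SIZE,) document embedding

print(document_vector('mecz gol xyz').shape)       # (100,) - 'xyz' is skipped
print(document_vector('xyz'))                      # all zeros, no known words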
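
The diff references LogisticRegressionModel, make_train_step, criterion and optimizer, but their bodies fall outside the shown hunks. The sketch below is one plausible reading, not the patch's actual code: it assumes a single Linear(100, 1) layer with a sigmoid output trained with BCELoss and SGD. Only the train-step structure (forward, loss, backward, step, zero_grad, with y_batch.unsqueeze(1) as the target) is taken from the commented-out loop the patch deletes.

import torch
from torch.utils.data import TensorDataset, DataLoader

class LogisticRegressionModel(torch.nn.Module):
    """Assumed body: one linear layer squashed through a sigmoid."""
    def __init__(self, input_dim=100):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, 1)

    def forward(self, x):
        return torch.sigmoid(self.linear(x))

def make_train_step(model, loss_fn, optimizer):
    """Return a closure that runs one optimization step, as in the patch."""
    def train_step(x, y):
        model.train()
        yhat = model(x)                     # forward pass
        loss = loss_fn(yhat, y.unsqueeze(1))  # match yhat's (batch, 1) shape
        loss.backward()                     # compute gradients
        optimizer.step()                    # update parameters
        optimizer.zero_grad()               # reset gradients for the next batch
        return loss.item()
    return train_step

# Toy random data in place of the averaged document vectors.
X = torch.randn(8, 100)
y = torch.randint(0, 2, (8,)).float()
train_loader = DataLoader(TensorDataset(X, y), batch_size=4)

lr_model = LogisticRegressionModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(lr_model.parameters(), lr=0.1)
train_step = make_train_step(lr_model, criterion, optimizer)

for epoch in range(2):                      # n_epochs = 2 in the patch
    for x_batch, y_batch in train_loader:
        print(train_step(x_batch, y_batch))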