Code cleanup. Solved the logistic regression task.

Jan Nowak 2021-05-25 20:45:06 +02:00
parent 12d7a869ff
commit 20a4e1e709
4 changed files with 11 additions and 89 deletions

View File

@@ -948,7 +948,7 @@
 0
 0
 1
-1
+0
 1
 1
 0

View File

@@ -1,18 +0,0 @@
-import gzip
-import gensim
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-import matplotlib.gridspec as gridspec
-from sklearn.preprocessing import LabelEncoder
-from sklearn.linear_model import LogisticRegression
-from gensim.models import Word2Vec
-
-w2v = gensim.models.Word2Vec(vector_size=100)
-w2v = Word2Vec.load("w2v.model")
-#w2v.wv.save_word2vec_format('world.txt', binary=False)
-#w2v.wv.load_word2vec_format('../../../ncexclude/nkjp+wiki-forms-all-100-cbow-hs.txt.gz', binary=False)
-#print(w2v.wv.most_similar(['gol']))
-print(w2v.wv.index_to_key)
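
Note: the deleted scratch file above only loads an already trained model ("w2v.model") and prints its vocabulary. For context, a minimal sketch of how such a file could be produced and queried with gensim 4.x; the toy corpus and hyperparameters are illustrative assumptions, not the repository's actual training setup:

from gensim.models import Word2Vec

# Hypothetical toy corpus: gensim expects an iterable of tokenized sentences.
sentences = [["gol", "w", "ostatniej", "minucie"],
             ["sedzia", "odgwizdal", "rzut", "karny"]]

# Train a small model; vector_size=100 matches the dimensionality used in this repo.
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=2)
model.save("w2v.model")

# What the deleted script did: load the model back and inspect it.
w2v = Word2Vec.load("w2v.model")
print(w2v.wv.index_to_key)           # full vocabulary, most frequent first
print(w2v.wv.most_similar(['gol']))  # nearest neighbours by cosine similarity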

View File

@@ -1,26 +0,0 @@
-import gzip
-import gensim
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-import matplotlib.gridspec as gridspec
-from sklearn.preprocessing import LabelEncoder
-from sklearn.linear_model import LogisticRegression
-from gensim.models import Word2Vec, KeyedVectors
-
-# train_X = []
-# train_y = []
-# with gzip.open('train/train.tsv.gz','r') as fin:
-#     for line in fin:
-#         sline = line.decode('UTF-8').replace("\n", "").split("\t")
-#         train_y.append(sline[0])
-#         train_X.append(''.join(sline[1:]))
-# w2v = gensim.models.Word2Vec(list(train_X), vector_size=100, window=10, min_count=2, epochs=5, workers=2)
-#w2v = gensim.models.Word2Vec(vector_size=100)
-#w2v.wv.load_word2vec_format('../../../ncexclude/nkjp+wiki-forms-all-100-cbow-hs.txt.gz', binary=False)
-#w2v.wv.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
-w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
-w2v.save("word2vec2.wordvectors")

View File

@@ -1,18 +1,13 @@
-#import numpy as np
 import gzip
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn import metrics
 import pandas as pd
 import numpy as np
-import gensim
-from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors
-from sklearn.linear_model import LogisticRegression
+from gensim.models import KeyedVectors
 import re
 import torch
-from torch.utils.data import Dataset, TensorDataset, DataLoader
+from torch.utils.data import TensorDataset, DataLoader
 
 def get_str_cleaned(str_dirty):
     punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'
@@ -22,7 +17,6 @@ def get_str_cleaned(str_dirty):
         new_str = new_str.replace(char,'')
     return new_str
 
-#df = pd.read_csv('sport-text-classification-ball-ISI-public/train/train.tsv.gz', compression='gzip', header=None, sep='\t', error_bad_lines=False)
 train_X = []
 train_y = []
 with gzip.open('train/train.tsv.gz','r') as fin:
@@ -35,31 +29,23 @@ with gzip.open('train/train.tsv.gz','r') as fin:
         train_X.append(cleared)
 
 train_X_data = pd.DataFrame(train_X)
 
-#w2v = gensim.models.Word2Vec(vector_size=100)
-# #w2v.wv.load_word2vec_format('../../../ncexclude/nkjp+wiki-forms-all-100-cbow-hs.txt.gz', binary=False)
-#w2v.wv.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
-#w2v = Word2Vec.load("w2v.model")
-#w2v.wv.init_sims()
-#w2v.wv.load("word2vec.wordvectors")
-#w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
-w2v = KeyedVectors.load("word2vec2.wordvectors")
-#print(list(w2v.index_to_key))
+# The corpora can be downloaded from:
+# http://dsmodels.nlp.ipipan.waw.pl/dsmodels/nkjp+wiki-forms-all-100-cbow-hs.txt.gz
+# http://dsmodels.nlp.ipipan.waw.pl/dsmodels/wiki-forms-all-100-skipg-ns.txt.gz
+#w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/nkjp+wiki-forms-all-100-cbow-hs.txt.gz', binary=False)
+w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
+#w2v.save("word2vec.wordvectors")
+#w2v = KeyedVectors.load("word2vec.wordvectors")
 
 def document_vector(doc):
-    #print(doc)
-    #doc = [word for word in doc if word in w2v.index_to_key]
+    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
     try:
         doc2 = []
         doc = doc.split(' ')
         for word in doc:
-            #print(get_str_cleaned(word))
-            #print(word)
-            #print(w2v.wv.index_to_key)
             if word in w2v:
                 doc2.append(word)
         return np.mean(w2v[doc2], axis=0)
     except:
-        print(doc)
         return np.zeros(100)
 
 train_X_data = train_X_data[train_X_data.columns[0]].apply(document_vector)
@@ -78,8 +64,6 @@ with open('dev-0/expected.tsv','r') as dev_expected_file:
 dev_X_data = pd.DataFrame(dev_X)
 dev_X_data = dev_X_data[dev_X_data.columns[0]].apply(document_vector)
 
-# X_train_vec = list(train_X_data['doc_vector'])
-# X_dev_vec = list(dev_X_data['doc_vector'])
 
 class LogisticRegressionModel(torch.nn.Module):
@@ -111,24 +95,6 @@ dev_loader = DataLoader(dataset=dev_dataset)
 n_epochs = 2
 
-# loss_score = 0
-# acc_score = 0
-# items_total = 0
-# for x_batch, y_batch in train_loader:
-#     lr_model.train()
-#     # Makes predictions
-#     yhat = lr_model(x_batch)
-#     # Computes loss
-#     loss = criterion(yhat, y_batch.unsqueeze(1))
-#     # Computes gradients
-#     loss.backward()
-#     # Updates parameters and zeroes gradients
-#     optimizer.step()
-#     optimizer.zero_grad()
-#     loss_score += loss.item() * yhat.shape[0]
-# print(loss_score)
 
 def make_train_step(model, loss_fn, optimizer):
     def train_step(x, y):
         model.train()
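
Note: the diff shows only the header of LogisticRegressionModel and the first lines of make_train_step. For readability, a minimal self-contained sketch of the pattern the new code uses: a one-layer logistic regression over the 100-dimensional averaged document vectors, trained through a train-step closure. The layer size, loss, optimizer, and learning rate are assumptions consistent with the snippet, not necessarily the exact values in the repository:

import torch

class LogisticRegressionModel(torch.nn.Module):
    def __init__(self, input_dim=100):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, 1)  # 100-dim doc vector -> 1 logit

    def forward(self, x):
        return torch.sigmoid(self.linear(x))  # probability of the positive class

def make_train_step(model, loss_fn, optimizer):
    # Returns a closure that runs one optimization step and reports the loss.
    def train_step(x, y):
        model.train()
        yhat = model(x)
        loss = loss_fn(yhat, y.unsqueeze(1))  # match yhat's (batch, 1) shape
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        return loss.item()
    return train_step

model = LogisticRegressionModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
train_step = make_train_step(model, criterion, optimizer)

# Toy batch standing in for the averaged word2vec document vectors.
x = torch.randn(8, 100)
y = torch.randint(0, 2, (8,)).float()
print(train_step(x, y))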