Code cleanup. Solved the logistic regression task.
parent 12d7a869ff
commit 20a4e1e709
@@ -948,7 +948,7 @@
 0
 0
 1
-1
+0
 1
 1
 0
@@ -1,18 +0,0 @@
-import gzip
-import gensim
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-import matplotlib.gridspec as gridspec
-from sklearn.preprocessing import LabelEncoder
-from sklearn.linear_model import LogisticRegression
-from gensim.models import Word2Vec
-
-w2v = gensim.models.Word2Vec(vector_size=100)
-w2v = Word2Vec.load("w2v.model")
-#w2v.wv.save_word2vec_format('world.txt', binary=False)
-
-#w2v.wv.load_word2vec_format('../../../ncexclude/nkjp+wiki-forms-all-100-cbow-hs.txt.gz', binary=False)
-
-#print(w2v.wv.most_similar(['gol']))
-print(w2v.wv.index_to_key)
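
Note: this deleted scratch file only inspected a locally trained model. Under gensim 4.x (which the vector_size/index_to_key API above implies), that kind of inspection looks roughly like the following sketch; it is illustrative, not part of the commit:

    from gensim.models import Word2Vec

    w2v = Word2Vec.load("w2v.model")
    # The vocabulary, most frequent words first.
    print(w2v.wv.index_to_key[:10])
    # Nearest neighbours of a query word, guarded against OOV lookups.
    if 'gol' in w2v.wv:
        print(w2v.wv.most_similar('gol', topn=5))
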
@@ -1,26 +0,0 @@
-import gzip
-import gensim
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-import matplotlib.gridspec as gridspec
-from sklearn.preprocessing import LabelEncoder
-from sklearn.linear_model import LogisticRegression
-from gensim.models import Word2Vec, KeyedVectors
-
-# train_X = []
-# train_y = []
-# with gzip.open('train/train.tsv.gz','r') as fin:
-#     for line in fin:
-#         sline = line.decode('UTF-8').replace("\n", "").split("\t")
-#         train_y.append(sline[0])
-#         train_X.append(''.join(sline[1:]))
-
-# w2v = gensim.models.Word2Vec(list(train_X), vector_size=100, window=10, min_count=2, epochs=5, workers=2)
-
-#w2v = gensim.models.Word2Vec(vector_size=100)
-#w2v.wv.load_word2vec_format('../../../ncexclude/nkjp+wiki-forms-all-100-cbow-hs.txt.gz', binary=False)
-#w2v.wv.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
-
-w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
-w2v.save("word2vec2.wordvectors")
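
This second deleted script served one purpose that survives in the main file: converting the gzipped text-format embeddings once and caching them in gensim's fast native format. The pattern, assuming gensim 4.x and the dsmodels paths used above, is roughly:

    from gensim.models import KeyedVectors

    # Slow one-time parse of the text-format embeddings...
    w2v = KeyedVectors.load_word2vec_format(
        '../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
    # ...cached in gensim's native format for fast subsequent loads.
    w2v.save("word2vec2.wordvectors")

    # Later runs skip the parse entirely:
    w2v = KeyedVectors.load("word2vec2.wordvectors")
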
@@ -1,18 +1,13 @@
-#import numpy as np
 import gzip
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn import metrics
 import pandas as pd
 import numpy as np
 
-import gensim
-from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors
+from gensim.models import KeyedVectors
 
-from sklearn.linear_model import LogisticRegression
 import re
 import torch
-from torch.utils.data import Dataset, TensorDataset, DataLoader
+from torch.utils.data import TensorDataset, DataLoader
 
 def get_str_cleaned(str_dirty):
     punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'
@@ -22,7 +17,6 @@ def get_str_cleaned(str_dirty):
         new_str = new_str.replace(char,'')
     return new_str
 
-#df = pd.read_csv('sport-text-classification-ball-ISI-public/train/train.tsv.gz', compression='gzip', header=None, sep='\t', error_bad_lines=False)
 train_X = []
 train_y = []
 with gzip.open('train/train.tsv.gz','r') as fin:
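
Only the tail of get_str_cleaned is visible in this diff (the punctuation constant and the per-character replace loop). A plausible reading of the full function, offered as an assumption rather than the committed code:

    def get_str_cleaned(str_dirty):
        # Assumed reconstruction: lowercase, then strip punctuation
        # one character at a time, matching the loop tail shown above.
        punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'
        new_str = str_dirty.lower()
        for char in punctuation:
            new_str = new_str.replace(char, '')
        return new_str
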
@@ -35,31 +29,23 @@ with gzip.open('train/train.tsv.gz','r') as fin:
         train_X.append(cleared)
 
 train_X_data = pd.DataFrame(train_X)
-#w2v = gensim.models.Word2Vec(vector_size=100)
-# #w2v.wv.load_word2vec_format('../../../ncexclude/nkjp+wiki-forms-all-100-cbow-hs.txt.gz', binary=False)
-#w2v.wv.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
-#w2v = Word2Vec.load("w2v.model")
-#w2v.wv.init_sims()
-#w2v.wv.load("word2vec.wordvectors")
-#w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
-w2v = KeyedVectors.load("word2vec2.wordvectors")
-#print(list(w2v.index_to_key))
+# The corpora can be downloaded from:
+# http://dsmodels.nlp.ipipan.waw.pl/dsmodels/nkjp+wiki-forms-all-100-cbow-hs.txt.gz
+# http://dsmodels.nlp.ipipan.waw.pl/dsmodels/wiki-forms-all-100-skipg-ns.txt.gz
+#w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/nkjp+wiki-forms-all-100-cbow-hs.txt.gz', binary=False)
+w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
+#w2v.save("word2vec.wordvectors")
+#w2v = KeyedVectors.load("word2vec.wordvectors")
+
 def document_vector(doc):
-    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
-    #print(doc)
-    #doc = [word for word in doc if word in w2v.index_to_key]
     try:
         doc2 = []
         doc = doc.split(' ')
         for word in doc:
-            #print(get_str_cleaned(word))
-            #print(word)
-            #print(w2v.wv.index_to_key)
             if word in w2v:
                 doc2.append(word)
         return np.mean(w2v[doc2], axis=0)
     except:
-        print(doc)
         return np.zeros(100)
 
 train_X_data = train_X_data[train_X_data.columns[0]].apply(document_vector)
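
In document_vector above, the bare except chiefly covers documents where no word survives the vocabulary filter (indexing w2v with an empty list raises). An equivalent, more explicit variant of the same averaging idea (a sketch, not part of the commit; document_vector_safe is a hypothetical name):

    def document_vector_safe(doc, dim=100):
        # Keep only the words the embedding model knows.
        words = [w for w in doc.split(' ') if w in w2v]
        if not words:
            # No in-vocabulary words: fall back to the zero vector, as above.
            return np.zeros(dim)
        # Average the word vectors into one document vector.
        return np.mean(w2v[words], axis=0)
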
@@ -78,8 +64,6 @@ with open('dev-0/expected.tsv','r') as dev_expected_file:
 dev_X_data = pd.DataFrame(dev_X)
 dev_X_data = dev_X_data[dev_X_data.columns[0]].apply(document_vector)
 
-# X_train_vec = list(train_X_data['doc_vector'])
-# X_dev_vec = list(dev_X_data['doc_vector'])
 
 class LogisticRegressionModel(torch.nn.Module):
 
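
The body of LogisticRegressionModel lies outside this diff's context. For the 100-dimensional document vectors built above, the canonical torch formulation is a single linear layer followed by a sigmoid; a sketch under that assumption, not the committed class:

    class LogisticRegressionSketch(torch.nn.Module):
        def __init__(self, n_features=100):
            super().__init__()
            # Logistic regression = one linear layer squashed by a sigmoid.
            self.linear = torch.nn.Linear(n_features, 1)

        def forward(self, x):
            return torch.sigmoid(self.linear(x))

A model of this shape is typically paired with torch.nn.BCELoss, which would match the criterion(yhat, y_batch.unsqueeze(1)) call visible in the deleted training loop below.
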
@@ -111,24 +95,6 @@ dev_loader = DataLoader(dataset=dev_dataset)
 
 n_epochs = 2
 
-# loss_score = 0
-# acc_score = 0
-# items_total = 0
-# for x_batch, y_batch in train_loader:
-#     lr_model.train()
-#     # Makes predictions
-#     yhat = lr_model(x_batch)
-#     # Computes loss
-#     loss = criterion(yhat, y_batch.unsqueeze(1))
-#     # Computes gradients
-#     loss.backward()
-#     # Updates parameters and zeroes gradients
-#     optimizer.step()
-#     optimizer.zero_grad()
-
-#     loss_score += loss.item() * yhat.shape[0]
-#     print(loss_score)
-
 def make_train_step(model, loss_fn, optimizer):
     def train_step(x, y):
         model.train()
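
The diff cuts make_train_step off after model.train(). The standard closure pattern it opens with, i.e. the refactored form of the deleted inline loop above, continues roughly as follows (a sketch; the actual body may differ):

    def make_train_step(model, loss_fn, optimizer):
        # Returns a closure that performs one optimization step on a batch.
        def train_step(x, y):
            model.train()
            yhat = model(x)            # forward pass
            loss = loss_fn(yhat, y)    # compute batch loss
            loss.backward()            # backpropagate gradients
            optimizer.step()           # update parameters
            optimizer.zero_grad()      # reset gradients for the next batch
            return loss.item()
        return train_step

    # Typical usage with the names appearing elsewhere in this file:
    # train_step = make_train_step(lr_model, criterion, optimizer)
    # for epoch in range(n_epochs):
    #     for x_batch, y_batch in train_loader:
    #         loss = train_step(x_batch, y_batch.unsqueeze(1))
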