TFIDF proj commit

This commit is contained in:
Bartusiak 2020-05-05 19:28:18 +02:00
parent d53ce23da2
commit f3b94430e8
8 changed files with 200016 additions and 200009 deletions

View File

@ -99997,4 +99997,4 @@ Why is it that in the US people are just picking up random disguises that are no
5429-7980-8121 5429-7980-8121
If I was a Bayern fan I'd be pumped that a player was so enthusiastic about coming to the club that he's doing this. Start of a beautiful relationship, hopefully. If I was a Bayern fan I'd be pumped that a player was so enthusiastic about coming to the club that he's doing this. Start of a beautiful relationship, hopefully.
Honestly I enjoyed it quite a lot. Despite my getting tired of HG's, I found it to be a fun few hours. Honestly I enjoyed it quite a lot. Despite my getting tired of HG's, I found it to be a fun few hours.
I have absolutely no idea! Vine? I have absolutely no idea! Vine?

Can't render this file because it is too large.

199999
dev-0/out.tsv

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -26,7 +26,7 @@ def train():
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji
x = tfidf.fit_transform(created_dictionary) x = tfidf.fit_transform(created_dictionary)
#PCA - principal component analysis #PCA - principal component analysis
pca = TruncatedSVD(n_components=100) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych pca = TruncatedSVD(n_components=120) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych
x_pca = pca.fit_transform(x) x_pca = pca.fit_transform(x)
l_regression = LinearRegression() l_regression = LinearRegression()
l_regression.fit(x_pca,expected_dictionary) l_regression.fit(x_pca,expected_dictionary)

View File

@ -13,20 +13,23 @@ def predict():
input_file = open("l_regression.pkl",'rb') input_file = open("l_regression.pkl",'rb')
l_regression = pickle.load(input_file) l_regression = pickle.load(input_file)
input_file = open("tfidf_model.pkl",'rb') input_file = open("tfidf_model.pkl",'rb')
tfidf = pickle.load(input_file) tfidf = pickle.load(input_file,encoding='UTF-8')
dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False) dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False, skip_blank_lines=False)
testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False) testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False, skip_blank_lines=False )
devtxt = dev0["txt"] #devtxt = dev0["txt"]
testAtxt = testA["txt"] #testAtxt = testA["txt"]
print(testAtxt) #print(testAtxt)
dev0_vector = tfidf.fit_transform(devtxt)
testA_vector = tfidf.fit_transform(testAtxt)
dev0_vector = tfidf.fit_transform(dev0['txt'].apply(lambda dev0_vector: np.str_(dev0_vector)))
testA_vector = tfidf.fit_transform(testA['txt'].apply(lambda testA_vector: np.str_(testA_vector)))
#dev0_vector = tfidf.fit_transform(dev0['txt'].values.astype('U'))
#testA_vector = tfidf.fit_transform(testA['txt'].values.astype('U'))
#dev0_vector = tfidf.fit_transform(dev0['txt'],y=None)
#testA_vector = tfidf.fit_transform(testA['txt'],y=None)
#print(testA_vector) #print(testA_vector)
pca = TruncatedSVD(n_components=100) pca = TruncatedSVD(n_components=120)
dev0_pca = pca.fit_transform(dev0_vector) dev0_pca = pca.fit_transform(dev0_vector)
testA_pca = pca.fit_transform(testA_vector) testA_pca = pca.fit_transform(testA_vector)

View File

@ -99997,4 +99997,4 @@ Mine still is.
MUSTY_BALLSACK and ANAL_QUEEN's incestual offspring. MUSTY_BALLSACK and ANAL_QUEEN's incestual offspring.
Good point on the Toxicology report: came back positive for marijuana, but that's it. Good point on the Toxicology report: came back positive for marijuana, but that's it.
This sounds right initially but then you see a fly head butt the same inch of glass 300 times. Flies are not clever creatures This sounds right initially but then you see a fly head butt the same inch of glass 300 times. Flies are not clever creatures
I don't actually "need" anything right now, but that never stops me from getting mite flavours I don't actually "need" anything right now, but that never stops me from getting mite flavours

Can't render this file because it is too large.

File diff suppressed because it is too large Load Diff

Binary file not shown.