TFIDF proj commit

This commit is contained in:
Bartusiak 2020-05-05 19:28:18 +02:00
parent d53ce23da2
commit f3b94430e8
8 changed files with 200016 additions and 200009 deletions

199999
dev-0/out.tsv

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -26,7 +26,7 @@ def train():
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji
x = tfidf.fit_transform(created_dictionary) x = tfidf.fit_transform(created_dictionary)
#PCA - principal component analysis #PCA - principal component analysis
pca = TruncatedSVD(n_components=100) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych pca = TruncatedSVD(n_components=120) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych
x_pca = pca.fit_transform(x) x_pca = pca.fit_transform(x)
l_regression = LinearRegression() l_regression = LinearRegression()
l_regression.fit(x_pca,expected_dictionary) l_regression.fit(x_pca,expected_dictionary)

View File

@@ -13,20 +13,23 @@ def predict():
input_file = open("l_regression.pkl",'rb') input_file = open("l_regression.pkl",'rb')
l_regression = pickle.load(input_file) l_regression = pickle.load(input_file)
input_file = open("tfidf_model.pkl",'rb') input_file = open("tfidf_model.pkl",'rb')
tfidf = pickle.load(input_file) tfidf = pickle.load(input_file,encoding='UTF-8')
dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False) dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False, skip_blank_lines=False)
testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False) testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False, skip_blank_lines=False )
devtxt = dev0["txt"] #devtxt = dev0["txt"]
testAtxt = testA["txt"] #testAtxt = testA["txt"]
print(testAtxt) #print(testAtxt)
dev0_vector = tfidf.fit_transform(devtxt)
testA_vector = tfidf.fit_transform(testAtxt)
dev0_vector = tfidf.fit_transform(dev0['txt'].apply(lambda dev0_vector: np.str_(dev0_vector)))
testA_vector = tfidf.fit_transform(testA['txt'].apply(lambda testA_vector: np.str_(testA_vector)))
#dev0_vector = tfidf.fit_transform(dev0['txt'].values.astype('U'))
#testA_vector = tfidf.fit_transform(testA['txt'].values.astype('U'))
#dev0_vector = tfidf.fit_transform(dev0['txt'],y=None)
#testA_vector = tfidf.fit_transform(testA['txt'],y=None)
#print(testA_vector) #print(testA_vector)
pca = TruncatedSVD(n_components=100) pca = TruncatedSVD(n_components=120)
dev0_pca = pca.fit_transform(dev0_vector) dev0_pca = pca.fit_transform(dev0_vector)
testA_pca = pca.fit_transform(testA_vector) testA_pca = pca.fit_transform(testA_vector)

File diff suppressed because it is too large Load Diff

Binary file not shown.