TFIDF proj commit
This commit is contained in:
parent
d53ce23da2
commit
f3b94430e8
199999
dev-0/out.tsv
199999
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
BIN
l_regression.pkl
BIN
l_regression.pkl
Binary file not shown.
@ -26,7 +26,7 @@ def train():
|
|||||||
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji
|
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji
|
||||||
x = tfidf.fit_transform(created_dictionary)
|
x = tfidf.fit_transform(created_dictionary)
|
||||||
#PCA - principal component analysis
|
#PCA - principal component analysis
|
||||||
pca = TruncatedSVD(n_components=100) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych
|
pca = TruncatedSVD(n_components=120) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych
|
||||||
x_pca = pca.fit_transform(x)
|
x_pca = pca.fit_transform(x)
|
||||||
l_regression = LinearRegression()
|
l_regression = LinearRegression()
|
||||||
l_regression.fit(x_pca,expected_dictionary)
|
l_regression.fit(x_pca,expected_dictionary)
|
||||||
|
23
predict.py
23
predict.py
@ -13,20 +13,23 @@ def predict():
|
|||||||
input_file = open("l_regression.pkl",'rb')
|
input_file = open("l_regression.pkl",'rb')
|
||||||
l_regression = pickle.load(input_file)
|
l_regression = pickle.load(input_file)
|
||||||
input_file = open("tfidf_model.pkl",'rb')
|
input_file = open("tfidf_model.pkl",'rb')
|
||||||
tfidf = pickle.load(input_file)
|
tfidf = pickle.load(input_file,encoding='UTF-8')
|
||||||
|
|
||||||
dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False)
|
dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False, skip_blank_lines=False)
|
||||||
testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False)
|
testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False, skip_blank_lines=False )
|
||||||
devtxt = dev0["txt"]
|
#devtxt = dev0["txt"]
|
||||||
testAtxt = testA["txt"]
|
#testAtxt = testA["txt"]
|
||||||
|
|
||||||
print(testAtxt)
|
#print(testAtxt)
|
||||||
|
|
||||||
dev0_vector = tfidf.fit_transform(devtxt)
|
|
||||||
testA_vector = tfidf.fit_transform(testAtxt)
|
|
||||||
|
|
||||||
|
dev0_vector = tfidf.fit_transform(dev0['txt'].apply(lambda dev0_vector: np.str_(dev0_vector)))
|
||||||
|
testA_vector = tfidf.fit_transform(testA['txt'].apply(lambda testA_vector: np.str_(testA_vector)))
|
||||||
|
#dev0_vector = tfidf.fit_transform(dev0['txt'].values.astype('U'))
|
||||||
|
#testA_vector = tfidf.fit_transform(testA['txt'].values.astype('U'))
|
||||||
|
#dev0_vector = tfidf.fit_transform(dev0['txt'],y=None)
|
||||||
|
#testA_vector = tfidf.fit_transform(testA['txt'],y=None)
|
||||||
#print(testA_vector)
|
#print(testA_vector)
|
||||||
pca = TruncatedSVD(n_components=100)
|
pca = TruncatedSVD(n_components=120)
|
||||||
|
|
||||||
dev0_pca = pca.fit_transform(dev0_vector)
|
dev0_pca = pca.fit_transform(dev0_vector)
|
||||||
testA_pca = pca.fit_transform(testA_vector)
|
testA_pca = pca.fit_transform(testA_vector)
|
||||||
|
199997
test-A/out.tsv
199997
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
BIN
tfidf_model.pkl
BIN
tfidf_model.pkl
Binary file not shown.
Loading…
Reference in New Issue
Block a user