TFIDF proj commit

This commit is contained in:
Bartusiak 2020-05-05 14:52:12 +02:00
parent 830a36db52
commit 05327feaf1
6 changed files with 120084 additions and 99860 deletions

109968
dev-0/out.tsv

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -13,14 +13,14 @@ from sklearn.linear_model import LinearRegression
# Read the text file at `in_path` and return a flat list of the word tokens found in it.
# NOTE(review): this text is a rendered diff -- indentation is stripped and removed/added
# lines appear back to back -- so the exact nesting of the statements below cannot be
# confirmed from here.
def create_dictionary(in_path):
# Despite the name, this is a list: tokens are appended in reading order and
# duplicates are kept.
tfDict = []
# Appears to be the removed/added pair of one assignment (50000 -> 60000).
# The name is not read anywhere in the visible lines of this function.
max_iteration = 50000
max_iteration = 60000
# Counter compared against the cut-off below.
i=0;
# The file is decoded as UTF-8 and consumed one line at a time.
with open(in_path,encoding='utf-8') as in_file:
for line in in_file:
# Each maximal run of word characters (regex \w) on the line is one token.
for word in re.findall(r"[\w]+",line):
tfDict.append(word)
# NOTE(review): with indentation lost it is unclear whether `i` counts lines or
# tokens, and therefore which loop the `break` below leaves -- confirm against
# the real file.
i+=1
# Appears to be the removed/added pair of one condition (50054 -> 60014): reading
# stops once the counter reaches the hard-coded limit. The limit differs from
# `max_iteration` above.
if(i>=50054):
if(i>=60014):
break
return tfDict
##
@ -32,7 +32,7 @@ def train():
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji
x = tfidf.fit_transform(created_dictionary)
#PCA - principal component analysis
pca = TruncatedSVD(n_components=200) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych
pca = TruncatedSVD(n_components=300) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych
x_pca = pca.fit_transform(x)
l_regression = LinearRegression()
l_regression.fit(x_pca,expected_dictionary)

View File

@ -17,7 +17,7 @@ def predict():
testA_vector = tfidf.fit_transform(testA)
#print(testA_vector)
pca = TruncatedSVD(n_components=200)
pca = TruncatedSVD(n_components=300)
dev0_pca = pca.fit_transform(dev0_vector)
testA_pca = pca.fit_transform(testA_vector)

File diff suppressed because it is too large Load Diff

Binary file not shown.