TFIDF proj commit
This commit is contained in:
parent
830a36db52
commit
05327feaf1
109968
dev-0/out.tsv
109968
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
BIN
l_regression.pkl
BIN
l_regression.pkl
Binary file not shown.
@ -13,14 +13,14 @@ from sklearn.linear_model import LinearRegression
|
||||
|
||||
def create_dictionary(in_path):
|
||||
tfDict = []
|
||||
max_iteration = 50000
|
||||
max_iteration = 60000
|
||||
i=0;
|
||||
with open(in_path,encoding='utf-8') as in_file:
|
||||
for line in in_file:
|
||||
for word in re.findall(r"[\w]+",line):
|
||||
tfDict.append(word)
|
||||
i+=1
|
||||
if(i>=50054):
|
||||
if(i>=60014):
|
||||
break
|
||||
return tfDict
|
||||
##
|
||||
@ -32,7 +32,7 @@ def train():
|
||||
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji
|
||||
x = tfidf.fit_transform(created_dictionary)
|
||||
#PCA - principal component analysis
|
||||
pca = TruncatedSVD(n_components=200) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych
|
||||
pca = TruncatedSVD(n_components=300) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych
|
||||
x_pca = pca.fit_transform(x)
|
||||
l_regression = LinearRegression()
|
||||
l_regression.fit(x_pca,expected_dictionary)
|
||||
|
@ -17,7 +17,7 @@ def predict():
|
||||
testA_vector = tfidf.fit_transform(testA)
|
||||
|
||||
#print(testA_vector)
|
||||
pca = TruncatedSVD(n_components=200)
|
||||
pca = TruncatedSVD(n_components=300)
|
||||
|
||||
dev0_pca = pca.fit_transform(dev0_vector)
|
||||
testA_pca = pca.fit_transform(testA_vector)
|
||||
|
109968
test-A/out.tsv
109968
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
BIN
tfidf_model.pkl
BIN
tfidf_model.pkl
Binary file not shown.
Loading…
Reference in New Issue
Block a user