42 lines
1.6 KiB
Python
42 lines
1.6 KiB
Python
from sklearn.feature_extraction.text import TfidfVectorizer
|
||
import numpy as np
|
||
import sklearn.metrics
|
||
from sklearn.cluster import KMeans
|
||
|
||
# Stop-word list taken from https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
|
||
def remove_stop_words(document, stopwords):
    """Return *document* with every token contained in *stopwords* removed.

    The text is tokenized with ``str.split()`` (any run of whitespace is a
    separator) and the surviving tokens are re-joined with single spaces, so
    tabs, repeated spaces and a trailing newline do not survive. Matching is
    exact and case-sensitive.

    Args:
        document: Text to filter.
        stopwords: Container of words to drop; anything supporting ``in``.

    Returns:
        The filtered text as a single space-separated string.
    """
    kept_tokens = []
    for token in document.split():
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)
|
||
|
||
def write_predictions_to_file(predictions, file):
    """Write every prediction on its own line to the file at path *file*.

    The file is opened in text write mode, so existing content is replaced.

    Args:
        predictions: Iterable of values; each is converted with ``str()``.
        file: Path of the output file.
    """
    with open(file, "w") as sink:
        sink.writelines(str(label) + '\n' for label in predictions)
|
||
|
||
def load_stop_words(path='stopwords.txt'):
    """Read a stop-word list from a text file holding one word per line.

    Args:
        path: Location of the stop-word file. Defaults to ``'stopwords.txt'``,
            which is resolved against the current working directory.

    Returns:
        A list with one entry per line of the file, in file order, each
        stripped of surrounding whitespace (a blank line yields ``""``).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8: the list contains Polish words, and the
    # platform default encoding (e.g. a Windows code page) would garble them.
    with open(path, encoding="utf-8") as stopwords_file:
        # Strip each line so the trailing newline (and stray spaces) cannot
        # prevent a stop word from matching a document token.
        return [stopword.strip() for stopword in stopwords_file]
|
||
|
||
def load_tsv_file(file):
    """Load documents from a text file and remove stop words from each one.

    Every line of the file is treated as one document. Each line is passed
    through ``remove_stop_words``, which splits on whitespace and re-joins
    with single spaces, so tab separators and the trailing newline are
    collapsed in the result.

    Args:
        file: Path of the input file.

    Returns:
        A list of filtered documents, one per input line, in file order.

    Raises:
        OSError: If the input file or the stop-word file cannot be opened.
        UnicodeDecodeError: If the input file is not valid UTF-8.
    """
    # Build the lookup set once: membership tests against a list are linear
    # in the number of stop words and would be repeated for every token.
    stopwords = set(load_stop_words())
    # Decode explicitly as UTF-8 so non-ASCII text is read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(file, encoding="utf-8") as in_file:
        return [remove_stop_words(document, stopwords) for document in in_file]
|
||
|
||
def prepare_prediction(catalog, *, n_clusters=20, max_iter=1000, random_state=None):
    """Cluster the documents of one data directory and save the labels.

    Reads ``<catalog>/in.tsv``, turns the documents into TF-IDF vectors,
    clusters them with K-means and writes one cluster index per document to
    ``<catalog>/out.tsv``.

    Args:
        catalog: Directory containing ``in.tsv``; ``out.tsv`` is written there.
        n_clusters: Number of K-means clusters.
        max_iter: Maximum number of K-means iterations per run.
        random_state: Seed forwarded to ``KMeans``. ``None`` (the default)
            keeps the original non-deterministic initialization; pass an int
            to make a run repeatable.
    """
    documents = load_tsv_file(catalog + "/in.tsv")
    document_vectors = TfidfVectorizer().fit_transform(documents)

    # With n_clusters around 20 the scores reached about 0.85 (according to
    # GEval). With n_clusters=50, however, the scores were close to each other
    # on every run, whereas with n=20 they fluctuate between 0.7 and 0.85.
    clusterer = KMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        random_state=random_state,
    )
    predictions = clusterer.fit_predict(document_vectors)
    write_predictions_to_file(predictions, catalog + "/out.tsv")
|
||
|
||
# Produce cluster assignments for both data directories, in this order.
for data_dir in ("dev-0", "test-A"):
    prepare_prediction(data_dir)
|