This commit is contained in:
Jakub Kolasiński 2021-04-19 20:00:40 +02:00
parent 918dda14d2
commit 587bc42691
2 changed files with 5470 additions and 29 deletions

View File

# Module-level setup shared by train() and classify(): a TF-IDF vectorizer,
# a multinomial Naive Bayes classifier, and a Polish stopword list.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
stopwords = []
with open('stopwords') as f:
    stopwords = [line.rstrip() for line in f]

classifier = MultinomialNB()
vectorizer = TfidfVectorizer()
def preprocess(doc):
    """Lowercase *doc*, drop stopwords and empty tokens, re-join with spaces."""
    tokens = doc.lower().split(' ')
    kept = [tok for tok in tokens if tok != '' and tok not in stopwords]
    return ' '.join(kept)
def train():
    """Fit the global vectorizer and classifier on train/train.tsv.

    Each input line is "<label>\t<text>"; labels are parsed as ints and the
    text is stopword-filtered via preprocess() before TF-IDF fitting.
    """
    with open('train/train.tsv') as f:
        docs = [line.rstrip() for line in f]
    docs_preprocessed = []
    y = []
    for doc in docs:
        # first tab-separated field is the label, second is the document text
        fields = doc.split('\t')
        y.append(int(fields[0]))
        docs_preprocessed.append(preprocess(fields[1]))
    x = vectorizer.fit_transform(docs_preprocessed)
    classifier.fit(x, y)
def classify(path):
    """Predict labels for <path>in.tsv and write them, one per line, to <path>out.tsv."""
    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]
    docs_preprocessed = [preprocess(doc) for doc in docs]
    # BUG FIX: vectorize the preprocessed documents, not the raw ones, so
    # inference applies the same stopword filtering used during training.
    test_x = vectorizer.transform(docs_preprocessed)
    predictions = classifier.predict(test_x)
    with open(path + 'out.tsv', 'w') as out:
        for prediction in predictions:
            out.write("%i\n" % prediction)
# Script driver: train once, then write predictions for both evaluation splits.
train()
classify('dev-0/')
classify('test-A/')

5447
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff