This commit is contained in:
Jakub Kolasiński 2021-04-19 20:00:40 +02:00
parent 918dda14d2
commit 587bc42691
2 changed files with 5470 additions and 29 deletions

View File

# Module-level setup shared by train() and classify(): a TF-IDF vectorizer,
# a multinomial Naive Bayes classifier, and a Polish stopword list.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
stopwords = []
with open('stopwords') as f:
    stopwords = [line.rstrip() for line in f]

classifier = MultinomialNB()
vectorizer = TfidfVectorizer()
def preprocess(doc):
    """Lowercase *doc*, drop stopwords and empty tokens, re-join with spaces."""
    tokens = doc.lower().split(' ')
    kept = [tok for tok in tokens if tok != '' and tok not in stopwords]
    return ' '.join(kept)
def train():
    """Fit the global vectorizer and classifier on train/train.tsv.

    Each input line is "<label>\t<text>"; labels are parsed as ints and the
    text is stopword-filtered via preprocess() before TF-IDF fitting.
    """
    with open('train/train.tsv') as f:
        docs = [line.rstrip() for line in f]
    docs_preprocessed = []
    y = []
    for doc in docs:
        # first tab-separated field is the label, second is the document text
        fields = doc.split('\t')
        y.append(int(fields[0]))
        docs_preprocessed.append(preprocess(fields[1]))
    x = vectorizer.fit_transform(docs_preprocessed)
    classifier.fit(x, y)
def classify(path):
    """Predict labels for <path>in.tsv and write them, one per line, to <path>out.tsv."""
    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]
    docs_preprocessed = [preprocess(doc) for doc in docs]
    # BUG FIX: vectorize the preprocessed documents, not the raw ones, so
    # inference applies the same stopword filtering used during training.
    test_x = vectorizer.transform(docs_preprocessed)
    predictions = classifier.predict(test_x)
    with open(path + 'out.tsv', 'w') as out:
        for prediction in predictions:
            out.write("%i\n" % prediction)
# Script driver: train once, then write predictions for both evaluation splits.
train()
classify('dev-0/')
classify('test-A/')

5447
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff