Refactor

2021-04-19 20:00:40 +02:00 · 2021-04-19 20:00:40 +02:00 · 587bc42691
commit 587bc42691
parent 918dda14d2
2 changed files with 5470 additions and 29 deletions
--- a/classifier.py
+++ b/classifier.py
@@ -1,11 +1,5 @@
-import string
-from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.cluster import KMeans
 from sklearn.naive_bayes import MultinomialNB
-from sklearn.preprocessing import MinMaxScaler, normalize
-
-scaler = MinMaxScaler()

 stopwords = []
 # stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
@@ -13,6 +7,15 @@ with open('stopwords') as f:
    stopwords = [line.rstrip() for line in f]

 classifier = MultinomialNB()
+vectorizer = TfidfVectorizer()
+
+
+def preprocess(doc):
+    """Lowercase *doc* and strip stopwords and empty tokens.
+
+    The text is split on single spaces; tokens present in the module-level
+    ``stopwords`` list and empty tokens (produced by repeated spaces) are
+    dropped, and the remaining tokens are re-joined with single spaces.
+    """
+    tokens = doc.lower().split(' ')
+    kept = [token for token in tokens if token != '' and token not in stopwords]
+    return ' '.join(kept)
+

 def train():
    with open('train/train.tsv') as f:
@@ -20,37 +23,28 @@ def train():
    docs_preprocessed = []
    y = []
    for doc in docs:
-        t = doc.split('\t')
-        y.append(t[0])
-        doc = t[1]
-        doc = doc.lower().split(' ')
-        # doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]
-        doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
-        doc = ' '.join(doc)
-        docs_preprocessed.append(doc)
-    y = [int(numeric_string) for numeric_string in y]
-    global d2v_model
-    d2v_model = TfidfVectorizer()
-    X = d2v_model.fit_transform(docs_preprocessed)
-    # X = scaler.fit_transform(X)
-    classifier.fit(X, y)
+        y_with_doc = doc.split('\t')
+        y.append(y_with_doc[0])
+        doc = y_with_doc[1]
+        docs_preprocessed.append(preprocess(doc))
+    y = [int(value) for value in y]
+    x = vectorizer.fit_transform(docs_preprocessed)
+    classifier.fit(x, y)
+

 def classify(path):
+    """Predict a label for every line of ``path + 'in.tsv'``.
+
+    Each input line is right-stripped, run through ``preprocess`` and
+    vectorized with the module-level ``vectorizer``; the module-level
+    ``classifier`` then predicts one label per line. Predictions are written
+    to ``path + 'out.tsv'``, one integer per line.
+
+    ``path`` is concatenated directly with the file names, so it must end
+    with a path separator (e.g. ``'dev-0/'``).
+    """
    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]
    docs_preprocessed = []
    for doc in docs:
-        doc = doc.lower().split(' ')
-        # doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]
-        doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
-        docs_preprocessed.append(doc)
-    test_vectors = d2v_model.transform(docs)
-    results = classifier.predict(test_vectors)
+        docs_preprocessed.append(preprocess(doc))
+    # Vectorize the preprocessed text (not the raw lines) so inference sees
+    # the same input form the vectorizer was fitted on in train().
+    test_x = vectorizer.transform(docs_preprocessed)
+    predictions = classifier.predict(test_x)
    with open(path + 'out.tsv', 'w') as file:
-        for result in results:
-            file.write("%i\n" % result)
+        for prediction in predictions:
+            file.write("%i\n" % prediction)


 train()
 classify('dev-0/')
-# classify('test-A/', n_clusters=10)
+# Run the classifier on 'test-A/' as well, writing test-A/out.tsv.
+classify('test-A/')
--- a/test-A/out.tsv
+++ b/test-A/out.tsv