Change document features from Doc2Vec to TF-IDF

2021-04-19 19:44:17 +02:00 · 2021-04-19 19:44:17 +02:00 · 918dda14d2
commit 918dda14d2
parent d071aa92f7
3 changed files with 1459 additions and 1459 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+.idea
--- a/classifier.py
+++ b/classifier.py
@@ -1,5 +1,6 @@
 import string
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.preprocessing import MinMaxScaler, normalize
@@ -23,15 +24,15 @@ def train():
        y.append(t[0])
        doc = t[1]
        doc = doc.lower().split(' ')
-        doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]
+        # doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]
        doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
+        doc = ' '.join(doc)
        docs_preprocessed.append(doc)
    y = [int(numeric_string) for numeric_string in y]
-    tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)]
    global d2v_model
-    d2v_model = Doc2Vec(tagged_documents, epochs=300, dm=0)
-    X = d2v_model.dv.vectors
-    X = scaler.fit_transform(X)
+    d2v_model = TfidfVectorizer()
+    X = d2v_model.fit_transform(docs_preprocessed)
+    # X = scaler.fit_transform(X)
    classifier.fit(X, y)

 def classify(path):
@@ -40,12 +41,10 @@ def classify(path):
    docs_preprocessed = []
    for doc in docs:
        doc = doc.lower().split(' ')
-        doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]
+        # doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]
        doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
        docs_preprocessed.append(doc)
-    test_vectors = []
-    for doc in docs_preprocessed:
-        test_vectors.append(d2v_model.infer_vector(doc))
+    test_vectors = d2v_model.transform(docs)
    results = classifier.predict(test_vectors)
    with open(path + 'out.tsv', 'w') as file:
        for result in results:
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv