Change to tfidf

This commit is contained in:
Jakub Kolasiński 2021-04-19 19:44:17 +02:00
parent d071aa92f7
commit 918dda14d2
3 changed files with 1459 additions and 1459 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
.idea

View File

@ -1,5 +1,6 @@
import string import string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler, normalize from sklearn.preprocessing import MinMaxScaler, normalize
@ -23,15 +24,15 @@ def train():
y.append(t[0]) y.append(t[0])
doc = t[1] doc = t[1]
doc = doc.lower().split(' ') doc = doc.lower().split(' ')
doc = [''.join(char for char in word if char not in string.punctuation) for word in doc] # doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]
doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc)) doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
doc = ' '.join(doc)
docs_preprocessed.append(doc) docs_preprocessed.append(doc)
y = [int(numeric_string) for numeric_string in y] y = [int(numeric_string) for numeric_string in y]
tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)]
global d2v_model global d2v_model
d2v_model = Doc2Vec(tagged_documents, epochs=300, dm=0) d2v_model = TfidfVectorizer()
X = d2v_model.dv.vectors X = d2v_model.fit_transform(docs_preprocessed)
X = scaler.fit_transform(X) # X = scaler.fit_transform(X)
classifier.fit(X, y) classifier.fit(X, y)
def classify(path): def classify(path):
@ -40,12 +41,10 @@ def classify(path):
docs_preprocessed = [] docs_preprocessed = []
for doc in docs: for doc in docs:
doc = doc.lower().split(' ') doc = doc.lower().split(' ')
doc = [''.join(char for char in word if char not in string.punctuation) for word in doc] # doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]
doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc)) doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
docs_preprocessed.append(doc) docs_preprocessed.append(doc)
test_vectors = [] test_vectors = d2v_model.transform(docs)
for doc in docs_preprocessed:
test_vectors.append(d2v_model.infer_vector(doc))
results = classifier.predict(test_vectors) results = classifier.predict(test_vectors)
with open(path + 'out.tsv', 'w') as file: with open(path + 'out.tsv', 'w') as file:
for result in results: for result in results:

File diff suppressed because it is too large Load Diff