Change to tfidf
This commit is contained in:
parent
d071aa92f7
commit
918dda14d2
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
.idea
|
@ -1,5 +1,6 @@
|
|||||||
import string
|
import string
|
||||||
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
|
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
from sklearn.cluster import KMeans
|
from sklearn.cluster import KMeans
|
||||||
from sklearn.naive_bayes import MultinomialNB
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
from sklearn.preprocessing import MinMaxScaler, normalize
|
from sklearn.preprocessing import MinMaxScaler, normalize
|
||||||
@ -23,15 +24,15 @@ def train():
|
|||||||
y.append(t[0])
|
y.append(t[0])
|
||||||
doc = t[1]
|
doc = t[1]
|
||||||
doc = doc.lower().split(' ')
|
doc = doc.lower().split(' ')
|
||||||
doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]
|
# doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]
|
||||||
doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
|
doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
|
||||||
|
doc = ' '.join(doc)
|
||||||
docs_preprocessed.append(doc)
|
docs_preprocessed.append(doc)
|
||||||
y = [int(numeric_string) for numeric_string in y]
|
y = [int(numeric_string) for numeric_string in y]
|
||||||
tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)]
|
|
||||||
global d2v_model
|
global d2v_model
|
||||||
d2v_model = Doc2Vec(tagged_documents, epochs=300, dm=0)
|
d2v_model = TfidfVectorizer()
|
||||||
X = d2v_model.dv.vectors
|
X = d2v_model.fit_transform(docs_preprocessed)
|
||||||
X = scaler.fit_transform(X)
|
# X = scaler.fit_transform(X)
|
||||||
classifier.fit(X, y)
|
classifier.fit(X, y)
|
||||||
|
|
||||||
def classify(path):
|
def classify(path):
|
||||||
@ -40,12 +41,10 @@ def classify(path):
|
|||||||
docs_preprocessed = []
|
docs_preprocessed = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
doc = doc.lower().split(' ')
|
doc = doc.lower().split(' ')
|
||||||
doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]
|
# doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]
|
||||||
doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
|
doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
|
||||||
docs_preprocessed.append(doc)
|
docs_preprocessed.append(doc)
|
||||||
test_vectors = []
|
test_vectors = d2v_model.transform(docs)
|
||||||
for doc in docs_preprocessed:
|
|
||||||
test_vectors.append(d2v_model.infer_vector(doc))
|
|
||||||
results = classifier.predict(test_vectors)
|
results = classifier.predict(test_vectors)
|
||||||
with open(path + 'out.tsv', 'w') as file:
|
with open(path + 'out.tsv', 'w') as file:
|
||||||
for result in results:
|
for result in results:
|
||||||
|
2900
dev-0/out.tsv
2900
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user