sport-text-classification-b.../classifier.py

57 lines
2.0 KiB
Python
Raw Normal View History

2021-04-19 19:17:10 +02:00
import string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler, normalize
# Feature scaler; fitted on the Doc2Vec document vectors inside train().
scaler = MinMaxScaler()

# Naive Bayes model; fitted on the scaled document vectors inside train().
classifier = MultinomialNB()

# Stopword list, one entry per line of the local 'stopwords' file.
# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
with open('stopwords') as f:
    stopwords = list(map(str.rstrip, f))
def train():
    """Fit the Doc2Vec model, the feature scaler and the classifier.

    Reads 'train/train.tsv', where every line is expected to hold an integer
    label and the document text separated by a tab. Each text is lower-cased,
    split on single spaces, stripped of ASCII punctuation, and filtered of
    stopwords and empty tokens.

    Side effects:
        * binds the module-level ``d2v_model`` to a Doc2Vec model trained on
          the preprocessed documents (``dm=0``, i.e. PV-DBOW, 300 epochs);
        * fits the module-level ``scaler`` on the learned document vectors;
        * fits the module-level ``classifier`` on the scaled vectors.

    Raises:
        IndexError: if a line contains no tab separator.
        ValueError: if a label is not an integer literal.
    """
    global d2v_model

    # Built once per call: a translation table that deletes punctuation in a
    # single pass, and a set so each stopword test is O(1) instead of a scan
    # over the whole stopword list for every token.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stopword_set = set(stopwords)

    with open('train/train.tsv') as f:
        docs = [line.rstrip() for line in f]

    docs_preprocessed = []
    y = []
    for doc in docs:
        fields = doc.split('\t')
        label, text = fields[0], fields[1]
        y.append(int(label))
        words = (word.translate(strip_punctuation)
                 for word in text.lower().split(' '))
        docs_preprocessed.append(
            [word for word in words if word and word not in stopword_set]
        )

    # Tag every document with its position so that row i of the learned
    # document-vector matrix corresponds to label y[i].
    tagged_documents = [
        TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)
    ]
    d2v_model = Doc2Vec(tagged_documents, epochs=300, dm=0)

    # Doc2Vec vectors contain negative components; min-max scaling maps the
    # training features into [0, 1] before they are handed to MultinomialNB.
    X = scaler.fit_transform(d2v_model.dv.vectors)
    classifier.fit(X, y)
def classify(path):
    """Predict a label for every document in ``path + 'in.tsv'``.

    Each input line is preprocessed exactly like the training texts
    (lower-cased, split on single spaces, ASCII punctuation removed,
    stopwords and empty tokens dropped), embedded with the trained Doc2Vec
    model, scaled with the scaler fitted during training and passed to the
    fitted classifier. One integer label per input line is written to
    ``path + 'out.tsv'``.

    ``train()`` must have been called first: this function relies on the
    module-level ``d2v_model``, ``scaler`` and ``classifier`` it sets up.

    Args:
        path: Directory prefix including the trailing separator,
            e.g. ``'dev-0/'``; file names are appended to it verbatim.
    """
    # Same single-pass punctuation removal and O(1) stopword lookup as used
    # for the training data, so both sides see identical tokens.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stopword_set = set(stopwords)

    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]

    docs_preprocessed = []
    for doc in docs:
        words = (word.translate(strip_punctuation)
                 for word in doc.lower().split(' '))
        docs_preprocessed.append(
            [word for word in words if word and word not in stopword_set]
        )

    test_vectors = [d2v_model.infer_vector(doc) for doc in docs_preprocessed]

    if test_vectors:
        # The classifier was fitted on min-max scaled vectors, so the inferred
        # vectors must go through the same (already fitted) scaler; feeding
        # raw vectors would put train and test features on different scales.
        # NOTE(review): the scaler is created without clipping, so values
        # outside the training range are passed through as-is.
        results = classifier.predict(scaler.transform(test_vectors))
    else:
        # Empty input file: nothing to predict, emit an empty output file.
        results = []

    with open(path + 'out.tsv', 'w') as file:
        file.writelines("%i\n" % result for result in results)
# Script entry point: fit the models on the training set, then label the
# dev-0 split (reads 'dev-0/in.tsv', writes 'dev-0/out.tsv').
train()
classify('dev-0/')
# NOTE(review): the disabled call below passes n_clusters, which classify()
# does not accept; drop that argument before re-enabling it.
# classify('test-A/', n_clusters=10)