sport-text-classification-b.../classifier.py

import string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler, normalize

# Module-level scaler; the only use of it visible in this file is commented out.
scaler = MinMaxScaler()

# Polish stopword list, loaded from the local `stopwords` file (one entry per line).
# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
stopwords = []
with open('stopwords') as stopwords_file:
    # Trailing whitespace (including the newline) is dropped from each entry.
    stopwords = list(map(str.rstrip, stopwords_file))

# Shared Naive Bayes model: fitted by train(), queried by classify().
classifier = MultinomialNB()

def train():
    """Fit the TF-IDF vectorizer and the Naive Bayes classifier on the training set.

    Reads ``train/train.tsv``, where each line holds an integer label and the
    document text separated by a tab. Each text is lower-cased, split on single
    spaces, stripped of stopwords and empty tokens, and re-joined into one
    string before being vectorized. Blank lines are skipped.

    Side effects:
        Binds the module-level ``d2v_model`` to the fitted ``TfidfVectorizer``
        (the name is kept because ``classify`` looks it up under that name)
        and fits the module-level ``classifier`` in place.

    Raises:
        IndexError: if a non-blank line has no tab-separated text field.
        ValueError: if a label cannot be parsed as an integer.
    """
    global d2v_model

    # Build the lookup set once: membership tests against the module-level
    # list rescan the whole list for every single token.
    stopword_set = set(stopwords)

    labels = []
    docs_preprocessed = []
    with open('train/train.tsv') as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # A blank line (e.g. at end of file) carries no sample.
                continue
            fields = line.split('\t')
            label, text = fields[0], fields[1]
            labels.append(int(label))
            words = text.lower().split(' ')
            docs_preprocessed.append(
                ' '.join(w for w in words if w and w not in stopword_set)
            )

    d2v_model = TfidfVectorizer()
    X = d2v_model.fit_transform(docs_preprocessed)
    classifier.fit(X, labels)

def classify(path):
    """Predict a label for every document in ``path + 'in.tsv'``.

    Each input line is treated as one document and receives the same
    preprocessing that ``train`` applies to the training text: lower-casing,
    splitting on single spaces, dropping stopwords and empty tokens, and
    re-joining into one string. The cleaned strings are vectorized with the
    module-level ``d2v_model`` and labelled by the module-level ``classifier``.
    One integer label per input line is written to ``path + 'out.tsv'``, in
    input order.

    Args:
        path: Directory prefix, concatenated as-is with the file names, so it
            must end with a path separator (e.g. ``'dev-0/'``).

    Raises:
        NameError: if ``train`` has not been called yet, because
            ``d2v_model`` is only created there.
    """
    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]

    # Build the lookup set once instead of scanning the list for every token.
    stopword_set = set(stopwords)

    # No line is skipped here (blank ones become empty documents) so that the
    # output stays aligned line-for-line with the input.
    docs_preprocessed = [
        ' '.join(
            word for word in doc.lower().split(' ')
            if word and word not in stopword_set
        )
        for doc in docs
    ]

    # Vectorize the cleaned text, not the raw lines, so the features match
    # what the vectorizer and classifier were fitted on.
    test_vectors = d2v_model.transform(docs_preprocessed)
    results = classifier.predict(test_vectors)

    with open(path + 'out.tsv', 'w') as file:
        for result in results:
            file.write("%i\n" % result)


# Script driver: fit the model on the training set, then label the dev-0
# split (classify reads dev-0/in.tsv and writes dev-0/out.tsv).
train()
classify('dev-0/')
# NOTE(review): the disabled call below passes n_clusters, which classify(path)
# does not accept; drop that argument before re-enabling it.
# classify('test-A/', n_clusters=10)
no message 2021-04-19 19:17:10 +02:00			`import string`
			`from gensim.models.doc2vec import Doc2Vec, TaggedDocument`
Change to tfidf 2021-04-19 19:44:17 +02:00			`from sklearn.feature_extraction.text import TfidfVectorizer`
no message 2021-04-19 19:17:10 +02:00			`from sklearn.cluster import KMeans`
			`from sklearn.naive_bayes import MultinomialNB`
			`from sklearn.preprocessing import MinMaxScaler, normalize`

			`scaler = MinMaxScaler()`

			`stopwords = []`
			`# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt`
			`with open('stopwords') as f:`
			`stopwords = [line.rstrip() for line in f]`

			`classifier = MultinomialNB()`

			`def train():`
			`with open('train/train.tsv') as f:`
			`docs = [line.rstrip() for line in f]`
			`docs_preprocessed = []`
			`y = []`
			`for doc in docs:`
			`t = doc.split('\t')`
			`y.append(t[0])`
			`doc = t[1]`
			`doc = doc.lower().split(' ')`
Change to tfidf 2021-04-19 19:44:17 +02:00			`# doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]`
no message 2021-04-19 19:17:10 +02:00			`doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))`
Change to tfidf 2021-04-19 19:44:17 +02:00			`doc = ' '.join(doc)`
no message 2021-04-19 19:17:10 +02:00			`docs_preprocessed.append(doc)`
			`y = [int(numeric_string) for numeric_string in y]`
			`global d2v_model`
Change to tfidf 2021-04-19 19:44:17 +02:00			`d2v_model = TfidfVectorizer()`
			`X = d2v_model.fit_transform(docs_preprocessed)`
			`# X = scaler.fit_transform(X)`
no message 2021-04-19 19:17:10 +02:00			`classifier.fit(X, y)`

			`def classify(path):`
			`with open(path + 'in.tsv') as f:`
			`docs = [line.rstrip() for line in f]`
			`docs_preprocessed = []`
			`for doc in docs:`
			`doc = doc.lower().split(' ')`
Change to tfidf 2021-04-19 19:44:17 +02:00			`# doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]`
no message 2021-04-19 19:17:10 +02:00			`doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))`
			`docs_preprocessed.append(doc)`
Change to tfidf 2021-04-19 19:44:17 +02:00			`test_vectors = d2v_model.transform(docs)`
no message 2021-04-19 19:17:10 +02:00			`results = classifier.predict(test_vectors)`
			`with open(path + 'out.tsv', 'w') as file:`
			`for result in results:`
			`file.write("%i\n" % result)`


			`train()`
			`classify('dev-0/')`
			`# classify('test-A/', n_clusters=10)`