Refactor
parent 918dda14d2
commit 587bc42691
@@ -1,11 +1,5 @@
-import string
-from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.cluster import KMeans
 from sklearn.naive_bayes import MultinomialNB
-from sklearn.preprocessing import MinMaxScaler, normalize
-
-scaler = MinMaxScaler()
 
 stopwords = []
 # stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
@@ -13,6 +7,15 @@ with open('stopwords') as f:
     stopwords = [line.rstrip() for line in f]
 
 classifier = MultinomialNB()
+vectorizer = TfidfVectorizer()
+
+
+def preprocess(doc):
+    doc = doc.lower().split(' ')
+    doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
+    doc = ' '.join(doc)
+    return doc
+
 
 def train():
     with open('train/train.tsv') as f:
@@ -20,37 +23,28 @@ def train():
         docs_preprocessed = []
         y = []
         for doc in docs:
-            t = doc.split('\t')
-            y.append(t[0])
-            doc = t[1]
-            doc = doc.lower().split(' ')
-            # doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]
-            doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
-            doc = ' '.join(doc)
-            docs_preprocessed.append(doc)
-        y = [int(numeric_string) for numeric_string in y]
-        global d2v_model
-        d2v_model = TfidfVectorizer()
-        X = d2v_model.fit_transform(docs_preprocessed)
-        # X = scaler.fit_transform(X)
-        classifier.fit(X, y)
+            y_with_doc = doc.split('\t')
+            y.append(y_with_doc[0])
+            doc = y_with_doc[1]
+            docs_preprocessed.append(preprocess(doc))
+        y = [int(value) for value in y]
+        x = vectorizer.fit_transform(docs_preprocessed)
+        classifier.fit(x, y)
 
+
 def classify(path):
     with open(path + 'in.tsv') as f:
         docs = [line.rstrip() for line in f]
         docs_preprocessed = []
         for doc in docs:
-            doc = doc.lower().split(' ')
-            # doc = [''.join(char for char in word if char not in string.punctuation) for word in doc]
-            doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
-            docs_preprocessed.append(doc)
-        test_vectors = d2v_model.transform(docs)
-        results = classifier.predict(test_vectors)
+            docs_preprocessed.append(preprocess(doc))
+        test_x = vectorizer.transform(docs)
+        predictions = classifier.predict(test_x)
     with open(path + 'out.tsv', 'w') as file:
-        for result in results:
-            file.write("%i\n" % result)
+        for prediction in predictions:
+            file.write("%i\n" % prediction)
 
 
 train()
 classify('dev-0/')
-# classify('test-A/', n_clusters=10)
+classify('test-A/')
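Pieced together, the new side of the diff leaves the script roughly as the sketch below. Two caveats: the `docs = [line.rstrip() for line in f]` line inside train() falls between the hunks shown and is assumed by analogy with classify(), and the committed classify() transforms the raw docs while leaving docs_preprocessed unused, so the sketch feeds the preprocessed documents to the vectorizer to keep test-time input consistent with training.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

stopwords = []
# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
with open('stopwords') as f:
    stopwords = [line.rstrip() for line in f]

classifier = MultinomialNB()
vectorizer = TfidfVectorizer()


def preprocess(doc):
    # lower-case, split on spaces, drop stopwords and empty tokens, re-join
    doc = doc.lower().split(' ')
    doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
    return ' '.join(doc)


def train():
    with open('train/train.tsv') as f:
        docs = [line.rstrip() for line in f]  # assumed: this line is not shown in the hunks
        docs_preprocessed = []
        y = []
        for doc in docs:
            y_with_doc = doc.split('\t')  # train.tsv rows: label <TAB> document
            y.append(y_with_doc[0])
            docs_preprocessed.append(preprocess(y_with_doc[1]))
        y = [int(value) for value in y]
        x = vectorizer.fit_transform(docs_preprocessed)
        classifier.fit(x, y)


def classify(path):
    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]
        docs_preprocessed = [preprocess(doc) for doc in docs]
        # the commit passes the raw docs here; using the preprocessed ones
        # matches what the classifier saw during training
        test_x = vectorizer.transform(docs_preprocessed)
        predictions = classifier.predict(test_x)
    with open(path + 'out.tsv', 'w') as file:
        for prediction in predictions:
            file.write("%i\n" % prediction)


train()
classify('dev-0/')
classify('test-A/')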
test-A/out.tsv (new file, 5447 lines) - file diff suppressed because it is too large
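If the corpus follows the usual Gonito-style layout, where dev-0/expected.tsv holds the gold labels (an assumption; that file is not part of this commit), the dev predictions written by classify('dev-0/') can be sanity-checked with something like:

# assumed path: dev-0/expected.tsv is not included in this commit
with open('dev-0/expected.tsv') as f:
    expected = [int(line.rstrip()) for line in f]
with open('dev-0/out.tsv') as f:
    predicted = [int(line.rstrip()) for line in f]

correct = sum(e == p for e, p in zip(expected, predicted))
print('dev-0 accuracy: %.4f' % (correct / len(expected)))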