64 lines
1.8 KiB
Python
64 lines
1.8 KiB
Python
import csv
|
|
import pandas as pd
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.naive_bayes import MultinomialNB
|
|
from stop_words import get_stop_words
|
|
|
|
|
|
def to_n(word, n):
    """Return ``word`` limited to its first ``n`` characters.

    A word that is not longer than ``n`` is handed back unchanged; anything
    longer is cut with the slice ``word[:n]``.
    """
    needs_cut = len(word) >= n + 1
    return word[:n] if needs_cut else word
|
|
|
|
|
|
def stem(sentence, prefix_len=7):
    """Crudely "stem" a sentence by truncating each of its words.

    The sentence is split on whitespace, every word is shortened with
    ``to_n`` and the results are joined back with single spaces (so runs of
    whitespace in the input collapse to one space).

    Args:
        sentence: Text to process.
        prefix_len: Maximum number of leading characters kept per word.
            Defaults to 7, the value this function always used before the
            parameter existed.

    Returns:
        The space-joined sequence of truncated words.
    """
    return ' '.join(to_n(word, prefix_len) for word in sentence.split())
|
|
|
|
|
|
# Characters deleted by remove_specials(): punctuation/symbols plus every
# ASCII digit (the digit list previously skipped '8').
_SPECIAL_CHARS = '.,<>)(*&^%$#@~;:!?-_=+/\\\'\"|{}[]0123456789'
# Translation table mapping each of those characters to "delete".
_SPECIALS_TABLE = str.maketrans('', '', _SPECIAL_CHARS)


def remove_specials(text):
    """Return ``text`` with punctuation, symbols and ASCII digits removed.

    Every character listed in ``_SPECIAL_CHARS`` is deleted (not replaced by
    a space); all other characters, including whitespace and non-ASCII
    letters, are kept as they are.

    Args:
        text: String to clean.

    Returns:
        The cleaned string.
    """
    # One pass over the string instead of one ``replace`` call per character.
    return text.translate(_SPECIALS_TABLE)
|
|
|
|
|
|
# Load the gzipped training TSV; its two columns are named 'label' and 'text'.
df = pd.read_csv(
    'train/train.tsv.gz',
    sep='\t',
    compression='gzip',
    names=['label', 'text'],
)

# Normalise every document: lowercase -> drop special characters -> truncate words.
df['text'] = df['text'].map(lambda doc: stem(remove_specials(doc.lower())))

# Detach the target column from the frame; only 'text' is left in df afterwards.
labels = df.pop('label')

# TF-IDF features built with the Polish stop-word list from the stop_words package.
vectorizer = TfidfVectorizer(stop_words=get_stop_words('polish'))
x = vectorizer.fit_transform(df['text'])

# Fit the Naive Bayes classifier on the TF-IDF matrix.
bayes = MultinomialNB().fit(x, labels)
|
|
# ----------------------------------------------------------------------------------------------------------------------
|
|
# Evaluate on the dev-0 split: predict, save the predictions, print the score.
t_df = pd.read_csv('dev-0/in.tsv', sep='\t', names=['text'])
# NOTE: the single column here is named 'text' but holds the expected labels.
tlabs = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['text'])

# Apply the same preprocessing that was applied to the training documents.
t_df['text'] = [stem(remove_specials(x.lower())) for x in t_df['text']]

# Reuse the fitted vectorizer (transform only, no refitting).
vecs = vectorizer.transform(t_df['text'])

predict = bayes.predict(vecs)

# Write one prediction per line, each terminated by a plain '\n'.  Going
# through csv.writer with a newline delimiter ended the file with the csv
# default '\r\n' terminator, i.e. a stray carriage return on the last line.
with open('dev-0/out.tsv', 'w') as f:
    f.writelines('{}\n'.format(label) for label in predict)

# MultinomialNB.score reports mean accuracy against the expected labels.
score = bayes.score(vecs, tlabs)
print(score)
|
|
# ----------------------------------------------------------------------------------------------------------------------
|
|
|
|
# Produce predictions for the test-A split.
t_df = pd.read_csv('test-A/in.tsv', sep='\t', names=['text'])

# Apply the same preprocessing that was applied to the training documents.
t_df['text'] = [stem(remove_specials(x.lower())) for x in t_df['text']]

# Reuse the fitted vectorizer (transform only, no refitting).
vecs = vectorizer.transform(t_df['text'])

predict = bayes.predict(vecs)

# Write one prediction per line, each terminated by a plain '\n'.  Going
# through csv.writer with a newline delimiter ended the file with the csv
# default '\r\n' terminator, i.e. a stray carriage return on the last line.
with open('test-A/out.tsv', 'w') as f:
    f.writelines('{}\n'.format(label) for label in predict)
|