2022-04-25 01:30:11 +02:00
|
|
|
import lzma
|
|
|
|
# import re
|
|
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
|
import csv
|
|
|
|
from sklearn.naive_bayes import GaussianNB
|
|
|
|
|
|
|
|
# def get_str_cleaned(str_dirty):
|
|
|
|
# punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'
|
|
|
|
# new_str = str_dirty.lower()
|
|
|
|
# new_str = re.sub(' +', ' ', new_str)
|
|
|
|
# for char in punctuation:
|
|
|
|
# new_str = new_str.replace(char, '')
|
|
|
|
# new_str = new_str.replace('\n', '')
|
|
|
|
# return new_str
|
|
|
|
|
|
|
|
# with open('train/expected.tsv') as f:
|
|
|
|
# trainY = list(csv.reader(f))
|
|
|
|
|
|
|
|
# Containers filled further down the script: training documents and labels,
# plus the dev-set documents and (currently unused here) dev-set labels.
# Tuple unpacking still creates four independent list objects.
trainX, trainY, testX, testY = [], [], [], []
|
|
|
|
|
|
|
|
# Read the xz-compressed training documents, one per line.  lzma.open()
# defaults to binary mode, so each line arrives as bytes and is decoded from
# UTF-8 before being stored.
with lzma.open('train/in.tsv.xz') as train_file:
    trainX.extend(raw_line.decode('utf-8') for raw_line in train_file)
|
|
|
|
|
|
|
|
# Read the training labels, one per line.  Iterating over a text file keeps
# the line terminator on every line, so strip it: otherwise each class label
# (and therefore every prediction) would look like "label\n" instead of
# "label".
with open('train/expected.tsv', encoding='utf-8') as labels_file:
    trainY.extend(line.rstrip('\n') for line in labels_file)
|
|
|
|
|
|
|
|
# Bag-of-words features: learn the vocabulary from the training documents and
# convert them into a document-term count matrix (a SciPy sparse matrix).
vectorizer = CountVectorizer()
trainX = vectorizer.fit_transform(trainX)

# GaussianNB only accepts dense input and raises a TypeError when handed the
# sparse matrix produced by CountVectorizer, so densify at the fit call.
# NOTE: the dense array has one column per vocabulary term and can be large.
model = GaussianNB()
model.fit(trainX.toarray(), trainY)
|
|
|
|
|
|
|
|
# Read the dev-set documents, one per line.  The file is opened in text mode,
# so every line is already a str and must not be .decode()d (str has no such
# method and the call would raise AttributeError).
with open('dev-0/in.tsv', encoding='utf-8') as dev_file:
    testX.extend(dev_file)

# Map the raw text into the feature space learned during training: use
# transform (not fit_transform) so the vocabulary stays fixed, and densify
# because GaussianNB does not accept sparse input.
predictedY = model.predict(vectorizer.transform(testX).toarray())
print(predictedY)
|
|
|
|
|
|
|
|
# with open('dev-0/expected.tsv') as f:
|
|
|
|
# expectedY = list(csv.reader(f))
|