diff --git a/._main.ipynb b/._main.ipynb new file mode 100755 index 0000000..ac1fd30 Binary files /dev/null and b/._main.ipynb differ diff --git a/run.py b/run.py index e69de29..8f06fa1 100755 --- a/run.py +++ b/run.py @@ -0,0 +1,48 @@ +import lzma +# import re +from sklearn.feature_extraction.text import CountVectorizer +import csv +from sklearn.naive_bayes import GaussianNB + +# def get_str_cleaned(str_dirty): +# punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~' +# new_str = str_dirty.lower() +# new_str = re.sub(' +', ' ', new_str) +# for char in punctuation: +# new_str = new_str.replace(char, '') +# new_str = new_str.replace('\n', '') +# return new_str + +# with open('train/expected.tsv') as f: +# trainY = list(csv.reader(f)) + +trainX = [] +trainY = [] +testX = [] +testY = [] + +with lzma.open('train/in.tsv.xz') as f: + for line in f: + # X_train.append(get_str_cleaned(line.decode('utf-8'))) + trainX.append(line.decode('utf-8')) + +with open('train/expected.tsv') as f: + for line in f: + trainY.append(line) + +vectorizer = CountVectorizer() +trainX = vectorizer.fit_transform(trainX) + +model = GaussianNB() +model.fit(trainX, trainY) + +with open('dev-0/in.tsv') as f: + for line in f: + testX.append(line.decode('utf-8')) + # testX = list(csv.reader(f)) + +predictedY = model.predict(testX) +print(predictedY) + +# with open('dev-0/expected.tsv') as f: +# expectedY = list(csv.reader(f)) \ No newline at end of file