Update run.py
This commit is contained in:
parent
5621e4ce9a
commit
61a9a4632a
BIN
._main.ipynb
Executable file
BIN
._main.ipynb
Executable file
Binary file not shown.
48
run.py
48
run.py
@ -0,0 +1,48 @@
|
|||||||
|
import lzma
|
||||||
|
# import re
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
import csv
|
||||||
|
from sklearn.naive_bayes import GaussianNB
|
||||||
|
|
||||||
|
# def get_str_cleaned(str_dirty):
|
||||||
|
# punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'
|
||||||
|
# new_str = str_dirty.lower()
|
||||||
|
# new_str = re.sub(' +', ' ', new_str)
|
||||||
|
# for char in punctuation:
|
||||||
|
# new_str = new_str.replace(char, '')
|
||||||
|
# new_str = new_str.replace('\n', '')
|
||||||
|
# return new_str
|
||||||
|
|
||||||
|
# with open('train/expected.tsv') as f:
|
||||||
|
# trainY = list(csv.reader(f))
|
||||||
|
|
||||||
|
trainX = []
|
||||||
|
trainY = []
|
||||||
|
testX = []
|
||||||
|
testY = []
|
||||||
|
|
||||||
|
with lzma.open('train/in.tsv.xz') as f:
|
||||||
|
for line in f:
|
||||||
|
# X_train.append(get_str_cleaned(line.decode('utf-8')))
|
||||||
|
trainX.append(line.decode('utf-8'))
|
||||||
|
|
||||||
|
with open('train/expected.tsv') as f:
|
||||||
|
for line in f:
|
||||||
|
trainY.append(line)
|
||||||
|
|
||||||
|
vectorizer = CountVectorizer()
|
||||||
|
trainX = vectorizer.fit_transform(trainX)
|
||||||
|
|
||||||
|
model = GaussianNB()
|
||||||
|
model.fit(trainX, trainY)
|
||||||
|
|
||||||
|
with open('dev-0/in.tsv') as f:
|
||||||
|
for line in f:
|
||||||
|
testX.append(line.decode('utf-8'))
|
||||||
|
# testX = list(csv.reader(f))
|
||||||
|
|
||||||
|
predictedY = model.predict(testX)
|
||||||
|
print(predictedY)
|
||||||
|
|
||||||
|
# with open('dev-0/expected.tsv') as f:
|
||||||
|
# expectedY = list(csv.reader(f))
|
Loading…
Reference in New Issue
Block a user