Delete trash
This commit is contained in:
parent
d9d70b1335
commit
048e0fb186
BIN
._main.ipynb
BIN
._main.ipynb
Binary file not shown.
152
main.ipynb
152
main.ipynb
File diff suppressed because one or more lines are too long
48
run.py
48
run.py
@ -1,48 +0,0 @@
|
|||||||
import lzma
|
|
||||||
# import re
|
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
|
||||||
import csv
|
|
||||||
from sklearn.naive_bayes import GaussianNB
|
|
||||||
|
|
||||||
# def get_str_cleaned(str_dirty):
|
|
||||||
# punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'
|
|
||||||
# new_str = str_dirty.lower()
|
|
||||||
# new_str = re.sub(' +', ' ', new_str)
|
|
||||||
# for char in punctuation:
|
|
||||||
# new_str = new_str.replace(char, '')
|
|
||||||
# new_str = new_str.replace('\n', '')
|
|
||||||
# return new_str
|
|
||||||
|
|
||||||
# with open('train/expected.tsv') as f:
|
|
||||||
# trainY = list(csv.reader(f))
|
|
||||||
|
|
||||||
# Train a bag-of-words Gaussian naive Bayes classifier on the documents in
# train/ and print its predicted labels for the documents in dev-0/.
trainX = []
trainY = []
testX = []
testY = []  # never filled by this script; kept so the module-level name still exists

# One document per line. newline='\n' splits on '\n' only (exactly like the
# binary-mode iteration it replaces), so a stray '\r' inside a document cannot
# shift the rows out of step with the labels read below.
with lzma.open('train/in.tsv.xz', 'rt', encoding='utf-8', newline='\n') as f:
    trainX.extend(f)

with open('train/expected.tsv', encoding='utf-8') as f:
    for line in f:
        # Strip the line terminator so a class label is "1" rather than "1\n".
        trainY.append(line.rstrip('\r\n'))

vectorizer = CountVectorizer()
# CountVectorizer returns a scipy sparse matrix, but GaussianNB only accepts
# dense input, so densify before fitting.
# NOTE(review): .toarray() needs n_documents * vocabulary_size cells of
# memory; confirm that this fits for the real corpus.
trainX = vectorizer.fit_transform(trainX).toarray()

model = GaussianNB()
model.fit(trainX, trainY)

# This file is plain text: open it in text mode and use the lines directly
# (str has no .decode()).
with open('dev-0/in.tsv', encoding='utf-8', newline='\n') as f:
    testX.extend(f)

# Project the dev documents into the vocabulary learned from the training
# set; the model cannot consume raw strings.
predictedY = model.predict(vectorizer.transform(testX).toarray())
print(predictedY)
|
|
||||||
|
|
||||||
# with open('dev-0/expected.tsv') as f:
|
|
||||||
# expectedY = list(csv.reader(f))
|
|
Loading…
Reference in New Issue
Block a user