forked from kubapok/retroc2
Fix preprocessing
This commit is contained in:
parent
6a796f68ee
commit
e328161046
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,2 +1,4 @@
|
||||
.idea
|
||||
train.tsv
|
||||
*.sav
|
||||
*.pickle
|
||||
|
39998
dev-0/out.tsv
39998
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
BIN
regressor.sav
BIN
regressor.sav
Binary file not shown.
47
solution.py
47
solution.py
@ -2,22 +2,12 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.linear_model import LinearRegression
|
||||
import pickle
|
||||
|
||||
stopwords = []
|
||||
# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
|
||||
with open('stopwords.txt') as f:
|
||||
stopwords = [line.rstrip() for line in f]
|
||||
|
||||
filename = 'regressor.sav'
|
||||
vec_file = 'vectorizer.pickle'
|
||||
regressor = LinearRegression()
|
||||
# regressor = pickle.load(open(filename, 'rb'))
|
||||
vectorizer = TfidfVectorizer()
|
||||
|
||||
|
||||
def preprocess(doc):
|
||||
doc = doc.lower().split(' ')
|
||||
doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
|
||||
doc = ' '.join(doc)
|
||||
return doc
|
||||
# vectorizer = pickle.load(open(vec_file, 'rb'))
|
||||
|
||||
|
||||
def train():
|
||||
@ -25,32 +15,32 @@ def train():
|
||||
docs = [line.rstrip() for line in f]
|
||||
docs_preprocessed = []
|
||||
y = []
|
||||
for doc in docs[:1000]:
|
||||
for doc in docs:
|
||||
row = doc.split('\t')
|
||||
start = row[0]
|
||||
end = row[1]
|
||||
end = end.split(' ')
|
||||
if len(end) > 1:
|
||||
row.insert(4, end[1])
|
||||
end = end[0]
|
||||
rest = row[4:]
|
||||
preprocessed = rest[0]
|
||||
docs_preprocessed.append(preprocessed)
|
||||
docs_preprocessed.append(preprocessed)
|
||||
y.append(start)
|
||||
y.append(end)
|
||||
start_date = row[0]
|
||||
end_date = row[1]
|
||||
end_date = end_date.split(' ')
|
||||
if len(end_date) > 1:
|
||||
row.insert(4, end_date[1])
|
||||
end_date = end_date[0]
|
||||
doc = row[4:5][0]
|
||||
docs_preprocessed.append(doc)
|
||||
y.append((float(start_date) + float(end_date))/2)
|
||||
y = [float(value) for value in y]
|
||||
print('Fitting vectorizer...')
|
||||
x = vectorizer.fit_transform(docs_preprocessed)
|
||||
pickle.dump(vectorizer, open(vec_file, 'wb'))
|
||||
print('DONE!')
|
||||
print('Fitting regressor...')
|
||||
regressor.fit(x, y)
|
||||
pickle.dump(regressor, open(filename, 'wb'))
|
||||
print('DONE!')
|
||||
|
||||
|
||||
def classify(path):
|
||||
print("Predicting for", path)
|
||||
with open(path + 'in.tsv') as f:
|
||||
docs = [line.rstrip() for line in f]
|
||||
docs_preprocessed = []
|
||||
for doc in docs:
|
||||
docs_preprocessed.append(preprocess(doc))
|
||||
test_x = vectorizer.transform(docs)
|
||||
predictions = regressor.predict(test_x)
|
||||
with open(path + 'out.tsv', 'w') as file:
|
||||
@ -60,5 +50,6 @@ def classify(path):
|
||||
|
||||
train()
|
||||
classify('dev-0/')
|
||||
# classify('dev-1/')
|
||||
# classify('test-A/')
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user