Fix preprocessing

kuba 2021-04-28 20:32:51 +02:00
parent 6a796f68ee
commit e328161046
5 changed files with 20020 additions and 20027 deletions

.gitignore

@@ -1,2 +1,4 @@
 .idea
 train.tsv
+*.sav
+*.pickle

File diff suppressed because it is too large.

geval (executable file)

Binary file not shown.

Binary file not shown.


@@ -2,22 +2,12 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LinearRegression
 import pickle

-stopwords = []
-# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
-with open('stopwords.txt') as f:
-    stopwords = [line.rstrip() for line in f]

 filename = 'regressor.sav'
 vec_file = 'vectorizer.pickle'

 regressor = LinearRegression()
 # regressor = pickle.load(open(filename, 'rb'))
 vectorizer = TfidfVectorizer()

-def preprocess(doc):
-    doc = doc.lower().split(' ')
-    doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
-    doc = ' '.join(doc)
-    return doc

 # vectorizer = pickle.load(open(vec_file, 'rb'))

 def train():
@@ -25,32 +15,32 @@ def train():
         docs = [line.rstrip() for line in f]
     docs_preprocessed = []
     y = []
-    for doc in docs[:1000]:
+    for doc in docs:
         row = doc.split('\t')
-        start = row[0]
-        end = row[1]
-        end = end.split(' ')
-        if len(end) > 1:
-            row.insert(4, end[1])
-        end = end[0]
-        rest = row[4:]
-        preprocessed = rest[0]
-        docs_preprocessed.append(preprocessed)
-        docs_preprocessed.append(preprocessed)
-        y.append(start)
-        y.append(end)
+        start_date = row[0]
+        end_date = row[1]
+        end_date = end_date.split(' ')
+        if len(end_date) > 1:
+            row.insert(4, end_date[1])
+        end_date = end_date[0]
+        doc = row[4:5][0]
+        docs_preprocessed.append(doc)
+        y.append((float(start_date) + float(end_date))/2)
     y = [float(value) for value in y]

     print('Fitting vectorizer...')
     x = vectorizer.fit_transform(docs_preprocessed)
     pickle.dump(vectorizer, open(vec_file, 'wb'))
     print('DONE!')

     print('Fitting regressor...')
     regressor.fit(x, y)
     pickle.dump(regressor, open(filename, 'wb'))
     print('DONE!')

 def classify(path):
     print("Predicting for", path)
     with open(path + 'in.tsv') as f:
         docs = [line.rstrip() for line in f]
     docs_preprocessed = []
     for doc in docs:
         docs_preprocessed.append(preprocess(doc))
     test_x = vectorizer.transform(docs)
     predictions = regressor.predict(test_x)
     with open(path + 'out.tsv', 'w') as file:
@@ -60,5 +50,6 @@ def classify(path):

 train()
 classify('dev-0/')
 # classify('dev-1/')
 # classify('test-A/')
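
Note on the train() change above: the old loop truncated the data to the first 1000 rows, appended each document twice and used the raw start and end years as two separate regression targets; the new loop reads every row once and uses the midpoint of the start/end year range as a single target. The sketch below restates that label-building step on its own. It is a minimal illustration, not code from this repository: the path 'train.tsv', the column positions and the helper name load_training_data are assumptions, and the row.insert handling for a space inside the second field is left out.

def load_training_data(path='train.tsv'):
    # Illustrative only; assumes the start year sits in column 1, the end year
    # in column 2 and the document text in column 5, as suggested by the diff.
    docs, y = [], []
    with open(path) as f:
        for line in f:
            row = line.rstrip('\n').split('\t')
            start_year = float(row[0])
            end_year = float(row[1].split(' ')[0])  # keep only the leading year token
            docs.append(row[4])
            y.append((start_year + end_year) / 2)   # midpoint target, as in the new code
    return docs, y

Built this way, docs and y feed directly into vectorizer.fit_transform(docs) and regressor.fit(x, y) as in the diff.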