forked from kubapok/paranormal-or-skeptic-ISI-public
170 lines
2.1 KiB
Python
170 lines
2.1 KiB
Python
#!/usr/bin/env python
|
|
# coding: utf-8
|
|
|
|
# In[1]:
|
|
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import patoolib
|
|
import os
|
|
import patoolib
|
|
from sklearn.preprocessing import LabelEncoder
|
|
from sklearn.naive_bayes import GaussianNB, MultinomialNB
|
|
from sklearn.pipeline import Pipeline
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
|
|
# ## TRENING
|
|
|
|
# #### ROZPAKOWANIE I WCZYTANIE
|
|
|
|
# In[2]:
|
|
|
|
|
|
# Read the training labels. Iterating the file yields one entry per line,
# and each entry keeps its trailing newline exactly as stored in the file.
# The `with` block guarantees the handle is closed even if reading fails.
with open('train/expected.tsv', 'r', encoding="utf-8") as EXPECTED_FILE:
    EXPECTED = list(EXPECTED_FILE)

# Unpack the compressed training documents next to the archive.
patoolib.extract_archive("train/in.tsv.xz", outdir="train/")

# Read the training documents, one document per line (newline retained).
with open('train/in.tsv', 'r', encoding="utf-8") as TRAIN:
    TRAIN_DATA = list(TRAIN)

# Removal of the unpacked file is currently disabled.
#os.remove("train/in.tsv")
|
|
|
|
|
|
# #### MODEL TRENINGOWY
|
|
|
|
# In[6]:
|
|
|
|
|
|
# Map the raw label lines to integer class ids. Only the encoded array is
# kept; the fitted LabelEncoder instance itself is discarded.
EXPECTED_ENCODER = LabelEncoder().fit_transform(EXPECTED)

# Two-stage text classifier: TF-IDF features fed into multinomial Naive Bayes.
PIPE = Pipeline(
    steps=[
        ("TF-IDF", TfidfVectorizer()),
        ("BAYES", MultinomialNB()),
    ]
)

# Fit the whole pipeline on the training documents and their encoded labels.
TRAIN_MODEL = PIPE.fit(TRAIN_DATA, EXPECTED_ENCODER)
|
|
|
|
|
|
# ## FUNKCJE
|
|
|
|
# In[9]:
|
|
|
|
|
|
def BayesFit(MODEL, DOC):
    """Return the predictions of MODEL for DOC.

    Despite the name, nothing is fitted here: the call is forwarded to
    ``MODEL.predict`` and its result is handed back unchanged.

    Args:
        MODEL: Any object exposing a ``predict`` method.
        DOC: The input passed straight through to ``MODEL.predict``.

    Returns:
        Whatever ``MODEL.predict(DOC)`` returns.
    """
    return MODEL.predict(DOC)
|
|
|
|
|
|
# ## PLIK DEV-0
|
|
|
|
# In[10]:
|
|
|
|
|
|
# Unpack the dev-0 inputs next to the archive.
patoolib.extract_archive("dev-0/in.tsv.xz", outdir="dev-0/")

# Read every document (one per line); the handle is closed by the `with`
# block even if reading raises.
with open('dev-0/in.tsv', 'r', encoding="utf-8") as INFILE:
    ALL_DOC = INFILE.readlines()

# Classify all documents with the trained pipeline.
RESULT = BayesFit(TRAIN_MODEL, ALL_DOC)

# Open the output only once predictions exist, so a failed prediction does
# not leave behind a truncated out.tsv. One prediction is written per line.
with open("dev-0/out.tsv", "w") as OUTFILE:
    OUTFILE.writelines(str(x) + '\n' for x in RESULT)

# Removal of the unpacked file is currently disabled.
#os.remove("dev-0/in.tsv")
|
|
|
|
|
|
# ## PLIK TEST-A
|
|
|
|
# In[15]:
|
|
|
|
|
|
# Unpack the test-A inputs next to the archive.
patoolib.extract_archive("test-A/in.tsv.xz", outdir="test-A/")

# Read every document (one per line); the handle is closed by the `with`
# block even if reading raises.
with open('test-A/in.tsv', 'r', encoding="utf-8") as INFILE:
    ALL_DOC = INFILE.readlines()

# Classify all documents with the trained pipeline.
RESULT = BayesFit(TRAIN_MODEL, ALL_DOC)

# Open the output only once predictions exist, so a failed prediction does
# not leave behind a truncated out.tsv. One prediction is written per line.
with open("test-A/out.tsv", "w") as OUTFILE:
    OUTFILE.writelines(str(x) + '\n' for x in RESULT)

# Removal of the unpacked file is currently disabled.
#os.remove("test-A/in.tsv")
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|