paranormal-or-skeptic-ISI-p.../naivebayes.py
Aleksy Wroblewski 0238b238d2 Run autopep8
2021-04-22 22:03:47 +02:00

55 lines
1.4 KiB
Python

import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
# Worsens the score from 0.73 to 0.7 (presumably refers to the stop-word filtering below; confirm)
def preprocess(line, stop_words):
    """Return *line* with every token contained in *stop_words* dropped.

    The line is split on arbitrary whitespace, so leading/trailing
    whitespace (including the newline) disappears and the surviving tokens
    are re-joined with single spaces.  Matching is exact and therefore
    case-sensitive.

    Args:
        line: Text to filter.
        stop_words: Container supporting ``in`` that holds the tokens to
            remove.

    Returns:
        The filtered text as a single space-separated string.
    """
    kept = []
    for token in line.split():
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)
def _read_lines(path):
    """Return the lines of the UTF-8 text file at *path*, line endings kept."""
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()


def train_model(train_in, train_expected):
    """Fit a TF-IDF + multinomial naive Bayes pipeline on a labelled corpus.

    Args:
        train_in: Path to a text file holding one training document per line.
        train_expected: Path to a text file holding one label per line,
            aligned line-by-line with ``train_in``.

    Returns:
        The fitted ``Pipeline`` (``TfidfVectorizer`` followed by
        ``MultinomialNB``).  The ``LabelEncoder`` used for the targets is not
        returned, so the model predicts the encoder's integer class indices
        rather than the original label strings.

    Raises:
        ValueError: If the two files contain a different number of lines.
    """
    # Drop the line terminator so a final line without a trailing newline is
    # not treated as a separate class from the same label followed by "\n".
    labels = [line.rstrip('\r\n') for line in _read_lines(train_expected)]
    train_data = _read_lines(train_in)

    # Fail before the (expensive) vectorisation instead of after it.
    if len(labels) != len(train_data):
        raise ValueError(
            "%s has %d lines but %s has %d labels"
            % (train_in, len(train_data), train_expected, len(labels))
        )

    exp_encoded = LabelEncoder().fit_transform(labels)

    # NOTE: an earlier variant vectorised manually, reshaped the matrix and
    # fitted MultinomialNB directly; it was abandoned with a MemoryError.
    pipeline = Pipeline(steps=[
        ('tfidf', TfidfVectorizer()),
        ('naive-bayes', MultinomialNB()),
    ])
    return pipeline.fit(train_data, exp_encoded)
def predict(model, in_file, out_file):
    """Classify every line of *in_file* and write the results to *out_file*.

    Args:
        model: Object exposing ``predict(lines)`` that accepts a list of
            strings and returns one numeric prediction per line.
        in_file: Path to a text file with one document per line.
        out_file: Path of the file to write; one prediction per line.
    """
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(in_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    prediction = model.predict(lines)
    # '%d' emits each prediction as a plain integer, one per row.
    np.savetxt(out_file, prediction, fmt='%d')
def main():
    """Train on the ``train`` split and write predictions for both eval splits."""
    # Stop-word removal (NLTK English list) is currently switched off:
    # stop_words = set(stopwords.words('english'))
    classifier = train_model("train/in.tsv", "train/expected.tsv")

    # (input, output) path pairs, processed in this order.
    targets = (
        ("dev-0/in.tsv", "dev-0/out.tsv"),
        ("test-A/in.tsv", "test-A/out.tsv"),
    )
    for source_path, output_path in targets:
        predict(classifier, source_path, output_path)


# Run the full train/predict cycle only when executed as a script.
if __name__ == '__main__':
    main()