Compare commits
2 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
0238b238d2 | ||
|
d2d308a78a |
5272
dev-0/out.tsv
Normal file
5272
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
54
naivebayes.py
Normal file
54
naivebayes.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
from sklearn.naive_bayes import GaussianNB, MultinomialNB
|
||||||
|
from sklearn.pipeline import make_pipeline, Pipeline
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE: this makes the score worse (from 0.73 down to 0.7)
|
||||||
|
def preprocess(line, stop_words):
    """Drop every whitespace-separated token of ``line`` found in ``stop_words``.

    Tokens are compared exactly as written (the check is case-sensitive), and
    the surviving tokens are re-joined with single spaces.

    Args:
        line: Text to filter.
        stop_words: Container supporting ``in`` with the tokens to remove.

    Returns:
        The filtered text as a single string.
    """
    kept = []
    for token in line.split():
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)
|
||||||
|
|
||||||
|
|
||||||
|
def train_model(train_in, train_expected):
    """Fit a TF-IDF + multinomial naive Bayes pipeline on line-aligned files.

    Args:
        train_in: Path to a UTF-8 text file with one training document per
            line.
        train_expected: Path to a UTF-8 text file with one label per line, in
            the same order as ``train_in``.

    Returns:
        The fitted ``Pipeline``. It is trained on the integer codes produced
        by ``LabelEncoder``, so its ``predict`` returns those codes rather
        than the original label strings.

    Raises:
        ValueError: If the two files contain a different number of lines.
    """
    with open(train_expected, 'r', encoding='utf-8') as f:
        # Strip line terminators so a final line without a trailing newline
        # is not encoded as a class of its own ("1" vs "1\n").
        exp = [label.rstrip('\r\n') for label in f]

    with open(train_in, 'r', encoding='utf-8') as f:
        train_data = f.readlines()

    # Fail early with a clear message instead of deep inside the estimator.
    if len(train_data) != len(exp):
        raise ValueError(
            "%s has %d lines but %s has %d labels"
            % (train_in, len(train_data), train_expected, len(exp))
        )

    exp_encoded = LabelEncoder().fit_transform(exp)

    # The vectorizer lives inside the pipeline so the sparse TF-IDF matrix is
    # handed straight to the classifier. An earlier manual attempt that
    # reshaped the matrix to (-1, 1) was noted as ending in a MemoryError.
    pipeline = Pipeline(steps=[
        ('tfidf', TfidfVectorizer()),
        ('naive-bayes', MultinomialNB()),
    ])

    return pipeline.fit(train_data, exp_encoded)
|
||||||
|
|
||||||
|
|
||||||
|
def predict(model, in_file, out_file):
    """Predict one value per line of ``in_file`` and write them to ``out_file``.

    Args:
        model: Object exposing ``predict(list_of_str)``, such as the pipeline
            returned by ``train_model``.
        in_file: Path to a UTF-8 text file with one document per line.
        out_file: Destination path; one prediction per line is written,
            formatted as an integer (``%d``).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(in_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    prediction = model.predict(lines)
    np.savetxt(out_file, prediction, fmt='%d')
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Train on the ``train`` split, then write predictions for both eval splits."""
    # Stop-word filtering through preprocess() is deliberately left out here;
    # the note next to that helper records that it lowered the score.
    classifier = train_model("train/in.tsv", "train/expected.tsv")

    io_pairs = (
        ("dev-0/in.tsv", "dev-0/out.tsv"),
        ("test-A/in.tsv", "test-A/out.tsv"),
    )
    for source_path, target_path in io_pairs:
        predict(classifier, source_path, target_path)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the train/predict workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
5152
test-A/out.tsv
Normal file
5152
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user