add bayes
This commit is contained in:
parent
9cb2fb2612
commit
4ea8113b15
6
.ipynb_checkpoints/Untitled-checkpoint.ipynb
Normal file
6
.ipynb_checkpoints/Untitled-checkpoint.ipynb
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"cells": [],
|
||||||
|
"metadata": {},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
82
Untitled.ipynb
Normal file
82
Untitled.ipynb
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n",
|
||||||
|
"b'Skipping line 1983: expected 1 fields, saw 2\\nSkipping line 5199: expected 1 fields, saw 2\\n'\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.naive_bayes import GaussianNB\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from sklearn.naive_bayes import MultinomialNB\n",
|
||||||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||||
|
"\n",
|
||||||
|
"r_in = './train/train.tsv'\n",
|
||||||
|
"\n",
|
||||||
|
"r_ind_ev = './dev-0/in.tsv'\n",
|
||||||
|
"tsv_read = pd.read_table(r_in, error_bad_lines=False, sep='\\t', header=None)\n",
|
||||||
|
"tsv_read_dev = pd.read_table(r_ind_ev, error_bad_lines=False, sep='\\t', header=None)\n",
|
||||||
|
"\n",
|
||||||
|
"y_train = tsv_read[0].values\n",
|
||||||
|
"X_train = tsv_read[1].values\n",
|
||||||
|
"X_dev = tsv_read_dev[0].values\n",
|
||||||
|
"\n",
|
||||||
|
"vectorizer = TfidfVectorizer()\n",
|
||||||
|
"counts = vectorizer.fit_transform(X_train)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"classifier = MultinomialNB()\n",
|
||||||
|
"classifier.fit(counts, y_train)\n",
|
||||||
|
"\n",
|
||||||
|
"counts2 = vectorizer.transform(X_dev)\n",
|
||||||
|
"predictions = classifier.predict(counts2)\n",
|
||||||
|
"\n",
|
||||||
|
"predictions.tofile(\"./dev-0/out.tsv\", sep='\\n')\n",
|
||||||
|
"\n",
|
||||||
|
"tsv_read_test_in = pd.read_table('./test-A/in.tsv', error_bad_lines=False, header= None)\n",
|
||||||
|
"X_test= tsv_read_test_in[0].values\n",
|
||||||
|
"\n",
|
||||||
|
"counts3 = vectorizer.transform(X_test)\n",
|
||||||
|
"predictions_test_A = classifier.predict(counts3)\n",
|
||||||
|
"predictions_test_A.tofile('./test-A/out.tsv', sep='\\n')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.5"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
61
bayes.py
Normal file
61
bayes.py
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
from sklearn.naive_bayes import GaussianNB
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
|
||||||
|
# Input TSV files, indexed by main() as: [0] training data, [1] dev-0 input,
# [2] test-A input.
PATHS = ['./train/train.tsv', './dev-0/in.tsv', './test-A/in.tsv']
|
||||||
|
# Prediction files written by main(): [0] dev-0 predictions, [1] test-A
# predictions.
PATHS_OUTPUT = ['./dev-0/out.tsv', './test-A/out.tsv']
|
||||||
|
|
||||||
|
def get_data(path):
    """Load a header-less, tab-separated file into a DataFrame.

    Rows whose number of fields does not match what the parser expects are
    skipped (with a warning) instead of aborting the whole load.

    NOTE(review): because malformed rows are dropped, a frame loaded from a
    prediction input may have fewer rows than the file has lines -- confirm
    that this is acceptable for the generated output files.

    Args:
        path: Location of the TSV file to read.

    Returns:
        pandas.DataFrame whose columns are labelled 0, 1, ... because the
        file is read with ``header=None``.
    """
    try:
        # pandas >= 1.3 spelling. 'warn' reproduces the legacy behaviour of
        # ``error_bad_lines=False`` (skip the row and report it).
        return pd.read_table(path, on_bad_lines='warn', sep='\t', header=None)
    except TypeError as exc:
        # Only fall back when the failure is the unknown keyword itself.
        if 'on_bad_lines' not in str(exc):
            raise
        # Older pandas only understands the legacy flag, which was
        # deprecated in 1.3 and removed in 2.0.
        return pd.read_table(path, error_bad_lines=False, sep='\t', header=None)
|
||||||
|
|
||||||
|
def get_X_y_train(data):
    """Split a loaded training frame into features and labels.

    Args:
        data: Frame in which column 1 holds the inputs and column 0 holds
            the target values.

    Returns:
        Tuple ``(X_train, y_train)`` of the underlying value arrays of
        column 1 and column 0 respectively.
    """
    texts, labels = data[1], data[0]
    return texts.values, labels.values
|
||||||
|
|
||||||
|
def training(x, y):
    """Fit a TF-IDF vectorizer and a multinomial naive Bayes classifier.

    Args:
        x: Training inputs passed to ``TfidfVectorizer.fit_transform``.
        y: Target values passed to ``MultinomialNB.fit``.

    Returns:
        Tuple ``(classifier, vectorizer)`` -- note the order: the fitted
        classifier comes first, the fitted vectorizer second.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(x)

    # ``fit`` returns the estimator itself, so construction and fitting chain.
    classifier = MultinomialNB().fit(tfidf_matrix, y)

    return classifier, vectorizer
|
||||||
|
|
||||||
|
|
||||||
|
def predict(vectorizer, classifier, x):
    """Vectorize ``x`` and return the classifier's predictions for it.

    Args:
        vectorizer: Object exposing ``transform``; applied to ``x`` first.
        classifier: Object exposing ``predict``; applied to the transformed
            inputs.
        x: Inputs to classify.

    Returns:
        Whatever ``classifier.predict`` returns for the transformed inputs.
    """
    return classifier.predict(vectorizer.transform(x))
|
||||||
|
|
||||||
|
def generate_output(pred, path):
    """Write predictions to ``path`` as text, one value per line.

    Args:
        pred: Array-like object exposing ``tofile`` (a NumPy array in this
            script).
        path: Destination file.

    The values are separated by newlines; no newline is written after the
    last value.
    """
    line_separator = '\n'
    pred.tofile(path, sep=line_separator)
|
||||||
|
|
||||||
|
def main():
    """Train on the training set and write dev-0 / test-A predictions.

    Steps:
      1. Load the training TSV and split it into inputs and labels.
      2. Fit the vectorizer and classifier.
      3. Predict for the dev and test inputs (column 0 of each file).
      4. Write each prediction array to its output file.
    """
    train_path, *eval_paths = PATHS

    # Prepare the training data and fit the model.
    features, labels = get_X_y_train(get_data(train_path))
    classifier, vectorizer = training(features, labels)

    # Compute every prediction before writing anything, so no output file is
    # touched unless all inputs were read and classified successfully.
    predictions = [
        predict(vectorizer, classifier, get_data(in_path)[0].values)
        for in_path in eval_paths
    ]

    # Outputs are paired with inputs by position: dev first, then test.
    for pred, out_path in zip(predictions, PATHS_OUTPUT):
        generate_output(pred, out_path)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
5452
dev-0/out.tsv
Normal file
5452
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5445
test-A/out.tsv
Normal file
5445
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
98132
train/train.tsv
Normal file
98132
train/train.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user