add bayes
This commit is contained in:
parent
9cb2fb2612
commit
4ea8113b15
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"cells": [],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n",
|
||||
"b'Skipping line 1983: expected 1 fields, saw 2\\nSkipping line 5199: expected 1 fields, saw 2\\n'\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.naive_bayes import GaussianNB\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.naive_bayes import MultinomialNB\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"\n",
|
||||
"r_in = './train/train.tsv'\n",
|
||||
"\n",
|
||||
"r_ind_ev = './dev-0/in.tsv'\n",
|
||||
"tsv_read = pd.read_table(r_in, error_bad_lines=False, sep='\\t', header=None)\n",
|
||||
"tsv_read_dev = pd.read_table(r_ind_ev, error_bad_lines=False, sep='\\t', header=None)\n",
|
||||
"\n",
|
||||
"y_train = tsv_read[0].values\n",
|
||||
"X_train = tsv_read[1].values\n",
|
||||
"X_dev = tsv_read_dev[0].values\n",
|
||||
"\n",
|
||||
"vectorizer = TfidfVectorizer()\n",
|
||||
"counts = vectorizer.fit_transform(X_train)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"classifier = MultinomialNB()\n",
|
||||
"classifier.fit(counts, y_train)\n",
|
||||
"\n",
|
||||
"counts2 = vectorizer.transform(X_dev)\n",
|
||||
"predictions = classifier.predict(counts2)\n",
|
||||
"\n",
|
||||
"predictions.tofile(\"./dev-0/out.tsv\", sep='\\n')\n",
|
||||
"\n",
|
||||
"tsv_read_test_in = pd.read_table('./test-A/in.tsv', error_bad_lines=False, header= None)\n",
|
||||
"X_test= tsv_read_test_in[0].values\n",
|
||||
"\n",
|
||||
"counts3 = vectorizer.transform(X_test)\n",
|
||||
"predictions_test_A = classifier.predict(counts3)\n",
|
||||
"predictions_test_A.tofile('./test-A/out.tsv', sep='\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
from sklearn.naive_bayes import GaussianNB
|
||||
import pandas as pd
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
|
||||
# Input files, indexed by position: 0 = training set, 1 = dev set, 2 = test set.
PATHS = [
    './train/train.tsv',
    './dev-0/in.tsv',
    './test-A/in.tsv',
]

# Prediction files, indexed by position: 0 = dev set, 1 = test set.
PATHS_OUTPUT = [
    './dev-0/out.tsv',
    './test-A/out.tsv',
]
|
||||
|
||||
def get_data(path):
    """Load a headerless, tab-separated file into a DataFrame.

    Rows whose number of fields does not match the rest of the file are
    skipped (with a warning) instead of aborting the whole read.

    Args:
        path: Location of the TSV file to read.

    Returns:
        A pandas DataFrame; because no header row is consumed, columns are
        labelled with integers starting at 0.
    """
    options = {'sep': '\t', 'header': None}
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
        # `on_bad_lines='warn'` is its replacement and keeps the same
        # "skip the row but report it" behaviour.
        return pd.read_table(path, on_bad_lines='warn', **options)
    except TypeError:
        # pandas older than 1.3 rejects `on_bad_lines` as an unexpected
        # keyword argument, so fall back to the legacy flag there.
        return pd.read_table(path, error_bad_lines=False, **options)
|
||||
|
||||
def get_X_y_train(data):
    """Split a loaded training table into inputs (X) and targets (y).

    Args:
        data: Table indexable by column label, where column 1 holds the
            inputs and column 0 holds the targets.

    Returns:
        A ``(X_train, y_train)`` tuple holding the ``.values`` of column 1
        and column 0, in that order.
    """
    # Column 1 is read first so the tuple comes out as (X, y).
    return tuple(data[column].values for column in (1, 0))
|
||||
|
||||
def training(x, y):
    """Fit a TF-IDF vectorizer and a multinomial naive Bayes classifier.

    Args:
        x: Training inputs handed to ``TfidfVectorizer.fit_transform``.
        y: Training targets handed to ``MultinomialNB.fit``.

    Returns:
        A ``(classifier, vectorizer)`` tuple of the two fitted objects.
        Note the order: the classifier comes first.
    """
    tfidf = TfidfVectorizer()
    features = tfidf.fit_transform(x)

    # scikit-learn estimators return themselves from fit(), so construction
    # and fitting can be chained.
    model = MultinomialNB().fit(features, y)

    return model, tfidf
|
||||
|
||||
|
||||
def predict(vectorizer, classifier, x):
    """Vectorize *x* and return the classifier's predictions for it.

    Args:
        vectorizer: Object exposing ``transform(x)``.
        classifier: Object exposing ``predict(features)``.
        x: Inputs to transform and classify.

    Returns:
        Whatever ``classifier.predict`` returns for the transformed inputs.
    """
    return classifier.predict(vectorizer.transform(x))
|
||||
|
||||
def generate_output(pred, path):
    """Write predictions to *path* as text, separated by newlines.

    Args:
        pred: Object exposing a numpy-style ``tofile(path, sep=...)`` method
            (presumably the array returned by the classifier - confirm
            against the caller).
        path: Destination file.
    """
    separator = '\n'
    pred.tofile(path, sep=separator)
|
||||
|
||||
def _read_documents(path):
    """Return every line of *path* as one document, line ending removed."""
    with open(path, encoding='utf-8') as handle:
        return [line.rstrip('\n') for line in handle]


def main():
    """Train on the training set and write dev and test predictions.

    The training table is loaded with ``get_data``, which may skip malformed
    rows; losing a few training examples is harmless. The dev and test
    inputs are instead read line by line: each output file must contain
    exactly one prediction per input line, and a tabular reader that drops
    rows would shift every later prediction out of alignment.
    """
    # prepare train
    train = get_data(PATHS[0])
    X_train, y_train = get_X_y_train(train)

    # train
    classifier, vectorizer = training(X_train, y_train)

    # dev: one document per input line, nothing skipped
    X_dev = _read_documents(PATHS[1])
    pred_dev = predict(vectorizer, classifier, X_dev)

    # test: same treatment as dev
    X_test = _read_documents(PATHS[2])
    pred_test = predict(vectorizer, classifier, X_test)

    # generate output
    generate_output(pred_dev, PATHS_OUTPUT[0])
    generate_output(pred_test, PATHS_OUTPUT[1])
|
||||
|
||||
|
||||
|
||||
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue