update
This commit is contained in:
parent
756ef4277a
commit
8967a904f8
103
.ipynb_checkpoints/Untitled-checkpoint.ipynb
Normal file
103
.ipynb_checkpoints/Untitled-checkpoint.ipynb
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "NameError",
|
||||||
|
"evalue": "name 'text_data' is not defined",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[0;32m<ipython-input-3-d179e01d96de>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mw2v_model\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mWord2Vec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m300\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmin_count\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwindow\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mw2v_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'word'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[0;31mNameError\u001b[0m: name 'text_data' is not defined"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from nltk.tokenize import sent_tokenize, word_tokenize\n",
|
||||||
|
"import warnings\n",
|
||||||
|
" \n",
|
||||||
|
"warnings.filterwarnings(action = 'ignore')\n",
|
||||||
|
" \n",
|
||||||
|
"import gensim\n",
|
||||||
|
"from gensim.models import Word2Vec\n",
|
||||||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"from sklearn.datasets import load_iris\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"from sklearn.naive_bayes import GaussianNB\n",
|
||||||
|
"\n",
|
||||||
|
"sample = open(\"/train/in.tsv\", \"r\")\n",
|
||||||
|
"s = sample.read()\n",
|
||||||
|
" \n",
|
||||||
|
"# Replaces escape character with space\n",
|
||||||
|
"f = s.replace(\"\\n\", \" \")\n",
|
||||||
|
" \n",
|
||||||
|
"data = []\n",
|
||||||
|
" \n",
|
||||||
|
"# iterate through each sentence in the file\n",
|
||||||
|
"for i in sent_tokenize(f):\n",
|
||||||
|
" temp = []\n",
|
||||||
|
" \n",
|
||||||
|
" # tokenize the sentence into words\n",
|
||||||
|
" for j in word_tokenize(i):\n",
|
||||||
|
" temp.append(j.lower())\n",
|
||||||
|
" \n",
|
||||||
|
" data.append(temp)\n",
|
||||||
|
" \n",
|
||||||
|
"# Create CBOW model\n",
|
||||||
|
"model1 = gensim.models.Word2Vec(data, min_count = 1, \n",
|
||||||
|
" size = 100, window = 5)\n",
|
||||||
|
" \n",
|
||||||
|
"w2v_model = gensim.models.Word2Vec(text_data, size=300, min_count=1, window=5, iter=50)\n",
|
||||||
|
"w2v_model.wv['word']\n",
|
||||||
|
"\n",
|
||||||
|
"with open(\"train/in.tsv\") as f:\n",
|
||||||
|
" content = f.readlines()\n",
|
||||||
|
" with open(\"train/expected.tsv\") as ff:\n",
|
||||||
|
" y = ff.readlines()\n",
|
||||||
|
" vectorizer = TfidfVectorizer(ngram_range=(1,2), use_idf = False)\n",
|
||||||
|
" vectorizer = TfidfVectorizer()\n",
|
||||||
|
" x = vectorizer.fit_transform(content)\n",
|
||||||
|
" x=x.toarray()\n",
|
||||||
|
" y=y.toarray()\n",
|
||||||
|
" model = GaussianNB()\n",
|
||||||
|
" model.fit(x,y)\n",
|
||||||
|
" y_pred = model.predict([[0,1]])\n",
|
||||||
|
" print(y_pred)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
5272
dev-0/in.tsv
Normal file
5272
dev-0/in.tsv
Normal file
File diff suppressed because one or more lines are too long
5272
dev-0/out.tsv
Normal file
5272
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
30
program.py
Normal file
30
program.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
import numpy as np
|
||||||
|
from sklearn import preprocessing
|
||||||
|
from sklearn.naive_bayes import GaussianNB
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
le=preprocessing.LabelEncoder()
|
||||||
|
|
||||||
|
with open("train/in.tsv") as f:
|
||||||
|
data = f.readlines()
|
||||||
|
vectorizer = TfidfVectorizer(ngram_range=(1,2), use_idf = False)
|
||||||
|
vectorizer = TfidfVectorizer()
|
||||||
|
x = vectorizer.fit_transform(data)
|
||||||
|
X=x.toarray()
|
||||||
|
with open("train/expected.tsv") as ff:
|
||||||
|
Y = ff.readlines()
|
||||||
|
Y=le.fit_transform(Y)
|
||||||
|
with open("dev-0/in.tsv") as d:
|
||||||
|
fil = d.readlines()
|
||||||
|
vectorizer = TfidfVectorizer(ngram_range=(1,2), use_idf = False)
|
||||||
|
vectorizer = TfidfVectorizer()
|
||||||
|
r=vectorizer.fit_transform(fil)
|
||||||
|
r=r.toarray()
|
||||||
|
r=r.reshape(-1,1)
|
||||||
|
gnb = GaussianNB()
|
||||||
|
model=gnb.fit(X, Y)
|
||||||
|
y_pred=model.predict(X)
|
||||||
|
print(y_pred)
|
||||||
|
y_pred=np.array(y_pred)
|
||||||
|
t=np.array2string(y_pred, precision=2, separator='\n',suppress_small=True)
|
||||||
|
f = open("dev-0/out.tsv", "a")
|
||||||
|
f.write(t)
|
5152
test-A/in.tsv
Normal file
5152
test-A/in.tsv
Normal file
File diff suppressed because one or more lines are too long
5152
test-A/out.tsv
Normal file
5152
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
289579
train/ex.tsv
Normal file
289579
train/ex.tsv
Normal file
File diff suppressed because it is too large
Load Diff
284307
train/expected.tsv
284307
train/expected.tsv
File diff suppressed because it is too large
Load Diff
5272
train/in.tsv
Normal file
5272
train/in.tsv
Normal file
File diff suppressed because one or more lines are too long
289579
train/innn.tsv
Normal file
289579
train/innn.tsv
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user