Compare commits
No commits in common. "my-brilliant-branch" and "master" have entirely different histories.
my-brillia
...
master
5272
dev-0/out.tsv
5272
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
160
foo.py
160
foo.py
@ -1,160 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf-8
|
|
||||||
|
|
||||||
# In[3]:
|
|
||||||
|
|
||||||
|
|
||||||
import pathlib
|
|
||||||
from collections import Counter
|
|
||||||
from sklearn.metrics import *
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
|
||||||
# In[1]:
|
|
||||||
|
|
||||||
|
|
||||||
import numpy as np, pandas as pd
|
|
||||||
import seaborn as sns
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
from sklearn.datasets import fetch_20newsgroups
|
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
||||||
from sklearn.naive_bayes import MultinomialNB
|
|
||||||
from sklearn.pipeline import make_pipeline
|
|
||||||
from sklearn.metrics import confusion_matrix, accuracy_score
|
|
||||||
sns.set() # use seaborn plotting style
|
|
||||||
|
|
||||||
|
|
||||||
# In[5]:
|
|
||||||
|
|
||||||
|
|
||||||
train_x = pd.read_csv('train/in.tsv', header=None, sep='\t')
|
|
||||||
train_y = pd.read_csv('train/expected.tsv', header=None, sep='\t')
|
|
||||||
dev_x = pd.read_csv('dev-0/in.tsv', header=None, sep='\t')
|
|
||||||
dev_y = pd.read_csv('dev-0/expected.tsv', header=None, sep='\t')
|
|
||||||
test_x = pd.read_csv('test-A/in.tsv', header=None, sep='\t')
|
|
||||||
|
|
||||||
|
|
||||||
# In[61]:
|
|
||||||
|
|
||||||
|
|
||||||
print(dev_y.shape)
|
|
||||||
print(dev_x.shape)
|
|
||||||
|
|
||||||
|
|
||||||
# In[11]:
|
|
||||||
|
|
||||||
|
|
||||||
print(train_x[:15])
|
|
||||||
|
|
||||||
|
|
||||||
# In[27]:
|
|
||||||
|
|
||||||
|
|
||||||
print(train_x.shape)
|
|
||||||
|
|
||||||
|
|
||||||
# In[49]:
|
|
||||||
|
|
||||||
|
|
||||||
print(train_y.shape)
|
|
||||||
|
|
||||||
|
|
||||||
# In[8]:
|
|
||||||
|
|
||||||
|
|
||||||
print(train_y[:15])
|
|
||||||
|
|
||||||
|
|
||||||
# In[53]:
|
|
||||||
|
|
||||||
|
|
||||||
print(dev_x[:4])
|
|
||||||
|
|
||||||
|
|
||||||
# In[119]:
|
|
||||||
|
|
||||||
|
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
|
||||||
from sklearn.feature_extraction.text import TfidfTransformer
|
|
||||||
|
|
||||||
vec = CountVectorizer(stop_words='english')
|
|
||||||
x1 = vec.fit_transform(train_x[:20000][0])
|
|
||||||
tfidf_transformer = TfidfTransformer()
|
|
||||||
|
|
||||||
x1_tf = tfidf_transformer.fit_transform(x1)
|
|
||||||
|
|
||||||
|
|
||||||
# In[120]:
|
|
||||||
|
|
||||||
|
|
||||||
# Build the model
|
|
||||||
#model = make_pipeline(TfidfVectorizer(), MultinomialNB())
|
|
||||||
clf = MultinomialNB().fit(x1_tf, train_y[:20000][0])
|
|
||||||
|
|
||||||
|
|
||||||
# In[121]:
|
|
||||||
|
|
||||||
|
|
||||||
# Train the model using the training data
|
|
||||||
#model.fit(x1[:][0], train_y[:289541][0])
|
|
||||||
# Transform the dev data into count features before predicting its categories
|
|
||||||
X_new_counts = vec.transform(dev_x[:][0])
|
|
||||||
# We call transform instead of fit_transform because it's already been fit
|
|
||||||
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
|
|
||||||
#predicted_categories = model.predict(dev_x[:][0])
|
|
||||||
|
|
||||||
|
|
||||||
# In[122]:
|
|
||||||
|
|
||||||
|
|
||||||
predicted = clf.predict(X_new_tfidf)
|
|
||||||
|
|
||||||
|
|
||||||
# In[125]:
|
|
||||||
|
|
||||||
|
|
||||||
print(predicted[:10])
|
|
||||||
|
|
||||||
|
|
||||||
# In[126]:
|
|
||||||
|
|
||||||
|
|
||||||
print(predicted.shape)
|
|
||||||
|
|
||||||
|
|
||||||
# In[123]:
|
|
||||||
|
|
||||||
|
|
||||||
#mat = confusion_matrix(dev_y[:][0],predicted_categories)
|
|
||||||
|
|
||||||
# Report dev-set accuracy. Use `predicted` (the output of clf.predict); the
# old `predicted_categories` name is only assigned in commented-out code and
# would raise NameError here.
print("The accuracy is {}".format(accuracy_score(dev_y[:][0], predicted)))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# In[124]:
|
|
||||||
|
|
||||||
|
|
||||||
print('We got an accuracy of',np.mean(predicted == dev_y[:][0])*100, '% over the test data.')
|
|
||||||
|
|
||||||
|
|
||||||
# In[130]:
|
|
||||||
|
|
||||||
|
|
||||||
# Save dev-set predictions under dev-0/ so the later test-set save does not
# overwrite them (both previously targeted the same "out.tsv").
np.savetxt("dev-0/out.tsv", predicted, delimiter="\t", fmt='%d')
|
|
||||||
|
|
||||||
|
|
||||||
# In[131]:
|
|
||||||
|
|
||||||
|
|
||||||
X_test = vec.transform(test_x[:][0])
|
|
||||||
# We call transform instead of fit_transform because it's already been fit
|
|
||||||
X_tfidf_test = tfidf_transformer.transform(X_test)
|
|
||||||
predicted_test = clf.predict(X_tfidf_test)
|
|
||||||
# Save test-set predictions under test-A/, keeping them separate from the
# dev-set output written earlier in the script.
np.savetxt("test-A/out.tsv", predicted_test, delimiter="\t", fmt='%d')
|
|
||||||
|
|
||||||
|
|
||||||
# In[ ]:
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
5152
test-A/out.tsv
5152
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user