#!/usr/bin/env python
# coding: utf-8
"""Train a bag-of-words naive Bayes text classifier and write predictions.

Reads tab-separated train / dev-0 / test-A splits, fits a
CountVectorizer -> TfidfTransformer -> MultinomialNB chain on the first
TRAIN_LIMIT training rows, reports accuracy on dev-0 and writes integer
predictions to ``out.tsv``.
"""

import pathlib
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# Wildcard import kept from the original notebook so that any name it
# exposed stays available; prefer the explicit import below in new code.
from sklearn.metrics import *  # noqa: F401,F403
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

sns.set()  # use seaborn plotting style

# Number of leading training rows used to fit the vectorizer and the model.
TRAIN_LIMIT = 20000

# ---------------------------------------------------------------------------
# Load data (headerless TSV files; column 0 is the one used below)
# ---------------------------------------------------------------------------
train_x = pd.read_csv('train/in.tsv', header=None, sep='\t')
train_y = pd.read_csv('train/expected.tsv', header=None, sep='\t')
dev_x = pd.read_csv('dev-0/in.tsv', header=None, sep='\t')
dev_y = pd.read_csv('dev-0/expected.tsv', header=None, sep='\t')
test_x = pd.read_csv('test-A/in.tsv', header=None, sep='\t')

# ---------------------------------------------------------------------------
# Quick look at the loaded frames
# ---------------------------------------------------------------------------
print(dev_y.shape)
print(dev_x.shape)
print(train_x[:15])
print(train_x.shape)
print(train_y.shape)
print(train_y[:15])
print(dev_x[:4])

# ---------------------------------------------------------------------------
# Fit features and model on the training subset
# ---------------------------------------------------------------------------
train_texts = train_x.iloc[:TRAIN_LIMIT, 0]
train_labels = train_y.iloc[:TRAIN_LIMIT, 0]

vec = CountVectorizer(stop_words='english')
x1 = vec.fit_transform(train_texts)

tfidf_transformer = TfidfTransformer()
x1_tf = tfidf_transformer.fit_transform(x1)

clf = MultinomialNB().fit(x1_tf, train_labels)

# ---------------------------------------------------------------------------
# Evaluate on the dev split
# ---------------------------------------------------------------------------
dev_labels = dev_y[0]

# transform (not fit_transform): vocabulary and IDF weights were already
# learned from the training subset.
X_new_counts = vec.transform(dev_x[0])
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)
print(predicted[:10])
print(predicted.shape)

# The original referenced `predicted_categories`, which is never assigned
# (its only definition was commented out) and raised NameError here.
print("The accuracy is {}".format(accuracy_score(dev_labels, predicted)))

# Message text is kept as-is; note the figure is computed on the dev split.
print('We got an accuracy of', np.mean(predicted == dev_labels) * 100,
      '% over the test data.')

# NOTE(review): this file is overwritten by the test predictions below, so
# the dev predictions do not survive the run. Presumably each split should
# get its own output file - confirm the expected layout before changing paths.
np.savetxt("out.tsv", predicted, delimiter="\t", fmt='%d')

# ---------------------------------------------------------------------------
# Predict on the test split
# ---------------------------------------------------------------------------
X_test = vec.transform(test_x[0])
X_tfidf_test = tfidf_transformer.transform(X_test)

predicted_test = clf.predict(X_tfidf_test)
np.savetxt("out.tsv", predicted_test, delimiter="\t", fmt='%d')