#!/usr/bin/env python
# coding: utf-8
"""Train a TF-IDF + multinomial Naive Bayes text classifier and write predictions.

The script reads one document per line from ``train/in.tsv`` and the matching
labels from ``train/expected.tsv``, fits the model, and then writes one
predicted label per line to ``dev-0/out.tsv`` and ``test-A/out.tsv`` for the
documents found in ``dev-0/in.tsv`` and ``test-A/in.tsv``.

All paths are relative to the current working directory.
"""

from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB


def _read_texts(path):
    """Return a DataFrame with a single 'text' column, one row per line of *path*.

    Lines keep their trailing newline, exactly as ``readlines()`` yields them.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return pd.DataFrame({"text": handle.readlines()})


def _read_labels(path):
    """Return the label file at *path* as a DataFrame with one 'paranormal' column."""
    return pd.read_csv(path, sep="\t", names=["paranormal"], encoding="utf-8")


def _write_predictions(path, predictions):
    """Write each element of *predictions* on its own line to *path*."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(f"{label}\n" for label in predictions)


# Paths are built with pathlib so they work on every OS; the previous
# "dir\file" literals relied on invalid escape sequences and on Windows.
TRAIN_DIR = Path("train")
DEV_DIR = Path("dev-0")
TEST_DIR = Path("test-A")

# --- Training data ---------------------------------------------------------
x_train = _read_texts(TRAIN_DIR / "in.tsv")
y_train = _read_labels(TRAIN_DIR / "expected.tsv")

# --- Feature extraction ----------------------------------------------------
# max_df=0.95 drops terms present in more than 95% of documents;
# max_features=500 caps the vocabulary size.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, max_features=500)
# Fit and transform in a single pass. The matrix is left sparse because
# MultinomialNB accepts sparse input, which avoids a large dense allocation.
x_train_prepared = tfidf_vectorizer.fit_transform(x_train["text"].values)

# --- Model -----------------------------------------------------------------
mnb = MultinomialNB()
# ravel() flattens the single-column label frame into the 1-D array fit() expects.
model_mnb = mnb.fit(x_train_prepared, y_train.values.ravel())

# --- dev-0 predictions -----------------------------------------------------
x_dev = _read_texts(DEV_DIR / "in.tsv")
x_dev_prepared = tfidf_vectorizer.transform(x_dev["text"].values)
# Gold labels are loaded but not used further in this script.
y_dev = _read_labels(DEV_DIR / "expected.tsv")
y_dev_pred = model_mnb.predict(x_dev_prepared)
_write_predictions(DEV_DIR / "out.tsv", y_dev_pred)

# --- test-A predictions ----------------------------------------------------
x_test = _read_texts(TEST_DIR / "in.tsv")
x_test_prepared = tfidf_vectorizer.transform(x_test["text"].values)
y_test_pred = model_mnb.predict(x_test_prepared)
_write_predictions(TEST_DIR / "out.tsv", y_test_pred)