import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from sklearn.model_selection import train_test_split
# dev-0/in.tsv holds one document per line; dev-0/expected.tsv holds the matching labels.
data = pd.read_csv('dev-0/in.tsv', sep='\t', header=None)
expected_data = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None)
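A quick sanity check (an optional sketch, not part of the original run) confirms that every document has a label:
# Optional sanity check: the two files must line up row for row.
assert len(data) == len(expected_data), \
    f"{len(data)} documents vs {len(expected_data)} labels"
print(data.shape, expected_data.shape)
print(data[0].head())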
data[0] = data[0].str.lower()
# Remove Polish stop words inside each document; the original comprehension iterated
# over whole documents, so it only dropped lines that were themselves a stop word.
stop_words = set(get_stop_words('polish'))
filtered_words = [' '.join(w for w in doc.split() if w not in stop_words)
                  for doc in data[0]]
# RegexpTokenizer keeps only ASCII alphanumeric runs, so Polish diacritics are dropped.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True, ngram_range=(1, 1), tokenizer=token.tokenize)
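As an illustrative aside (the phrase below is made up), the ASCII-only pattern loses letters with diacritics:
# Letters outside a-zA-Z/0-9 are silently dropped, e.g. 'żółty' collapses to 'ty'.
print(token.tokenize("To jest żółty dom"))   # ['To', 'jest', 'ty', 'dom']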
text_counts = cv.fit_transform(data[0])
text_counts
# <1x5048 sparse matrix of type '<class 'numpy.int64'>'
#  with 234 stored elements in Compressed Sparse Row format>
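If needed, the learned bag-of-words vocabulary can be inspected directly; this is an optional sketch using the fitted vectorizer's vocabulary_ mapping:
# Inspect the vocabulary learned by CountVectorizer: token -> column index.
print(len(cv.vocabulary_))                   # number of distinct tokens
print(list(cv.vocabulary_.items())[:10])     # a small sample of the mapping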
# Hold out 30% of the documents for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    text_counts, expected_data[0], test_size=0.3, random_state=1)
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Multinomial Naive Bayes trained on raw token counts.
clf = MultinomialNB().fit(X_train, y_train)
predicted = clf.predict(X_test)
print("MultinomialNB Accuracy:", metrics.accuracy_score(y_test, predicted))
# MultinomialNB Accuracy: 0.6296296296296297
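Beyond overall accuracy, a per-class breakdown can be printed as well; this is an optional sketch using scikit-learn's standard metrics:
# Optional: per-class precision/recall/F1 and the confusion matrix for the count model.
print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))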
from sklearn.feature_extraction.text import TfidfVectorizer

# Second variant: TF-IDF features built from the stop-word-filtered documents.
tf = TfidfVectorizer()
text_tf = tf.fit_transform(filtered_words)
# train_test_split, MultinomialNB and metrics are already imported above.
X_train, X_test, y_train, y_test = train_test_split(
    text_tf, expected_data[0], test_size=0.3, random_state=123)
clf = MultinomialNB().fit(X_train, y_train)
predicted = clf.predict(X_test)
print("MultinomialNB Accuracy:", metrics.accuracy_score(y_test, predicted))
# MultinomialNB Accuracy: 0.2222222222222222  (as reported in the original run)
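The TF-IDF variant can also be expressed as a single scikit-learn Pipeline, which keeps vectorisation and classification in one object; this is a sketch, not part of the original run:
from sklearn.pipeline import Pipeline

# Sketch: bundle TF-IDF vectorisation and Naive Bayes so the same fitted object
# can later be applied to unseen text without re-running the vectorizer by hand.
tfidf_nb = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("nb", MultinomialNB()),
])
X_tr, X_te, y_tr, y_te = train_test_split(
    filtered_words, expected_data[0], test_size=0.3, random_state=123)
tfidf_nb.fit(X_tr, y_tr)
print("Pipeline accuracy:", tfidf_nb.score(X_te, y_te))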