#!/usr/bin/env python
# coding: utf-8

# # Homework: naive Bayes 2, ready-made library
# - fork one of the repositories below:
#     - https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public
#     - https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public
# - build a classifier based on naive Bayes (a ready-made library is allowed); ready-made tf-idf implementations may also be used
# - generate predictions in the files dev-0/out.tsv and test-A/out.tsv
# - accuracy checked with the geval tool (see the previous assignment) should be at least 0.67
# - put the predictions and the generating scripts (as plain text, not a Jupyter notebook) in the repo, and post a link to your repo in MS TEAMS
# deadline 12.05, 40 points

# In[1]:

import pathlib
import gzip
import gensim
from stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer

# In[2]:

SPORT_TEXT_PATH = pathlib.Path('C:/Users/Fijka/Documents/sport-text-classification-ball-ISI-public')
file_name = 'train'

# In[3]:

def read_data(filename):
    # Each line of the gzipped TSV is "<label>\t<text>"; the trailing
    # empty string after the final newline is dropped with [:-1].
    with gzip.open(filename) as f:
        all_data = f.read().decode('UTF-8').split('\n')
    data, expected_class = [], []
    for fields in [line.split('\t') for line in all_data][:-1]:
        data.append(fields[1])
        expected_class.append(fields[0])
    return data, expected_class

train_data, train_classes = read_data(SPORT_TEXT_PATH / file_name / 'train.tsv.gz')
# Keep only the first 20,000 examples to limit memory use (GaussianNB
# below needs a dense matrix).
train_data, train_classes = train_data[:20000], train_classes[:20000]

# In[4]:

# Polish stop-word list, extended with a few single-letter words.
stop_words = get_stop_words('pl') + ['a', 'u', 'i', 'z', 'w', 'o']
print(stop_words)

# In[5]:

print(train_classes[0])
print(train_data[0])

# In[6]:

# Tokenize and lowercase; set() keeps each token once per document.
train_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in train_data]

# In[7]:

print(train_data_tokenized[0])

# In[8]:

# Crude "lemmatization": drop stop words and truncate every token to its
# first 6 characters, a cheap stemming heuristic for Polish inflection.
train_data_lemmatized = [sorted({w[:6] for w in set(tokens) - set(stop_words)}) for tokens in train_data_tokenized]

# In[9]:

print(train_data_lemmatized[0])

# In[10]:

print([' '.join(tokens) for tokens in train_data_lemmatized[:2]])

# In[11]:

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([' '.join(tokens) for tokens in train_data_lemmatized])

# In[12]:

# get_feature_names() was removed in scikit-learn 1.2; use the current name.
vocabulary = vectorizer.get_feature_names_out()

# In[13]:

from sklearn.naive_bayes import GaussianNB

# GaussianNB does not accept sparse input, hence the .toarray() calls.
model = GaussianNB()
model.fit(X.toarray(), train_classes)
score_train = model.score(X.toarray(), train_classes)  # training accuracy

# In[14]:

with open('dev-0/in.tsv', 'r', encoding='utf-8') as f:
    dev_0_data = [line.rstrip() for line in f]

# In[15]:

dev_0_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in dev_0_data]

# In[16]:

dev_0_data_lemmatized = [list({w[:6] for w in set(tokens) - set(stop_words)}) for tokens in dev_0_data_tokenized]

# In[17]:

# 'w' instead of 'a': appending would duplicate predictions on re-runs.
with open('dev-0/out.tsv', 'w', encoding='utf-8') as out_file:
    for doc in (' '.join(tokens) for tokens in dev_0_data_lemmatized):
        out_file.write(model.predict(vectorizer.transform([doc]).toarray())[0] + '\n')

# In[18]:

with open('dev-0/out.tsv', 'r', encoding='utf-8') as f:
    predicted = [line.rstrip() for line in f]

# In[19]:

with open('dev-0/expected.tsv', 'r', encoding='utf-8') as f:
    expected = [line.rstrip() for line in f]

# In[20]:

# Quick accuracy check on dev-0 (geval reports the same metric).
correct, incorrect = 0, 0
for p, e in zip(predicted, expected):
    if p == e:
        correct += 1
    else:
        incorrect += 1
print(correct, incorrect)
print(correct / (correct + incorrect))
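# Optional aside, not part of the original solution: MultinomialNB is the
# usual naive Bayes variant for tf-idf features and accepts the sparse
# matrix directly, so the dense .toarray() conversions above could be
# dropped. A minimal sketch, reusing X and train_classes from above:

from sklearn.naive_bayes import MultinomialNB

sparse_model = MultinomialNB()
sparse_model.fit(X, train_classes)           # no .toarray() needed
print(sparse_model.score(X, train_classes))  # training accuracy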
# In[21]:

with open('test-A/in.tsv', 'r', encoding='utf-8') as f:
    test_A_data = [line.rstrip() for line in f]
test_A_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in test_A_data]
test_A_data_lemmatized = [list({w[:6] for w in set(tokens) - set(stop_words)}) for tokens in test_A_data_tokenized]
# Same fix as for dev-0: write mode, so re-runs do not append duplicates.
with open('test-A/out.tsv', 'w', encoding='utf-8') as out_file:
    for doc in (' '.join(tokens) for tokens in test_A_data_lemmatized):
        out_file.write(model.predict(vectorizer.transform([doc]).toarray())[0] + '\n')
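# A possible refactor (a sketch, not part of the original solution): the
# dev-0 and test-A blocks above repeat the same tokenize -> stop-word
# filter -> 6-character stem -> predict pipeline, so it could live in one
# helper. predict_file is a hypothetical name; it reuses stop_words,
# vectorizer and model defined earlier in this script.

def predict_file(in_path, out_path):
    with open(in_path, 'r', encoding='utf-8') as in_file:
        lines = [line.rstrip() for line in in_file]
    tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in lines]
    stemmed = [list({w[:6] for w in set(tokens) - set(stop_words)}) for tokens in tokenized]
    with open(out_path, 'w', encoding='utf-8') as out_file:
        for doc in (' '.join(tokens) for tokens in stemmed):
            out_file.write(model.predict(vectorizer.transform([doc]).toarray())[0] + '\n')

# Usage equivalent to the two blocks above:
# predict_file('dev-0/in.tsv', 'dev-0/out.tsv')
# predict_file('test-A/in.tsv', 'test-A/out.tsv')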