Compare commits

...

10 Commits
master ... w2v

Author           SHA1         Message                Date
Artur Dylewski   4a86e3878e   add w2v                2020-06-03 18:01:38 +02:00
dylodylo         d13443a750   change output          2020-05-02 19:29:47 +02:00
dylodylo         9aea4283bd   change output          2020-05-02 19:27:04 +02:00
dylodylo         599d13bf16   change output          2020-05-02 16:26:34 +02:00
dylodylo         f0b5319f41   change output          2020-05-02 15:24:22 +02:00
dylodylo         fa68a0fe33   change output          2020-05-02 13:47:19 +02:00
dylodylo         dafa49e690   add tokenizer          2020-05-02 13:40:22 +02:00
dylodylo         744e5db758   naive-bayess solution  2020-03-29 21:22:20 +02:00
dylodylo         2a9ca866c9   naive-bayess solution  2020-03-29 21:03:04 +02:00
dylodylo         8fd7b62eef   naive-bayess solution  2020-03-29 20:58:56 +02:00
5 changed files with 10646 additions and 10466 deletions

Binary file not shown.

File diff suppressed because it is too large.


@@ -1,71 +1,150 @@
import csv
from collections import defaultdict
import math
import pickle
import os
from pathlib import Path

counter = 0
docs = []
with open('in.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        docs.append(row)
        counter += 1
print(counter)

pcounter = 0
scounter = 0
with open('expected.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        if row[0] == " P":
            pcounter += 1
        if row[0] == " S":
            scounter += 1
def tokenize(text):
    # expand common contractions, strip punctuation and a few frequent stop words
    text = text.replace("n't", " not")
    text = text.replace("'s", " is")
    text = text.replace("'ve", " have")
    text = text.replace("'", " ")
    text = text.replace("(", " ")
    text = text.replace(")", " ")
    text = text.replace("/", " ")
    text = text.replace("\\n\\n", "")
    text = text.replace(".", "")
    text = text.replace("?", "")
    text = text.replace(",", "")
    text = text.replace("!", "")
    text = text.replace('"', '')
    text = text.replace(" a ", " ")
    text = text.replace(" on ", " ")
    text = text.replace(" the ", " ")
    text = text.replace(" of ", " ")
    text = text.replace(" an ", " ")
    text = text.replace(" to ", " ")
    #text = text.replace("a", "")
    return text

print(pcounter)
print(scounter)
print("P(S) = " + str((scounter + 1) / (counter + 2)))
print("P(P) = " + str((pcounter + 1) / (counter + 2)))
def calc_class_logprob(expected_path):  # overall probability for each class, P(c)
    paranormal_class_count = 0
    skeptic_class_count = 0
    with open(expected_path) as f:
        for line in f:
            if "1" in line:
                paranormal_class_count += 1
            elif "0" in line:
                skeptic_class_count += 1
    paranormal_class_prob = paranormal_class_count / (paranormal_class_count + skeptic_class_count)
    skeptic_class_prob = skeptic_class_count / (paranormal_class_count + skeptic_class_count)
    return paranormal_class_prob, skeptic_class_prob

def calc_word_counts(in_path, expected_path):
    with open(in_path) as in_file, open(expected_path) as exp_file:
        word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
        for in_line, exp_line in zip(in_file, exp_file):
            class_ = exp_line.rstrip('\n').replace(" ", "")
            text, timestamp = in_line.rstrip('\n').split('\t')
            text = tokenize(text)
            tokens = text.lower().split(' ')
            for token in tokens:
                if class_ == '1':
                    word_counts['paranormal'][token] += 1
                elif class_ == '0':
                    word_counts['skeptic'][token] += 1
    return word_counts

def calc_word_logprobs(word_counts):
    total_skeptic = sum(word_counts['skeptic'].values()) + len(word_counts['skeptic'].keys())
    total_paranormal = sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'].keys())
    word_logprobs = {'paranormal': {}, 'skeptic': {}}
    for class_ in word_logprobs.keys():
        for token, value in word_counts[class_].items():
            if class_ == 'skeptic':
                word_prob = (value + 1) / total_skeptic
            else:
                word_prob = (value + 1) / total_paranormal
            word_logprobs[class_][token] = word_prob
    return word_logprobs

paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")
word_counts = calc_word_counts('train/in.tsv', 'train/expected.tsv')
word_logprobs = calc_word_logprobs(word_counts)
#print(word_logprobs['skeptic']["hair."]) #-12.166205308815476
# next: 1. read a post, 2. split it into terms, 3. compute each term's probability,
# 4. add them up, 5. compare the paranormal score with the skeptic score
def get_test_posts(path):
    posts = []
    with open(path) as f:
        for line in f:
            text, timestamp = line.rstrip('\n').split('\t')
            posts.append(text)
    return posts

# with open('prediction.tsv', 'wt') as tsvfile:
#     tsv_writer = csv.writer(tsvfile, delimiter='\t')
#     for i in range(counter):
#         tsv_writer.writerow('S')

def predict_post_class(posts, sprob, pprob, word_logprobs):
    out_classes = []
    for post in posts:
        total_s_prob = math.log(sprob)
        total_p_prob = math.log(pprob)
        post = tokenize(post)
        tokens = post.lower().split(' ')
        for token in tokens:
            # skeptic
            if token in word_logprobs['skeptic'].keys():
                sceptic_prob = word_logprobs['skeptic'][token] + 1 / (len(word_logprobs['skeptic']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))
            else:
                sceptic_prob = 1 / (len(word_logprobs['skeptic']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))
            # paranormal
            if token in word_logprobs['paranormal'].keys():
                paranormal_prob = word_logprobs['paranormal'][token] + 1 / (len(word_logprobs['paranormal']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))
            else:
                paranormal_prob = 1 / (len(word_logprobs['paranormal']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))
            total_s_prob += math.log(sceptic_prob)
            total_p_prob += math.log(paranormal_prob)
        #print(total_p_prob)
        #print(total_s_prob)
        if total_p_prob > total_s_prob:
            out_classes.append(total_p_prob)
        else:
            out_classes.append(total_s_prob)
    return out_classes

def predict_posts(path):
    posts = get_test_posts(path + '/in.tsv')
    classes = predict_post_class(posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs)
    with open(path + "/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        # for i in classes:
        #     tsv_writer.writerow(i)
        tsv_writer.writerows(map(lambda x: [-x], classes))

predict_posts("dev-0")
predict_posts("test-A")

with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
    counter = 0
    positive = 0
    for out_line, exp_line in zip(out_file, exp_file):
        counter += 1
        if out_line == exp_line:
            positive += 1
    print(positive / counter)
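Side note (not part of the diff): calc_word_logprobs and predict_post_class above fold add-one (Laplace) smoothing into the per-token probabilities. A minimal, self-contained sketch of that scoring rule, with made-up class names and counts purely for illustration, could look like this:

import math
from collections import defaultdict

def naive_bayes_score(tokens, class_prior, token_counts, vocab_size):
    # log P(c) + sum of log P(token | c), with add-one smoothing over the class vocabulary
    total = sum(token_counts.values()) + vocab_size
    score = math.log(class_prior)
    for token in tokens:
        score += math.log((token_counts[token] + 1) / total)
    return score

# toy usage with invented counts
counts = {
    'paranormal': defaultdict(int, {'ghost': 3, 'story': 1}),
    'skeptic': defaultdict(int, {'evidence': 4, 'story': 2}),
}
vocab = {t for c in counts.values() for t in c}
scores = {c: naive_bayes_score(['ghost', 'story'], 0.5, counts[c], len(vocab)) for c in counts}
print(max(scores, key=scores.get))  # prints the class with the higher log-probability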

File diff suppressed because it is too large.

101
w2v.py Normal file

@@ -0,0 +1,101 @@
import pandas as pd  # our main data management package
import string  # used for preprocessing
import re  # used for preprocessing
import nltk  # the Natural Language Toolkit, used for preprocessing
import numpy as np  # used for managing NaNs
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  # used for preprocessing
from nltk.stem import WordNetLemmatizer  # used for preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression  # our model
from sklearn.model_selection import train_test_split
import gensim.models

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# remove mentions, URLs and non-alphanumeric characters
def remove_urls(text):
    new_text = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split())
    return new_text

# make all text lowercase
def text_lowercase(text):
    return text.lower()

# remove numbers
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result

# remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# tokenize
def tokenize(text):
    text = word_tokenize(text)
    return text

# remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    text = [i for i in text if not i in stop_words]
    return text

# lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    text = [lemmatizer.lemmatize(token) for token in text]
    return text

def preprocessing(text):
    text = text_lowercase(text)
    text = remove_urls(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = tokenize(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    #text = ' '.join(text)
    return text

def make_posts_list(in_file):
    posts = []
    with open(in_file) as f:
        for line in f:
            post = line.split('\t')[0]
            posts.append(preprocessing(post))
    return posts

def make_exp_list(exp_file):
    exp_list = []
    with open(exp_file) as f:
        for exp_line in f:
            y = int(exp_line)
            exp_list.append(y)
    return exp_list
tokens = make_posts_list("train/in.tsv")
Y = make_exp_list("train/out.tsv")

model = gensim.models.Word2Vec(tokens, size=300, min_count=1, workers=4)
print("\n Training the word2vec model...\n")
# reducing the epochs will decrease the computation time
model.train(tokens, total_examples=len(tokens), epochs=4000)

clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(model.wv.syn0, Y)

tokens = make_posts_list("dev-0/in.tsv")
# Prediction of the first 15 samples of all features
predict = clf.predict(model.wv.syn0[:15, :])
# Calculating the score of the predictions
score = clf.score(model.wv.syn0, Y)
print("\nPrediction word2vec : \n", predict)
print("Score word2vec : \n", score)