Compare commits

...

9 Commits

Author SHA1 Message Date
dylodylo
6eb349dc94 better code 2020-04-20 16:14:42 +02:00
dylodylo
ef7d13af8b updated code, much better 2020-04-14 14:18:59 +02:00
dylodylo
ee6858ae3f add linear regression solution 2020-04-05 16:28:00 +02:00
dylodylo
99d9e8ddb5 v2.0 2020-03-30 23:56:52 +02:00
dylodylo
45f8f65f6c commit 2020-03-30 23:04:51 +02:00
dylodylo
a1f496054d commit 2020-03-30 22:59:26 +02:00
dylodylo
744e5db758 naive-bayess solution 2020-03-29 21:22:20 +02:00
dylodylo
2a9ca866c9 naive-bayess solution 2020-03-29 21:03:04 +02:00
dylodylo
8fd7b62eef naive-bayess solution 2020-03-29 20:58:56 +02:00
8 changed files with 16037 additions and 15739 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

1
dict.txt Normal file

File diff suppressed because one or more lines are too long

119
linear_regression.py Normal file
View File

@ -0,0 +1,119 @@
import csv
import re
import random
import json
from math import sqrt
def make_dict(path):
    """Build a {word: random weight} map from the posts in *path* and save it.

    Each tab-separated line's first column is tokenized into runs of word
    characters/apostrophes; every unique token gets a random starting weight
    in [-0.1, 0.1), rounded to two decimals.  The result is JSON-dumped to
    dict.txt in the working directory.
    """
    weights = {}  # renamed: the original shadowed the builtin `dict`
    with open(path) as in_file:
        for line in in_file:
            post = line.split('\t')[0]
            for word in re.findall(r"[\w']+", post):
                if word not in weights:
                    # random.random() % 0.2 - 0.1 yields a value in [-0.1, 0.1)
                    weights[word] = round(random.random() % 0.2 - 0.1, 2)
    with open('dict.txt', 'w') as out_file:
        json.dump(weights, out_file)
def make_posts_list(in_file):
    """Return the first tab-separated column of every line in *in_file*."""
    with open(in_file) as handle:
        return [line.split('\t')[0] for line in handle]
def make_exp_list(exp_file):
    """Parse one integer label per line from *exp_file* into a list."""
    with open(exp_file) as handle:
        return [int(label_line) for label_line in handle]
def train_model(in_path, exp_path):
    """Train per-word weights with SGD on squared error and save them.

    Reads the initial random weights from dict.txt (written by make_dict),
    trains on column 0 of in_path against integer labels in exp_path, and
    dumps the learned weights plus the bias under key "w0" to dict2.txt.

    NOTE(review): indentation was reconstructed from a flattened diff; the
    placement of the epoch-end bookkeeping should be confirmed against the
    original file.
    """
    with open('dict.txt', 'r') as file:
        dict = json.load(file)  # NOTE(review): shadows the builtin `dict`
    posts = make_posts_list(in_path)
    exp = make_exp_list(exp_path)
    w0 = 0.1          # bias term
    lr = 0.00001      # learning rate
    loss_counter = 0  # examples seen in the current epoch
    loss_sum = 0      # accumulated squared error in the current epoch
    last_sum = 10     # previous epoch's mean loss (sentinel start value)
    while loss_counter < 1000:
        loss_cost = 0  # NOTE(review): assigned but never used
        for in_line, exp_line in zip(posts, exp):
            loss_counter+=1
            # random example from the training set (translated)
            #print("new post" + str(random.randint(0,10)))
            post = (in_line.split('\t')[0])
            error_rate = 1  # NOTE(review): assigned but never used
            y = int(exp_line)
            # prediction: bias plus the weight of every token in the post
            y_hat = w0
            for word in re.findall(r"[\w']+", post):
                y_hat += dict[word]
            loss = (y_hat - y)**2
            loss_sum += loss
            # learning (translated): gradient step on the bias and on each
            # token's weight
            delta = (y_hat - y) * lr
            w0 = w0 - delta
            for word in re.findall(r"[\w']+", post):
                dict[word] -= delta
        real_loss = loss_sum/loss_counter
        print(real_loss)
        # stop as soon as the mean epoch loss gets worse; otherwise remember
        # it and reset the counters for another epoch
        if real_loss > last_sum:
            break
        else:
            last_sum = real_loss
            loss_sum = 0
            loss_counter = 0
    dict["w0"] = w0
    with open('dict2.txt', 'w') as file:
        json.dump(dict, file)
def predict(path):
    """Classify each post in path/in.tsv and write 0/1 labels to path/out.tsv.

    Loads the trained weights (with the bias under key "w0") from dict2.txt,
    scores each post as bias + sum of known token weights, and writes "1"
    when the score exceeds 0.5, else "0".
    """
    results = []
    with open('dict2.txt', 'r') as weight_file:
        weights = json.load(weight_file)  # renamed: original shadowed `dict`
    with open(path+"/in.tsv") as in_file:
        for in_line in in_file:
            # (stray debug print of a random number removed)
            post = in_line.split('\t')[0]
            y = weights["w0"]
            for word in re.findall(r"[\w']+", post):
                if word in weights:
                    y += weights[word]
            results.append("1" if y > 0.5 else "0")
    with open(path+"/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for label in results:
            # wrap in a list so csv writes one field per row instead of
            # iterating the string's characters
            tsv_writer.writerow([label])
# Build the initial random-weight dictionary from the training set, then
# train and save the learned weights to dict2.txt.
make_dict("train/in.tsv")
train_model("train/in.tsv", "train/expected.tsv")
def check_dev():
    """Print dev-set accuracy: the fraction of lines in dev-0/out.tsv that
    are byte-identical to the corresponding lines of dev-0/expected.tsv."""
    with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
        total = 0
        hits = 0
        for predicted, expected in zip(out_file, exp_file):
            total += 1
            hits += predicted == expected
        print(hits / total)
# Score the trained model on the dev split and report its accuracy.
predict("dev-0")
check_dev()

87
linearregression.py Normal file
View File

@ -0,0 +1,87 @@
import csv
import re
import random
import json
# Prints ['Hey', 'you', 'what', 'are', 'you', 'doing', 'here']
def make_dict(path):
    """Return {word: random weight in [-0.1, 0.1)} for every unique word in
    the first tab-separated column of *path*."""
    weights = {}
    with open(path) as source:
        for row in source:
            text = row.split('\t')[0]
            for token in re.findall(r"[\w']+", text):
                if token not in weights:
                    weights[token] = round(random.random() % 0.2 - 0.1, 2)
    return weights
def train_model(in_path, exp_path):
    """Greedy per-post weight tuning: nudge every token weight of a post by
    +/- lr while that reduces the absolute prediction error, then dump the
    weights to dict.txt.

    NOTE(review): indentation was reconstructed from a flattened diff —
    confirm block placement (especially the final json.dump) against the
    original file.
    """
    dict = make_dict(in_path)  # NOTE(review): shadows the builtin `dict`
    w0 = 0.1     # bias term
    lr = 0.0001  # step size
    with open(in_path) as in_file, open(exp_path) as exp_file:
        for in_line, exp_line in zip(in_file, exp_file):
            print("new post" + str(random.randint(0,10)))  # debug output
            post = (in_line.split('\t')[0])
            delta = 1
            y=0
            y_plus = 0
            y_minus = 0
            while delta > 0.5:
                # NOTE(review): y, y_plus and y_minus are never reset
                # between while-iterations, so token weights are
                # re-accumulated on every pass — looks unintended; confirm.
                for word in re.findall(r"[\w']+", post):
                    y += dict[word]
                    y_plus += dict[word] + lr
                    y_minus += dict[word] - lr
                # absolute error of the current, +lr and -lr variants
                delta = abs(int(exp_line) - y+w0)
                delta_minus = abs(int(exp_line) - y_minus+w0)
                delta_plus = abs(int(exp_line) - y_plus+w0)
                if delta_minus < delta:
                    delta = delta_minus
                    for word in re.findall(r"[\w']+", post):
                        dict[word] = dict[word] - lr
                elif delta_plus < delta:
                    delta = delta_plus
                    for word in re.findall(r"[\w']+", post):
                        dict[word] = dict[word] + lr
                else:
                    # neither direction improves the error — next post
                    break
    with open('dict.txt', 'w') as file:
        json.dump(dict, file)
def predict(path):
    """Score each post in path/in.tsv against the weights stored in dict.txt
    and write a "1"/"0" label per post to path/out.tsv."""
    with open('dict.txt', 'r') as weight_file:
        weights = json.load(weight_file)
    labels = []
    with open(path+"/in.tsv") as in_file:
        for in_line in in_file:
            print("new post" + str(random.randint(0,10)))
            text = in_line.split('\t')[0]
            score = sum(weights[w] for w in re.findall(r"[\w']+", text) if w in weights)
            labels.append("1" if score > 0.5 else "0")
    with open(path+"/out.tsv", 'wt') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        for label in labels:
            writer.writerow(label)
predict("test-A")
def check_dev():
    """Print the share of dev-0/out.tsv lines that exactly equal the
    corresponding dev-0/expected.tsv lines."""
    with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
        pairs = list(zip(out_file, exp_file))
        matches = sum(1 for got, want in pairs if got == want)
        print(matches / len(pairs))

42
readymadesolution.py Normal file
View File

@ -0,0 +1,42 @@
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import csv
def get_test_posts(path):
    """Return the text field of each line in *path*.

    Every line must be exactly "text<TAB>timestamp"; the timestamp is
    discarded (a malformed line raises ValueError, as before).
    """
    with open(path) as source:
        rows = (line.rstrip('\n').split('\t') for line in source)
        return [text for text, _timestamp in rows]
def get_expected(path):
    """Return one class label per line of *path*, with all spaces and the
    trailing newline removed."""
    with open(path) as source:
        return [line.rstrip('\n').replace(" ", "") for line in source]
# Build a bag-of-words count matrix from the training posts and fit a
# multinomial Naive Bayes classifier on the expected labels.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(get_test_posts("train/in.tsv"))
y = get_expected("train/expected.tsv")
clf = MultinomialNB()
clf.fit(X_train_counts, y)
def predict_posts(path, clf):
    """Vectorize path/in.tsv with the module-level CountVectorizer and write
    clf's predicted labels to path/out.tsv, one per row."""
    features = count_vect.transform(get_test_posts(path+'/in.tsv'))
    predicted = clf.predict(features)
    with open(path+"/out.tsv", 'wt') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        for label in predicted:
            writer.writerow(label)
# Write predictions for both the dev split and the held-out test split.
predict_posts("dev-0", clf)
predict_posts("test-A", clf)

View File

@ -1,33 +1,11 @@
import csv
from collections import defaultdict
import math
import pickle
import os
from pathlib import Path
# Count the documents and the " P"/" S" labels (note: the labels in
# expected.tsv carry a leading space), then print the Laplace-smoothed
# class priors.
counter = 0
docs = []
with open('in.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        docs.append(row)
        counter+=1
print(counter)
pcounter = 0
scounter = 0
with open('expected.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        if row[0] == " P":
            pcounter += 1
        if row[0] == " S":
            scounter += 1
print(pcounter)
print(scounter)
# BUG FIX: the original `scounter+1/counter+2` evaluated as
# scounter + (1/counter) + 2 due to operator precedence; the smoothed
# prior is (count + 1) / (total + 2).
print("P(S) = " + str((scounter+1)/(counter+2)))
print("P(P) = " + str((pcounter+1)/(counter+2)))
def calc_class_logprob(expected_path):
def calc_class_logprob(expected_path): #zliczamy ogólne prawdopodobieństwo dla klasy (P(c))
paranoarmal_class_count = 0
skeptic_class_count = 0
with open(expected_path) as f:
@ -43,29 +21,100 @@ def calc_class_logprob(expected_path):
return math.log(paranormal_class_prob), math.log(skeptic_class_prob)
def calc_word_counts(in_path, expected_path):
with open(in_path), open(expected_path) as in_file, exp_file:
with open(in_path) as in_file, open(expected_path) as exp_file:
word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
for in_line, exp_line in zip(in_file, exp_file):
for line in f:
class_ = exp_line.rstrip('\n').replace(" ", "")
text, timestamp = line.rstrip('\n').split('\t')
tokens = text.lower().split(' ')
for token in tokens:
if class_ == 'P':
word_counts['paranormal'][token] += 1
elif class_ == 'S':
word_counts['skeptic'][token] += 1
class_ = exp_line.rstrip('\n').replace(" ", "")
text, timestamp = in_line.rstrip('\n').split('\t')
tokens = text.lower().split(' ')
for token in tokens:
if class_ == 'P':
word_counts['paranormal'][token] += 1
elif class_ == 'S':
word_counts['skeptic'][token] += 1
return word_counts
def calc_words_logprobs(words_counts):
total_skeptic = sum(word_counts['skeptic'].values()) + len(word_counts['skeptic'].keys()))
total_paranormal = sum(word_counts['paranormal'].values() + len(word_counts['paranormal'].keys()))
def calc_word_logprobs(word_counts):
    """Laplace-smoothed log P(token | class) for both classes.

    *word_counts* maps 'skeptic'/'paranormal' to {token: count}.  Each
    token's probability is (count + 1) divided by (total count + vocabulary
    size) of its class.
    """
    word_logprobs = {'paranormal': {}, 'skeptic': {}}
    for class_ in word_logprobs:
        counts = word_counts[class_]
        denom = sum(counts.values()) + len(counts)
        for token, count in counts.items():
            word_logprobs[class_][token] = math.log((count + 1) / denom)
    return word_logprobs
# Train the naive-Bayes model: class log-priors, per-class token counts,
# then Laplace-smoothed per-token log-probabilities.
paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")
word_counts = calc_word_counts('train/in.tsv','train/expected.tsv')
word_logprobs = calc_word_logprobs(word_counts)
print(word_logprobs['skeptic']["hair."]) #-12.166205308815476 (sanity-check value)
# now (translated): 1. fetch a post, 2. split it into terms, 3. compute each
# term's probability, 4. sum them, 5. compare paranormal vs skeptic
def get_test_posts(path):
    """Collect the text field (first tab-separated column) of each line in
    *path*; the timestamp field is dropped (a line without exactly two
    fields raises ValueError, as before)."""
    with open(path) as source:
        return [text for text, _ts in (ln.rstrip('\n').split('\t') for ln in source)]
# with open('prediction.tsv', 'wt') as tsvfile:
# tsv_writer = csv.writer(tsvfile, delimiter='\t')
# for i in range(counter):
# tsv_writer.writerow('S')
def predict_post_class(posts, sprob, pprob, word_logprobs):
    """Label each post 'P' (paranormal) or 'S' (skeptic) by naive Bayes.

    *sprob*/*pprob* are the class log-priors; *word_logprobs* holds the
    per-class token log-probabilities.  A token unseen in a class
    contributes 0 to that class's score (no smoothing penalty here).
    Ties go to 'S'.
    """
    out_classes = []
    for post in posts:
        s_score = sprob
        p_score = pprob
        for token in post.lower().split(' '):
            # for skeptic (translated)
            s_score += word_logprobs['skeptic'].get(token, 0)
            # for paranormal (translated)
            p_score += word_logprobs['paranormal'].get(token, 0)
        out_classes.append('P' if p_score > s_score else 'S')
    return out_classes
def predict_posts(path):
    """Classify every post in path/in.tsv with the module-level model state
    and write the labels to path/out.tsv, one per row."""
    posts = get_test_posts(path+'/in.tsv')
    labels = predict_post_class(posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs)
    with open(path+"/out.tsv", 'wt') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        for label in labels:
            writer.writerow(label)
predict_posts("dev-0")
predict_posts("test-A")
# Dev-set accuracy.  The expected file's labels appear to carry a leading
# space (cf. the " P"/" S" comparisons earlier in this history), so the
# predicted line is prefixed with " " before comparing — confirm against
# the data format.
with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
    counter = 0
    positive = 0
    for out_line, exp_line in zip(out_file, exp_file):
        counter+=1
        if " "+out_line == exp_line:
            positive += 1
    print(positive/counter)

File diff suppressed because it is too large Load Diff