Compare commits

..

2 Commits

Author SHA1 Message Date
Artur Dylewski
2787573a2c poprawki 2020-06-08 15:25:08 +02:00
dylodylo
9cb993d397 test commit from Windows 2020-04-06 15:34:10 +02:00
10 changed files with 579795 additions and 580088 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -1,119 +0,0 @@
import csv
import re
import random
import json
from math import sqrt
def make_dict(path):
    """Build an initial word-weight dictionary from a TSV file and dump it to 'dict.txt'.

    Each line's first tab-separated field is treated as the post text; every
    distinct word token gets a small random starting weight.

    Args:
        path: path to a TSV file whose first column is the post text.
    """
    weights = {}  # renamed from `dict` to avoid shadowing the builtin
    with open(path) as in_file:
        for line in in_file:
            post = line.split('\t')[0]
            for word in re.findall(r"[\w']+", post):
                if word not in weights:  # idiomatic `not in` (was `not word in`)
                    # random.random() % 0.2 - 0.1 yields a value in [-0.1, 0.1)
                    weights[word] = round(random.random() % 0.2 - 0.1, 2)
    with open('dict.txt', 'w') as file:
        json.dump(weights, file)
def make_posts_list(in_file):
    """Read a TSV file and return the first column of every line as a list."""
    with open(in_file) as handle:
        return [row.split('\t')[0] for row in handle]
def make_exp_list(exp_file):
    """Read one integer label per line from *exp_file* and return them as a list."""
    with open(exp_file) as handle:
        return [int(label_line) for label_line in handle]
def train_model(in_path, exp_path):
    """Train a bag-of-words linear scorer by gradient steps and dump weights to 'dict2.txt'.

    Reads initial word weights from 'dict.txt' (written by make_dict), then for
    each (post, label) pair updates the bias w0 and every word weight by the
    scaled prediction error.  Stops once the mean squared loss over a pass
    stops improving.

    NOTE(review): original indentation was lost in this diff view; the nesting
    below is a reconstruction — verify against the repository file.
    """
    with open('dict.txt', 'r') as file:
        dict = json.load(file)  # NOTE(review): shadows the builtin `dict`
    posts = make_posts_list(in_path)
    exp = make_exp_list(exp_path)
    w0 = 0.1           # bias term
    lr = 0.00001       # learning rate
    loss_counter = 0   # examples seen since last reset
    loss_sum = 0       # accumulated squared error since last reset
    last_sum = 10      # previous pass's mean loss (sentinel start value)
    while loss_counter < 1000:
        loss_cost = 0  # NOTE(review): assigned but never used
        for in_line, exp_line in zip(posts, exp):
            loss_counter+=1
            # random example from the training set
            #print("new post" + str(random.randint(0,10)))
            post = (in_line.split('\t')[0])
            error_rate = 1  # NOTE(review): assigned but never used
            y = int(exp_line)
            y_hat = w0
            for word in re.findall(r"[\w']+", post):
                y_hat += dict[word]  # KeyError for words absent from dict.txt
            loss = (y_hat - y)**2
            loss_sum += loss
            # learning step: gradient of squared error, scaled by lr
            delta = (y_hat - y) * lr
            w0 = w0 - delta
            for word in re.findall(r"[\w']+", post):
                dict[word] -= delta
        real_loss = loss_sum/loss_counter
        print(real_loss)
        if real_loss > last_sum:
            break  # loss got worse — stop training
        else:
            last_sum = real_loss
            loss_sum = 0
            loss_counter = 0
    dict["w0"] = w0  # store the bias alongside the word weights
    with open('dict2.txt', 'w') as file:
        json.dump(dict, file)
def predict(path):
    """Score posts in <path>/in.tsv with weights from 'dict2.txt' and write 0/1 labels to <path>/out.tsv.

    NOTE(review): original indentation was lost in this diff view; nesting is
    reconstructed.
    """
    results = []
    with open('dict2.txt', 'r') as file:
        dict = json.load(file)  # NOTE(review): shadows the builtin `dict`
    with open(path+"/in.tsv") as in_file:
        for in_line in in_file:
            print("new post" + str(random.randint(0,10)))  # progress/debug print
            post = (in_line.split('\t')[0])
            y=dict["w0"]  # start from the stored bias
            for word in re.findall(r"[\w']+", post):
                if word in dict:  # unseen words contribute nothing
                    y += dict[word]
            # threshold the linear score at 0.5
            if y > 0.5:
                results.append("1")
            else:
                results.append("0")
    with open(path+"/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in results:
            tsv_writer.writerow(i)  # writerow of a 1-char string writes that char as the row
# Build the initial vocabulary, then train on the training split (runs at import).
make_dict("train/in.tsv")
train_model("train/in.tsv", "train/expected.tsv")
def check_dev():
    """Print prediction accuracy on dev-0 by comparing out.tsv with expected.tsv line-by-line."""
    with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
        counter = 0
        positive = 0
        for out_line, exp_line in zip(out_file, exp_file):
            counter+=1
            if out_line == exp_line:  # lines compared with trailing newlines intact
                positive += 1
        print(positive/counter)
# Predict on dev-0, then report accuracy (runs at import).
predict("dev-0")
check_dev()

View File

@ -1,87 +0,0 @@
import csv
import re
import random
import json
# Prints ['Hey', 'you', 'what', 'are', 'you', 'doing', 'here']
def make_dict(path):
    """Build and return a word -> random-initial-weight dictionary from a TSV file.

    Args:
        path: TSV file whose first tab-separated column is the post text.

    Returns:
        dict mapping each distinct word token to a random weight in [-0.1, 0.1].
    """
    weights = {}  # renamed from `dict` to avoid shadowing the builtin
    with open(path) as in_file:
        for line in in_file:
            post = line.split('\t')[0]
            for word in re.findall(r"[\w']+", post):
                if word not in weights:  # idiomatic `not in` (was `not word in`)
                    # random.random() % 0.2 - 0.1 yields a value in [-0.1, 0.1)
                    weights[word] = round(random.random() % 0.2 - 0.1, 2)
    return weights
def train_model(in_path, exp_path):
    """Nudge word weights per post until the absolute error drops below 0.5, then dump to 'dict.txt'.

    For each (post, label) pair, repeatedly tries stepping every word weight by
    +/- lr and keeps whichever direction reduces the absolute error.

    NOTE(review): original indentation was lost in this diff view; nesting is
    reconstructed.  Also note y / y_plus / y_minus are never reset inside the
    while loop, so scores accumulate across iterations — looks like a bug in
    the original; left untouched here.
    """
    dict = make_dict(in_path)  # NOTE(review): shadows the builtin `dict`
    w0 = 0.1     # bias term
    lr = 0.0001  # step size
    with open(in_path) as in_file, open(exp_path) as exp_file:
        for in_line, exp_line in zip(in_file, exp_file):
            print("new post" + str(random.randint(0,10)))  # progress/debug print
            post = (in_line.split('\t')[0])
            delta = 1
            y=0
            y_plus = 0
            y_minus = 0
            while delta > 0.5:
                for word in re.findall(r"[\w']+", post):
                    y += dict[word]
                    y_plus += dict[word] + lr
                    y_minus += dict[word] - lr
                # absolute errors for the current, minus-step and plus-step scores
                delta = abs(int(exp_line) - y+w0)
                delta_minus = abs(int(exp_line) - y_minus+w0)
                delta_plus = abs(int(exp_line) - y_plus+w0)
                if delta_minus < delta:
                    delta = delta_minus
                    for word in re.findall(r"[\w']+", post):
                        dict[word] = dict[word] - lr
                elif delta_plus < delta:
                    delta = delta_plus
                    for word in re.findall(r"[\w']+", post):
                        dict[word] = dict[word] + lr
                else:
                    break  # neither direction improves — move on
    with open('dict.txt', 'w') as file:
        json.dump(dict, file)
def predict(path):
    """Score posts in <path>/in.tsv with weights from 'dict.txt' and write 0/1 labels to <path>/out.tsv.

    NOTE(review): original indentation was lost in this diff view; nesting is
    reconstructed.
    """
    results = []
    with open('dict.txt', 'r') as file:
        dict = json.load(file)  # NOTE(review): shadows the builtin `dict`
    with open(path+"/in.tsv") as in_file:
        for in_line in in_file:
            print("new post" + str(random.randint(0,10)))  # progress/debug print
            post = (in_line.split('\t')[0])
            y=0  # no bias term in this version
            for word in re.findall(r"[\w']+", post):
                if word in dict:  # unseen words contribute nothing
                    y += dict[word]
            # threshold the linear score at 0.5
            if y > 0.5:
                results.append("1")
            else:
                results.append("0")
    with open(path+"/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in results:
            tsv_writer.writerow(i)  # writerow of a 1-char string writes that char as the row
predict("test-A")
def check_dev():
    """Compare dev-0 predictions with the expected labels and print the accuracy."""
    total = 0
    hits = 0
    with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
        for predicted_line, expected_line in zip(out_file, exp_file):
            total += 1
            if predicted_line == expected_line:
                hits += 1
    print(hits/total)

View File

@ -1,42 +0,0 @@
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import csv
def get_test_posts(path):
    """Load post texts from *path*; every line must be 'text<TAB>timestamp'."""
    collected = []
    with open(path) as source:
        for raw_line in source:
            body, stamp = raw_line.rstrip('\n').split('\t')  # raises on malformed lines, as before
            collected.append(body)
    return collected
def get_expected(path):
    """Return the class labels from *path*, one per line, with all spaces removed."""
    with open(path) as handle:
        return [entry.rstrip('\n').replace(" ", "") for entry in handle]
# Fit the bag-of-words Naive Bayes pipeline on the training split (runs at import).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(get_test_posts("train/in.tsv"))
y = get_expected("train/expected.tsv")
clf = MultinomialNB()
clf.fit(X_train_counts, y)
def predict_posts(path, clf):
    """Vectorize posts from <path>/in.tsv with the module-level count_vect, classify with *clf*, write <path>/out.tsv."""
    features = count_vect.transform(get_test_posts(path+'/in.tsv'))
    predicted = clf.predict(features)
    with open(path+"/out.tsv", 'wt') as out_handle:
        writer = csv.writer(out_handle, delimiter='\t')
        for label in predicted:
            writer.writerow(label)
# Write predictions for both evaluation splits (runs at import).
predict_posts("dev-0", clf)
predict_posts("test-A", clf)

View File

@ -1,11 +1,33 @@
import csv
from collections import defaultdict
import math
import pickle
import os
from pathlib import Path
def calc_class_logprob(expected_path):
    """Count documents and per-class labels, then print the smoothed class priors P(S) and P(P).

    NOTE(review): reads hard-coded 'in.tsv' / 'expected.tsv' from the CWD and
    ignores *expected_path* — presumably a leftover; confirm against callers.
    """
    counter = 0
    docs = []
    with open('in.tsv') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            docs.append(row)
            counter+=1
    print(counter)
    pcounter = 0
    scounter = 0
    with open('expected.tsv') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            # labels in expected.tsv carry a leading space (" P" / " S")
            if row[0] == " P":
                pcounter += 1
            if row[0] == " S":
                scounter += 1
    print(pcounter)
    print(scounter)
    # BUG FIX: original printed `scounter + 1/counter + 2` because of missing
    # parentheses; the Laplace-smoothed prior is (count + 1) / (total + 2).
    print("P(S) = " + str((scounter + 1) / (counter + 2)))
    print("P(P) = " + str((pcounter + 1) / (counter + 2)))
def calc_class_logprob(expected_path):
paranoarmal_class_count = 0
skeptic_class_count = 0
with open(expected_path) as f:
@ -21,100 +43,29 @@ def calc_class_logprob(expected_path): #zliczamy ogólne prawdopodobieństwo dla
return math.log(paranormal_class_prob), math.log(skeptic_class_prob)
def calc_word_counts(in_path, expected_path):
    """Tally token occurrences per class ('paranormal' vs 'skeptic') from paired input/label files.

    NOTE(review): this body interleaves TWO revisions of the function as
    captured by a diff view — a second `with` header and a second copy of the
    counting loop (iterating an undefined `f`) follow the working version.
    Only the first loop can run; the trailing `for line in f:` raises
    NameError.  Restore a single revision from version control.
    """
    with open(in_path) as in_file, open(expected_path) as exp_file:
        # NOTE(review): duplicate `with` header from the other diff revision —
        # it rebinds in_file and re-enters exp_file as a context manager.
        with open(in_path), open(expected_path) as in_file, exp_file:
            word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
            for in_line, exp_line in zip(in_file, exp_file):
                class_ = exp_line.rstrip('\n').replace(" ", "")  # strip the leading-space label padding
                text, timestamp = in_line.rstrip('\n').split('\t')
                tokens = text.lower().split(' ')
                for token in tokens:
                    if class_ == 'P':
                        word_counts['paranormal'][token] += 1
                    elif class_ == 'S':
                        word_counts['skeptic'][token] += 1
            # NOTE(review): second copy of the loop from the other revision;
            # `f` is undefined in this scope.
            for line in f:
                class_ = exp_line.rstrip('\n').replace(" ", "")
                text, timestamp = line.rstrip('\n').split('\t')
                tokens = text.lower().split(' ')
                for token in tokens:
                    if class_ == 'P':
                        word_counts['paranormal'][token] += 1
                    elif class_ == 'S':
                        word_counts['skeptic'][token] += 1
    return word_counts
def calc_word_logprobs(word_counts):
    """Return Laplace-smoothed per-class log-probabilities for every counted token.

    The denominator for each class is its total token count plus its vocabulary
    size; each token's probability is (count + 1) / denominator.
    """
    denominators = {
        'skeptic': sum(word_counts['skeptic'].values()) + len(word_counts['skeptic'].keys()),
        'paranormal': sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'].keys()),
    }
    word_logprobs = {'paranormal': {}, 'skeptic': {}}
    for class_ in word_logprobs:
        denom = denominators[class_]
        for token, count in word_counts[class_].items():
            word_logprobs[class_][token] = math.log((count + 1) / denom)
    return word_logprobs
# Fit the hand-rolled Naive Bayes model on the training split (runs at import).
paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")
word_counts = calc_word_counts('train/in.tsv','train/expected.tsv')
word_logprobs = calc_word_logprobs(word_counts)
print(word_logprobs['skeptic']["hair."]) #-12.166205308815476
# Next steps: 1. fetch a post 2. split it into terms 3. compute each term's
# probability 4. sum them 5. compare paranormal vs. skeptic.
def get_test_posts(path):
    """Return the text column of *path*, where each line is 'text<TAB>timestamp'."""
    texts = []
    with open(path) as fh:
        for record in fh:
            content, _timestamp = record.rstrip('\n').split('\t')  # still raises on malformed lines
            texts.append(content)
    return texts
def calc_words_logprobs(words_counts):
    """Compute smoothed per-class totals (apparent stub superseded by calc_word_logprobs).

    BUG FIX: the original had unbalanced/misplaced parentheses on both lines
    (a stray ')' on the skeptic line; the len() call folded inside values() on
    the paranormal line), making the file a SyntaxError as written.
    NOTE(review): the body now reads its *words_counts* parameter instead of
    the module-level word_counts the broken original referenced; it returns
    the two totals so the computation is observable — previously returned None.
    """
    total_skeptic = sum(words_counts['skeptic'].values()) + len(words_counts['skeptic'].keys())
    total_paranormal = sum(words_counts['paranormal'].values()) + len(words_counts['paranormal'].keys())
    return total_skeptic, total_paranormal
def predict_post_class(posts, sprob, pprob, word_logprobs):
    """Classify each post as 'P' or 'S' by summing a class log-prior with per-token log-probs.

    Args:
        posts: list of post texts.
        sprob: skeptic class log-prior.
        pprob: paranormal class log-prior.
        word_logprobs: {'skeptic': {...}, 'paranormal': {...}} token log-probs.

    Returns:
        list of 'P'/'S' labels, one per post.

    NOTE(review): indentation reconstructed from a diff view.  The
    prediction.tsv block below references an undefined `counter` and looks
    like residue of an older revision — it would raise NameError at runtime.
    """
    out_classes = []
    with open('prediction.tsv', 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in range(counter):  # NOTE(review): `counter` is not defined in this scope
            tsv_writer.writerow('S')
    for post in posts:
        total_s_prob = sprob
        total_p_prob = pprob
        tokens = post.lower().split(' ')
        for token in tokens:
            # for skeptic
            if (token in word_logprobs['skeptic'].keys()):
                sceptic_prob = word_logprobs['skeptic'][token]
            else:
                sceptic_prob = 0  # unseen token contributes nothing (no smoothing here)
            # for paranormal
            if (token in word_logprobs['paranormal'].keys()):
                paranormal_prob = word_logprobs['paranormal'][token]
            else:
                paranormal_prob = 0
            total_s_prob += sceptic_prob
            total_p_prob += paranormal_prob
        #print(total_p_prob)
        #print(total_s_prob)
        if total_p_prob > total_s_prob:
            out_classes.append('P')
        else:
            out_classes.append('S')
    return out_classes
def predict_posts(path):
    """Predict classes for <path>/in.tsv with the module-level model and write <path>/out.tsv."""
    posts = get_test_posts(path+'/in.tsv')
    # NOTE(review): passes skeptic logprob as `sprob` and paranormal as `pprob`,
    # matching predict_post_class's parameter order.
    classes = predict_post_class(posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs)
    with open(path+"/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in classes:
            tsv_writer.writerow(i)  # writerow of a 1-char string writes that char as the row
# Write predictions for both evaluation splits (runs at import).
predict_posts("dev-0")
predict_posts("test-A")
# Report dev-0 accuracy: compare each predicted line against the expected line.
# Expected labels carry a leading space, hence the " "+out_line prefix.
with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
    counter = 0
    positive = 0
    for out_line, exp_line in zip(out_file, exp_file):
        counter+=1
        if " "+out_line == exp_line:
            positive += 1
    print(positive/counter)

View File

@ -1,17 +1,22 @@
import csv
import re
def makeoutput(infile, outfile):
    """Label each row of *infile* '1' if it matches the paranormal keyword regex, else '0', and write *outfile*.

    NOTE(review): indentation reconstructed from a diff view.  The
    `range(counter)` loop writing 'S' rows alongside the `for i in output`
    loop appears to be residue of an older revision shown by the diff; as
    written, out.tsv would contain `counter` 'S' rows before the 0/1 labels.
    """
    counter = 0
    output = []
    regex = r'paranormal|ufo|youtube|spirit'
    with open(infile) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            counter+=1
            # str(row) stringifies the parsed row list before the regex search
            if re.search(regex, str(row).lower()):
                output.append('1')
            else:
                output.append('0')
    with open(outfile, 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in range(counter):  # NOTE(review): older-revision residue (see docstring)
            tsv_writer.writerow('S')
        for i in output:
            tsv_writer.writerow(i)
# Generate outputs for two splits (runs at import).
makeoutput("test-A/in.tsv", "test-A/out.tsv")
makeoutput("train/in.tsv", "train/out.tsv")

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

579082
train/out.tsv

File diff suppressed because it is too large Load Diff