Compare commits
2 Commits
linear-reg ... master

| Author | SHA1 | Date |
|---|---|---|
| | 2787573a2c | |
| | 9cb993d397 | |

594 dev-0/out.tsv (file diff suppressed because it is too large)
@@ -1,119 +0,0 @@
import csv
import re
import random
import json
from math import sqrt


def make_dict(path):
    # give every word in the training corpus a small random initial weight in [-0.1, 0.1)
    dict = {}
    with open(path) as in_file:
        for line in in_file:
            post = line.split('\t')[0]
            for word in re.findall(r"[\w']+", post):
                if word not in dict:
                    weight = round(random.random() % 0.2 - 0.1, 2)
                    dict[word] = weight
    with open('dict.txt', 'w') as file:
        json.dump(dict, file)


def make_posts_list(in_file):
    posts = []
    with open(in_file) as f:
        for line in f:
            post = line.split('\t')[0]
            posts.append(post)
    return posts


def make_exp_list(exp_file):
    exp_list = []
    with open(exp_file) as f:
        for exp_line in f:
            y = int(exp_line)
            exp_list.append(y)
    return exp_list


def train_model(in_path, exp_path):
    with open('dict.txt', 'r') as file:
        dict = json.load(file)
    posts = make_posts_list(in_path)
    exp = make_exp_list(exp_path)
    w0 = 0.1
    lr = 0.00001
    loss_counter = 0
    loss_sum = 0
    last_sum = 10
    while loss_counter < 1000:
        for in_line, exp_line in zip(posts, exp):
            loss_counter += 1
            # one example from the training set
            post = in_line.split('\t')[0]
            y = int(exp_line)
            y_hat = w0
            for word in re.findall(r"[\w']+", post):
                y_hat += dict[word]
            loss = (y_hat - y) ** 2
            loss_sum += loss

            # learning: one gradient step on the bias and on every word in the post
            delta = (y_hat - y) * lr
            w0 = w0 - delta
            for word in re.findall(r"[\w']+", post):
                dict[word] -= delta

        real_loss = loss_sum / loss_counter
        print(real_loss)

        # stop once the average epoch loss goes up again
        if real_loss > last_sum:
            break
        else:
            last_sum = real_loss
            loss_sum = 0
            loss_counter = 0
    dict["w0"] = w0
    with open('dict2.txt', 'w') as file:
        json.dump(dict, file)


def predict(path):
    results = []
    with open('dict2.txt', 'r') as file:
        dict = json.load(file)

    with open(path + "/in.tsv") as in_file:
        for in_line in in_file:
            post = in_line.split('\t')[0]
            y = dict["w0"]
            for word in re.findall(r"[\w']+", post):
                if word in dict:
                    y += dict[word]
            # threshold the regression output to get a binary label
            if y > 0.5:
                results.append("1")
            else:
                results.append("0")

    with open(path + "/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in results:
            tsv_writer.writerow([i])


make_dict("train/in.tsv")
train_model("train/in.tsv", "train/expected.tsv")


def check_dev():
    with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
        counter = 0
        positive = 0
        for out_line, exp_line in zip(out_file, exp_file):
            counter += 1
            if out_line == exp_line:
                positive += 1
        print(positive / counter)


predict("dev-0")
check_dev()
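For reference, the deleted script above fits a bag-of-words linear regression by per-example stochastic gradient descent. A minimal numpy sketch of the same update step, assuming a dense 0/1 indicator vector x over the vocabulary (hypothetical names, not part of the commit):

import numpy as np

def sgd_step(w, w0, x, y, lr=1e-5):
    # y_hat = w0 + w . x with squared-error loss; the factor of 2 in the
    # gradient is folded into lr, matching the script's (y_hat - y) * lr
    y_hat = w0 + w @ x
    grad = y_hat - y
    return w - lr * grad * x, w0 - lr * grad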
@@ -1,87 +0,0 @@
import csv
import re
import random
import json


# re.findall(r"[\w']+", ...) tokenizes a post, e.g. into
# ['Hey', 'you', 'what', 'are', 'you', 'doing', 'here']
def make_dict(path):
    dict = {}
    with open(path) as in_file:
        for line in in_file:
            post = line.split('\t')[0]
            for word in re.findall(r"[\w']+", post):
                if word not in dict:
                    weight = round(random.random() % 0.2 - 0.1, 2)
                    dict[word] = weight

    return dict


def train_model(in_path, exp_path):
    dict = make_dict(in_path)
    w0 = 0.1
    lr = 0.0001
    with open(in_path) as in_file, open(exp_path) as exp_file:
        for in_line, exp_line in zip(in_file, exp_file):
            post = in_line.split('\t')[0]
            delta = 1
            while delta > 0.5:
                # recompute the prediction and its +/- lr variants from scratch each pass
                y = 0
                y_plus = 0
                y_minus = 0
                for word in re.findall(r"[\w']+", post):
                    y += dict[word]
                    y_plus += dict[word] + lr
                    y_minus += dict[word] - lr
                delta = abs(int(exp_line) - (y + w0))
                delta_minus = abs(int(exp_line) - (y_minus + w0))
                delta_plus = abs(int(exp_line) - (y_plus + w0))
                # shift every word weight one step in whichever direction shrinks the error
                if delta_minus < delta:
                    delta = delta_minus
                    for word in re.findall(r"[\w']+", post):
                        dict[word] = dict[word] - lr
                elif delta_plus < delta:
                    delta = delta_plus
                    for word in re.findall(r"[\w']+", post):
                        dict[word] = dict[word] + lr
                else:
                    break
    with open('dict.txt', 'w') as file:
        json.dump(dict, file)


def predict(path):
    results = []
    with open('dict.txt', 'r') as file:
        dict = json.load(file)

    with open(path + "/in.tsv") as in_file:
        for in_line in in_file:
            post = in_line.split('\t')[0]
            y = 0
            for word in re.findall(r"[\w']+", post):
                if word in dict:
                    y += dict[word]
            if y > 0.5:
                results.append("1")
            else:
                results.append("0")

    with open(path + "/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in results:
            tsv_writer.writerow([i])


predict("test-A")


def check_dev():
    with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
        counter = 0
        positive = 0
        for out_line, exp_line in zip(out_file, exp_file):
            counter += 1
            if out_line == exp_line:
                positive += 1
        print(positive / counter)
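Unlike the first script, this deleted version uses no gradient: it tries shifting every word weight by plus or minus lr and keeps whichever direction shrinks the absolute error. A one-weight sketch of that trial-and-error rule (hypothetical helper, not part of the commit):

def nudge(w, x, y, lr=1e-4):
    # compare the current absolute error against the error after a step each way
    base = abs(y - w * x)
    if abs(y - (w - lr) * x) < base:
        return w - lr
    if abs(y - (w + lr) * x) < base:
        return w + lr
    return w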
@@ -1,42 +0,0 @@
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer  # imported but not used below
import csv


def get_test_posts(path):
    posts = []
    with open(path) as f:
        for line in f:
            text, timestamp = line.rstrip('\n').split('\t')
            posts.append(text)
    return posts


def get_expected(path):
    expected = []
    with open(path) as f:
        for line in f:
            class_ = line.rstrip('\n').replace(" ", "")
            expected.append(class_)
    return expected


count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(get_test_posts("train/in.tsv"))

y = get_expected("train/expected.tsv")

clf = MultinomialNB()
clf.fit(X_train_counts, y)


def predict_posts(path, clf):
    # reuse the fitted vectorizer so test features line up with the training vocabulary
    X = count_vect.transform(get_test_posts(path + '/in.tsv'))
    classes = clf.predict(X)
    with open(path + "/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in classes:
            tsv_writer.writerow([i])


predict_posts("dev-0", clf)
predict_posts("test-A", clf)
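The script above wires CountVectorizer and MultinomialNB together by hand. The same two steps can be packaged as a scikit-learn Pipeline, so the vectorizer and classifier are fitted and applied together; a sketch assuming the helper functions above:

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# equivalent to the fit_transform/transform pair above, with less bookkeeping
model = Pipeline([('vect', CountVectorizer()), ('nb', MultinomialNB())])
model.fit(get_test_posts("train/in.tsv"), get_expected("train/expected.tsv"))
classes = model.predict(get_test_posts("dev-0/in.tsv"))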
135 solution.py
@@ -1,11 +1,33 @@
import csv
from collections import defaultdict
import math
import pickle
import os
from pathlib import Path


def calc_class_logprob(expected_path):  # count the overall probability for each class, P(c)
    counter = 0
    docs = []
    with open('in.tsv') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            docs.append(row)
            counter += 1

    print(counter)
    pcounter = 0
    scounter = 0
    with open('expected.tsv') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            # labels in expected.tsv carry a leading space
            if row[0] == " P":
                pcounter += 1
            if row[0] == " S":
                scounter += 1

    print(pcounter)
    print(scounter)

    # add-one smoothed class priors; the parentheses are essential here
    print("P(S) = " + str((scounter + 1) / (counter + 2)))
    print("P(P) = " + str((pcounter + 1) / (counter + 2)))


def calc_class_logprob(expected_path):  # redefinition: this version shadows the one above
    paranormal_class_count = 0
    skeptic_class_count = 0
    with open(expected_path) as f:
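The corrected prints above use add-one (Laplace) smoothing for the class priors, P(c) = (count_c + 1) / (N + 2), which keeps a class that never appears in the sample from getting probability exactly 0. A quick worked example with made-up counts:

# hypothetical counts, for illustration only
scounter, counter = 2000, 5000
print((scounter + 1) / (counter + 2))   # 2001/5002 ~ 0.40004, close to 0.4 but never 0 or 1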
@@ -21,100 +43,29 @@ def calc_class_logprob(expected_path):  # count the overall class probability
    return math.log(paranormal_class_prob), math.log(skeptic_class_prob)


def calc_word_counts(in_path, expected_path):
    with open(in_path) as in_file, open(expected_path) as exp_file:
        word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
        for in_line, exp_line in zip(in_file, exp_file):
            class_ = exp_line.rstrip('\n').replace(" ", "")
            text, timestamp = in_line.rstrip('\n').split('\t')
            tokens = text.lower().split(' ')
            for token in tokens:
                if class_ == 'P':
                    word_counts['paranormal'][token] += 1
                elif class_ == 'S':
                    word_counts['skeptic'][token] += 1

    return word_counts


def calc_word_logprobs(word_counts):
    # add-one smoothing: the denominator is the class token count plus its vocabulary size
    total_skeptic = sum(word_counts['skeptic'].values()) + len(word_counts['skeptic'].keys())
    total_paranormal = sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'].keys())
    word_logprobs = {'paranormal': {}, 'skeptic': {}}
    for class_ in word_logprobs.keys():
        for token, value in word_counts[class_].items():
            if class_ == 'skeptic':
                word_prob = (value + 1) / total_skeptic
            else:
                word_prob = (value + 1) / total_paranormal
            word_logprobs[class_][token] = math.log(word_prob)
    return word_logprobs


paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")

word_counts = calc_word_counts('train/in.tsv', 'train/expected.tsv')

word_logprobs = calc_word_logprobs(word_counts)

print(word_logprobs['skeptic']["hair."])  # -12.166205308815476

# now: 1. take a post, 2. split it into terms, 3. look up each term's probability,
# 4. add the log-probabilities together, 5. compare paranormal against skeptic


def get_test_posts(path):
    posts = []
    with open(path) as f:
        for line in f:
            text, timestamp = line.rstrip('\n').split('\t')
            posts.append(text)
    return posts


def predict_post_class(posts, sprob, pprob, word_logprobs):
    out_classes = []
    for post in posts:
        total_s_prob = sprob
        total_p_prob = pprob
        tokens = post.lower().split(' ')
        for token in tokens:
            # for skeptic
            if token in word_logprobs['skeptic']:
                sceptic_prob = word_logprobs['skeptic'][token]
            else:
                sceptic_prob = 0
            # for paranormal
            if token in word_logprobs['paranormal']:
                paranormal_prob = word_logprobs['paranormal'][token]
            else:
                paranormal_prob = 0
            total_s_prob += sceptic_prob
            total_p_prob += paranormal_prob

        if total_p_prob > total_s_prob:
            out_classes.append('P')
        else:
            out_classes.append('S')

    return out_classes


def predict_posts(path):
    posts = get_test_posts(path + '/in.tsv')
    classes = predict_post_class(posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs)
    with open(path + "/out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in classes:
            tsv_writer.writerow([i])


predict_posts("dev-0")
predict_posts("test-A")

with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
    counter = 0
    positive = 0
    for out_line, exp_line in zip(out_file, exp_file):
        counter += 1
        # expected labels carry a leading space, out.tsv lines do not
        if " " + out_line == exp_line:
            positive += 1
    print(positive / counter)
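Note that predict_post_class scores unseen tokens with 0, i.e. treats them as having probability 1, which quietly favors the class with the smaller vocabulary. The usual alternative is to give an unseen token the smoothed floor 1/total for that class; a sketch assuming the totals computed in calc_word_logprobs (hypothetical helper, not part of the commit):

import math

def token_logprob(token, class_, word_logprobs, total):
    # known token: its smoothed log-probability; unknown token: log of the add-one floor
    if token in word_logprobs[class_]:
        return word_logprobs[class_][token]
    return math.log(1 / total)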
@@ -1,17 +1,22 @@
import csv
import re


def makeoutput(infile, outfile):
    counter = 0
    output = []
    regex = r'paranormal|ufo|youtube|spirit'
    with open(infile) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            counter += 1
            # label a post 1 if any keyword occurs anywhere in the row
            if re.search(regex, str(row).lower()):
                output.append('1')
            else:
                output.append('0')

    with open(outfile, 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for i in output:
            tsv_writer.writerow([i])


makeoutput("test-A/in.tsv", "test-A/out.tsv")
makeoutput("train/in.tsv", "train/out.tsv")
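Because makeoutput lowercases the stringified row before matching, the keyword rule is case-insensitive and also scans the timestamp column. A quick sanity check with a made-up row:

import re

row = ["Saw a UFO over the lake last night", "1349574825"]   # hypothetical example
print(bool(re.search(r'paranormal|ufo|youtube|spirit', str(row).lower())))   # True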
650 test-A/out.tsv (file diff suppressed because it is too large)
579158 train/expected.tsv (file diff suppressed because it is too large)
579082 train/out.tsv (file diff suppressed because it is too large)