Updated baseline
This commit is contained in:
commit
eb6ba923a4
13
README.md
Normal file
13
README.md
Normal file
@ -0,0 +1,13 @@
|
||||
Skeptic vs paranormal subreddits
|
||||
================================
|
||||
|
||||
Classify a Reddit post as coming from either the Skeptic subreddit or one of the
|
||||
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
|
||||
Glitch-in-the-Matrix, conspiracytheories).
|
||||
|
||||
The output label is either `S` (Skeptic) or `P` (paranormal).
|
||||
|
||||
Sources
|
||||
-------
|
||||
|
||||
Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
|
1
config.txt
Normal file
1
config.txt
Normal file
@ -0,0 +1 @@
|
||||
--metric Accuracy --precision 4 --in-header in-header.tsv --out-header out-header.tsv
|
5272
dev-0/expected.tsv
Normal file
5272
dev-0/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
dev-0/in.tsv.xz
Normal file
BIN
dev-0/in.tsv.xz
Normal file
Binary file not shown.
1
in-header.tsv
Normal file
1
in-header.tsv
Normal file
@ -0,0 +1 @@
|
||||
PostText Timestamp
|
|
BIN
naive_base_model.pkl
Normal file
BIN
naive_base_model.pkl
Normal file
Binary file not shown.
1
out-header.tsv
Normal file
1
out-header.tsv
Normal file
@ -0,0 +1 @@
|
||||
Label
|
|
48
predict.py
Executable file
48
predict.py
Executable file
@ -0,0 +1,48 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import pickle
|
||||
import math
|
||||
|
||||
def clear_tokens(tokens):
    """Return *tokens* with every newline character replaced by a space."""
    return tokens.replace('\n', ' ')
|
||||
|
||||
def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
    """Return the name of the most likely class for one input line.

    The score of a class is its log prior plus the sum of the
    log-probabilities of the post's tokens under that class, i.e. the
    log of ``P(c) * prod P(w|c)``. Log-probabilities are added (not
    multiplied) and the class with the highest score wins.

    Args:
        post: One line of the input file: the post text, a tab, and a
            timestamp. The timestamp is not used for scoring.
        paranormal_class_logprob: Log prior of the 'paranormal' class.
        sceptic_class_logprob: Log prior of the 'sceptic' class.
        word_logprobs: Mapping ``class name -> {token: log P(token|class)}``.

    Returns:
        The winning key of *word_logprobs* ('sceptic' or 'paranormal' for
        the shipped model). On a tie the class iterated first wins.
        Returns None when *word_logprobs* has no classes.
    """
    # Everything before the last tab is the post text. Using rsplit keeps a
    # line with stray tabs (or no timestamp at all) from raising on unpack.
    text = post.rstrip('\n').rsplit('\t', 1)[0]
    # Tokenisation must mirror training: clean, lowercase, split on spaces.
    tokens = clear_tokens(text).lower().split(' ')

    class_logpriors = {
        'sceptic': sceptic_class_logprob,
        'paranormal': paranormal_class_logprob,
    }

    best_class = None
    best_score = -math.inf
    for class_, token_logprobs in word_logprobs.items():
        # A token missing from this class's table contributes nothing to the
        # score (same as the original KeyError/pass handling).
        # NOTE(review): that is only unbiased if the model stores an entry
        # for every vocabulary word in every class - confirm against the
        # training script.
        score = class_logpriors.get(class_, 0.0) + sum(
            token_logprobs.get(token, 0.0) for token in tokens
        )
        if best_class is None or score > best_score:
            best_class, best_score = class_, score

    return best_class
|
||||
|
||||
|
||||
def main():
    """Label every post in test-A/in.tsv and write the result to test-A/out.tsv.

    The model file is a pickled list: [paranormal log prior, sceptic log
    prior, per-class token log-probabilities]. One output line is written
    per post classified as 'sceptic' or 'paranormal'; any other result
    writes nothing.
    """
    # NOTE: pickle.load can execute arbitrary code - only load a model file
    # that was produced locally by the training script.
    with open('naive_base_model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    paranormal_prior = model[0]
    sceptic_prior = model[1]
    token_logprobs = model[2]

    # Output lines keep the leading space used by this data set's label files.
    output_lines = {'sceptic': " S\n", 'paranormal': ' P\n'}

    with open('test-A/in.tsv') as posts, open('test-A/out.tsv', 'w') as predictions:
        for post in posts:
            winner = calc_post_prob(post, paranormal_prior, sceptic_prior, token_logprobs)
            line = output_lines.get(winner)
            if line is not None:
                predictions.write(line)
|
||||
# Script entry point. The call is unconditional at module level, so the
# prediction run also starts if this file is imported.
main()
|
5152
test-A/in.tsv
Normal file
5152
test-A/in.tsv
Normal file
File diff suppressed because one or more lines are too long
5152
test-A/out.tsv
Normal file
5152
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
72
train.py
Executable file
72
train.py
Executable file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/python3
|
||||
from collections import defaultdict
|
||||
import math
|
||||
import pickle
|
||||
|
||||
# in expected.tsv
|
||||
def calc_class_logprob(expected_path):
    """Compute the log prior of each class from a label file.

    Every line of *expected_path* is expected to hold one label, ``P``
    (paranormal) or ``S`` (sceptic). Whitespace around the label is
    ignored; lines that are not exactly ``P`` or ``S`` after that are
    skipped.

    Args:
        expected_path: Path of the label file.

    Returns:
        A tuple ``(paranormal_logprob, sceptic_logprob)``. A class that never
        occurs gets ``-inf`` (the log of probability zero) instead of making
        ``math.log`` raise.

    Raises:
        ValueError: If the file contains no ``P`` or ``S`` label at all.
    """
    paranormal_count = 0
    sceptic_count = 0
    with open(expected_path) as f:
        for line in f:
            # Exact match on the whitespace-free label, like calc_word_count,
            # rather than a substring test.
            label = ''.join(line.split())
            if label == 'P':
                paranormal_count += 1
            elif label == 'S':
                sceptic_count += 1

    total = paranormal_count + sceptic_count
    if total == 0:
        raise ValueError(f"no 'P' or 'S' labels found in {expected_path!r}")

    def _log_share(count):
        # log(0) is undefined for math.log; represent it as -inf.
        return math.log(count / total) if count else -math.inf

    return _log_share(paranormal_count), _log_share(sceptic_count)
|
||||
|
||||
def clear_tokens(tokens):
    """Return *tokens* with every newline character replaced by a space."""
    # TODO: also strip links, special characters and periods.
    return ' '.join(tokens.split('\n'))
|
||||
|
||||
# how many times each word occurs in the documents of a given class
|
||||
def calc_word_count(in_path, expected_path):
    """Count token occurrences per class over the training data.

    The two files are read in lockstep: line i of *in_path* is
    ``text<TAB>timestamp`` and line i of *expected_path* is its label
    (``P`` or ``S`` once spaces are removed). Posts with any other label
    are parsed but not counted.

    Returns:
        ``{'paranormal': counts, 'sceptic': counts}`` where each ``counts``
        is a defaultdict mapping token -> number of occurrences.
    """
    per_class = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
    label_to_class = {'P': 'paranormal', 'S': 'sceptic'}

    with open(in_path) as posts, open(expected_path) as labels:
        for post_line, label_line in zip(posts, labels):
            label = label_line.rstrip('\n').replace(' ', '')
            text, _timestamp = post_line.rstrip('\n').split('\t')
            words = clear_tokens(text).lower().split(' ')

            # Resolve the target counter once per post instead of per token.
            bucket = per_class.get(label_to_class.get(label))
            if bucket is None:
                continue
            for word in words:
                bucket[word] += 1

    return per_class
|
||||
|
||||
def calc_word_logprobs(word_counts):
    """Turn per-class token counts into add-one-smoothed log-probabilities.

    For every class ``c`` and every token ``w`` of the shared vocabulary
    (the union of the tokens seen in any class)::

        log P(w|c) = log((count(w, c) + 1) / (total_tokens(c) + |V|))

    Because the vocabulary is shared, a token that never occurred in a class
    still gets a (small) probability there, so a scorer that looks tokens up
    per class finds an entry in every class.

    Args:
        word_counts: Mapping ``class name -> {token: occurrence count}``.
            Any class names are accepted.

    Returns:
        Mapping ``class name -> {token: log-probability}`` with the same
        class keys as *word_counts* and one entry per vocabulary token.
    """
    # dict.fromkeys keeps first-seen order, so the result is reproducible.
    vocabulary = dict.fromkeys(
        token for counts in word_counts.values() for token in counts
    )
    vocab_size = len(vocabulary)

    word_logprobs = {}
    for class_, counts in word_counts.items():
        denominator = sum(counts.values()) + vocab_size
        # .get() rather than [] so a defaultdict input is not mutated.
        word_logprobs[class_] = {
            token: math.log((counts.get(token, 0) + 1) / denominator)
            for token in vocabulary
        }

    return word_logprobs
|
||||
|
||||
def main():
    """Train the model on ./train and pickle it to naive_base_model.pkl.

    The pickled object is a list: [paranormal log prior, sceptic log prior,
    per-class token log-probabilities].
    """
    labels_path = './train/expected.tsv'
    paranormal_prior, sceptic_prior = calc_class_logprob(labels_path)
    counts = calc_word_count('./train/in.tsv', labels_path)
    model = [paranormal_prior, sceptic_prior, calc_word_logprobs(counts)]

    with open('naive_base_model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)
    # predict.py applies the formula: argmax over classes c of
    # P(c) * product of P(w|c) over the words w of the post.
|
||||
|
||||
# Script entry point. The call is unconditional at module level, so training
# also starts if this file is imported.
main()
|
289579
train/expected.tsv
Normal file
289579
train/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
289579
train/expected.tsv_
Normal file
289579
train/expected.tsv_
Normal file
File diff suppressed because it is too large
Load Diff
289579
train/in.tsv
Normal file
289579
train/in.tsv
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user