Updated baseline

This commit is contained in:
s426135 2020-03-22 10:15:36 +01:00
commit eb6ba923a4
15 changed files with 884449 additions and 0 deletions

13
README.md Normal file
View File

@ -0,0 +1,13 @@
Skeptic vs paranormal subreddits
================================
Classify a Reddit post as coming either from the Skeptic subreddit or from one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
Glitch-in-the-Matrix, conspiracytheories).
The output label is either `S` or `P`.
Sources
-------
Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.

1
config.txt Normal file
View File

@ -0,0 +1 @@
--metric Accuracy --precision 4 --in-header in-header.tsv --out-header out-header.tsv

5272
dev-0/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
dev-0/in.tsv.xz Normal file

Binary file not shown.

1
in-header.tsv Normal file
View File

@ -0,0 +1 @@
PostText Timestamp
1 PostText Timestamp

BIN
naive_base_model.pkl Normal file

Binary file not shown.

1
out-header.tsv Normal file
View File

@ -0,0 +1 @@
Label
1 Label

48
predict.py Executable file
View File

@ -0,0 +1,48 @@
#!/usr/bin/python3
import pickle
import math
def clear_tokens(tokens):
    """Return *tokens* with every newline replaced by a single space.

    Despite its name, the argument is the raw post text (a string); the
    caller splits the result into tokens afterwards.
    """
    return tokens.replace('\n', ' ')


def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
    """Return the name of the most likely class for one input line.

    The score of a class is its log prior plus the sum of the log
    probabilities of the post's tokens, i.e. the logarithm of
    ``P(c) * product(P(w | c))``; the class with the highest score wins.

    Args:
        post: One line of the input file: the post text, optionally followed
            by a tab and further fields (the timestamp), which are ignored.
        paranormal_class_logprob: Log prior of the ``'paranormal'`` class.
        sceptic_class_logprob: Log prior of the ``'sceptic'`` class.
        word_logprobs: Mapping ``class name -> {token: log P(token | class)}``.

    Returns:
        The key of ``word_logprobs`` with the highest score. If
        ``word_logprobs`` is empty, the class with the larger prior.
    """
    class_priors = {
        'sceptic': sceptic_class_logprob,
        'paranormal': paranormal_class_logprob,
    }
    if not word_logprobs:
        return max(class_priors, key=class_priors.get)

    # Only the first tab-separated field is text; partition() tolerates a
    # missing timestamp or extra columns instead of raising on unpacking.
    text = post.rstrip('\n').partition('\t')[0]
    # Tokenisation must stay identical to the one used at training time.
    tokens = clear_tokens(text).lower().split(' ')

    # Score only tokens every class has a probability for. A token known to
    # just one class would otherwise add a negative term to that class alone
    # and push the decision towards the class that never saw the word.
    vocabularies = list(word_logprobs.values())
    shared_tokens = [
        token for token in tokens
        if all(token in vocabulary for vocabulary in vocabularies)
    ]

    best_class = None
    best_score = -math.inf
    for class_, logprobs in word_logprobs.items():
        # Log probabilities are added (the log of a product), never multiplied.
        score = class_priors.get(class_, 0.0)
        score += sum(logprobs[token] for token in shared_tokens)
        if best_class is None or score > best_score:
            best_class, best_score = class_, score
    return best_class
def main():
    """Classify every post in ``test-A/in.tsv`` into ``test-A/out.tsv``.

    The model is read from ``naive_base_model.pkl``, a pickled sequence whose
    first three items are the paranormal class log prior, the sceptic class
    log prior and the per-class word log probabilities. One label line is
    written for each post classified as sceptic or paranormal.
    """
    # NOTE: pickle.load can execute arbitrary code; only load a model file
    # produced by a trusted training run.
    with open('naive_base_model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    paranormal_class_logprob = model[0]
    sceptic_class_logprob = model[1]
    word_logprobs = model[2]

    # Output line for each predicted class name.
    label_lines = {'sceptic': " S\n", 'paranormal': ' P\n'}

    with open('test-A/in.tsv') as in_f, open('test-A/out.tsv', 'w') as out_f:
        for line in in_f:
            hyp = calc_post_prob(
                line, paranormal_class_logprob, sceptic_class_logprob, word_logprobs
            )
            label_line = label_lines.get(hyp)
            if label_line is not None:
                out_f.write(label_line)
# Entry point: this top-level call runs the prediction whenever the module is
# executed, and also whenever it is imported.
main()

5152
test-A/in.tsv Normal file

File diff suppressed because one or more lines are too long

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

72
train.py Executable file
View File

@ -0,0 +1,72 @@
#!/usr/bin/python3
from collections import defaultdict
import math
import pickle
# in expected.tsv
def calc_class_logprob(expected_path):
    """Estimate the log prior of each class from a label file.

    A line containing ``P`` counts as paranormal; otherwise a line containing
    ``S`` counts as sceptic. Lines containing neither are ignored.

    Args:
        expected_path: Path of the file with one label per line.

    Returns:
        A tuple ``(paranormal_logprob, sceptic_logprob)``. A class without
        any example gets ``-inf`` (the logarithm of probability zero).

    Raises:
        ValueError: If the file contains no labelled line at all.
    """
    paranormal_count = 0
    sceptic_count = 0
    with open(expected_path) as f:
        for line in f:
            if 'P' in line:
                paranormal_count += 1
            elif 'S' in line:
                sceptic_count += 1

    total = paranormal_count + sceptic_count
    if total == 0:
        raise ValueError(f"no 'P' or 'S' labels found in {expected_path!r}")

    def _log_share(count):
        # math.log(0) raises a domain error, so map an absent class to -inf.
        return math.log(count / total) if count else -math.inf

    return _log_share(paranormal_count), _log_share(sceptic_count)
def clear_tokens(tokens):
    """Return the text *tokens* with each newline turned into a space."""
    # Further cleaning (links, special characters, full stops) is not
    # implemented; only newlines are handled.
    cleaned = ' '.join(tokens.split('\n'))
    return cleaned
# How many times each word occurs in the documents of a given class.
def calc_word_count(in_path, expected_path):
    """Count token occurrences per class over two line-aligned files.

    Args:
        in_path: File whose lines hold exactly two tab-separated fields, the
            post text and a second field that is not used.
        expected_path: File with the label of the corresponding post on each
            line; after removing spaces, ``P`` means paranormal and ``S``
            means sceptic. Posts with any other label are not counted.

    Returns:
        ``{'paranormal': counts, 'sceptic': counts}`` where each ``counts``
        is a ``defaultdict(int)`` mapping a lower-cased token to the number
        of times it occurred in posts of that class.
    """
    label_to_class = {'P': 'paranormal', 'S': 'sceptic'}
    # Outer dict: class name -> inner dict: word -> number of occurrences.
    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
    with open(in_path) as infile, open(expected_path) as expectedfile:
        for post_line, label_line in zip(infile, expectedfile):
            label = label_line.rstrip('\n').replace(' ', '')
            # Unpacking requires exactly one tab per line.
            text, _timestamp = post_line.rstrip('\n').split('\t')
            post_tokens = clear_tokens(text).lower().split(' ')
            class_name = label_to_class.get(label)
            if class_name is None:
                continue
            class_counts = word_counts[class_name]
            for token in post_tokens:
                class_counts[token] += 1
    return word_counts
def calc_word_logprobs(word_counts):
    """Turn per-class token counts into add-one smoothed log probabilities.

    For a class ``c`` and a token ``w`` from the vocabulary ``V`` (the union
    of the tokens of all classes)::

        P(w | c) = (count(w, c) + 1) / (total_tokens(c) + |V|)

    Every class gets an entry for every token of ``V``, so a word seen only
    in one class still has a small probability in the others and each class
    distribution sums to one.

    Args:
        word_counts: Mapping ``class name -> {token: count}``. It is not
            modified.

    Returns:
        Mapping ``class name -> {token: log P(token | class)}``. The keys
        ``'paranormal'`` and ``'sceptic'`` are always present.
    """
    vocabulary = set()
    for counts in word_counts.values():
        vocabulary.update(counts)

    word_logprobs = {'paranormal': {}, 'sceptic': {}}
    for class_, counts in word_counts.items():
        denominator = sum(counts.values()) + len(vocabulary)
        # .get() keeps a defaultdict input from growing zero-count entries.
        word_logprobs[class_] = {
            token: math.log((counts.get(token, 0) + 1) / denominator)
            for token in vocabulary
        }
    return word_logprobs
def main():
    """Train on ``./train`` and pickle the model to ``naive_base_model.pkl``.

    The pickled object is the list ``[paranormal class log prior, sceptic
    class log prior, per-class word log probabilities]``.
    """
    paranormal_prior, sceptic_prior = calc_class_logprob('./train/expected.tsv')
    counts_by_class = calc_word_count('./train/in.tsv', './train/expected.tsv')
    model = [paranormal_prior, sceptic_prior, calc_word_logprobs(counts_by_class)]
    with open('naive_base_model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)
    # predict.py applies the formula: argmax of the prior times the product
    # of P(w|c) over the words of the post.
# Entry point: this top-level call runs the training whenever the module is
# executed, and also whenever it is imported.
main()

BIN
train.pyc Normal file

Binary file not shown.

289579
train/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

289579
train/expected.tsv_ Normal file

File diff suppressed because it is too large Load Diff

289579
train/in.tsv Normal file

File diff suppressed because one or more lines are too long