Compare commits
7 Commits
Author | SHA1 | Date |
---|---|---|
Th3NiKo | db398db388 | |
Th3NiKo | d7040c9bc6 | |
Th3NiKo | d6158fa514 | |
Th3NiKo | 14432fab2d | |
Filip Gralinski | abba594b01 | |
Filip Gralinski | 73a1b8862f | |
Filip Gralinski | f17f86149c |
26
README.md
26
README.md
|
@ -1,13 +1,13 @@
|
|||
Skeptic vs paranormal subreddits
|
||||
================================
|
||||
|
||||
Classify a reddit comment as either from the Skeptic subreddit or one of the
|
||||
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
|
||||
Glitch-in-the-Matrix, conspiracytheories).
|
||||
|
||||
Output label is `S` and `P`.
|
||||
|
||||
Sources
|
||||
-------
|
||||
|
||||
Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
|
||||
Skeptic vs paranormal subreddits
|
||||
================================
|
||||
|
||||
Classify a reddit comment as either from the Skeptic subreddit or one of the
|
||||
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
|
||||
Glitch-in-the-Matrix, conspiracytheories).
|
||||
|
||||
Output label is 0 (for skeptic) and 1 (for paranormal).
|
||||
|
||||
Sources
|
||||
-------
|
||||
|
||||
Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
|
||||
|
|
10544
dev-0/expected.tsv
10544
dev-0/expected.tsv
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,30 @@
|
|||
#!/usr/bin/python3
"""Predict labels for documents read from stdin.

Each input line is a TSV row whose first column is the document text.
Prints one label per line to stdout: 0 (skeptic) or 1 (paranormal),
using the linear model trained by train.py and pickled in model.pkl.
"""

import sys
import pickle
from math import log, exp
from tokenizer import tokenize

# Load model; the context manager closes the file instead of leaking the handle.
with open("model.pkl", "rb") as model_file:
    weights, word_to_index_mapping, word_count = pickle.load(model_file)

prediction_sum = 0  # running sum of raw scores (renamed: don't shadow builtin `sum`)
counter = 0

vocabulary_size = len(word_count)  # loop invariant, hoisted out of the per-word loop

for line in sys.stdin:
    fields = line.rstrip().split('\t')
    document = fields[0]
    terms = tokenize(document)

    # Raw linear score: bias plus one weighted log-frequency feature per word.
    # Unknown words fall back to index 0 via .get(word, 0), but since
    # word_count.get(word, 0) is then 0 their feature is log(0/N + 1) == 0,
    # so they contribute nothing regardless of which weight is picked.
    y_predicted = weights[0]
    for word in terms:
        y_predicted += weights[word_to_index_mapping.get(word, 0)] * log(word_count.get(word, 0) / vocabulary_size + 1)

    prediction_sum += y_predicted
    counter += 1
    # Decision threshold at 0: non-positive score -> skeptic (0), else paranormal (1).
    print(0 if y_predicted <= 0 else 1)

#print(prediction_sum / counter)
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,21 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.corpus import stopwords
|
||||
import nltk
|
||||
import re
|
||||
import string
|
||||
|
||||
|
||||
stop_words = set(stopwords.words('english'))
printable = set(string.printable)

# Patterns compiled once at import time; tokenize() is called per document,
# so hoisting avoids repeated pattern-cache lookups in the hot path.
_URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', flags=re.MULTILINE)
_ESCAPED_NEWLINE_RE = re.compile(r'\\n')
_NOISE_CHARS_RE = re.compile(r'\*|\'|\"|\/|~|_|=|-')


def tokenize(d):
    """Tokenize document *d* into a list of lowercase content words.

    Steps: replace URLs with the placeholder token
    'thereisasimplelinkinside', turn literal backslash-n sequences and
    selected punctuation into spaces, drop non-printable characters,
    word-tokenize with NLTK, lowercase, and remove English stop words.
    """
    d = _URL_RE.sub('thereisasimplelinkinside', d)
    d = _ESCAPED_NEWLINE_RE.sub(' ', d)
    d = _NOISE_CHARS_RE.sub(' ', d)
    d = ''.join(ch for ch in d if ch in printable)
    tokens = word_tokenize(d)
    lowered = [t.lower() for t in tokens]
    return [w for w in lowered if w not in stop_words]
|
|
@ -0,0 +1,110 @@
|
|||
#!/usr/bin/python3

'''
Linear regression for the paranormal vs. sceptic challenge 2.0.0.

In order to use train.py you need to pass two columns on stdin,
split by a tab character:

    label<TAB>document

Commands used: xzcat, paste
'''

import sys
import pickle
import random
from math import log, exp
import collections
from tokenizer import tokenize


def _read_examples():
    """Read "label<TAB>document" lines from stdin.

    Returns (x, y, word_count): x is a list of token lists, y the matching
    list of numeric labels (1 for "P", 0 otherwise), and word_count maps
    each word to its corpus frequency.
    """
    x = []
    y = []
    word_count = collections.defaultdict(int)

    for line in sys.stdin:
        fields = line.rstrip().split('\t')
        label = fields[0]
        document = fields[1]
        terms = tokenize(document)

        x.append(terms)
        y.append(1 if label == "P" else 0)

        # Count word occurrences; the counts double as the vocabulary.
        for t in terms:
            word_count[t] += 1

    return x, y, word_count


def train(learning_rate=0.000001, max_updates=7000000, report_every=10000):
    """Train a linear model with SGD on stdin data and pickle it to model.pkl.

    Parameters (defaults reproduce the original hard-coded behaviour):
        learning_rate -- step size for each gradient update.
        max_updates   -- total number of SGD updates before stopping.
        report_every  -- print the average loss after this many updates.

    Raises:
        ValueError -- if stdin contained no training examples (the original
        code crashed with an obscure random.randint error in that case).
    """
    x, y, word_count = _read_examples()
    if not x:
        raise ValueError("no training examples on stdin")

    # Index the vocabulary; index 0 is reserved for the bias weight.
    word_to_index_mapping = {}
    for ix, w in enumerate(word_count, start=1):
        word_to_index_mapping[w] = ix

    # Bias + one weight per word, initialised uniformly in [-1.0, 1.0].
    weights = [random.uniform(-1.00, 1.00) for _ in range(len(word_count) + 1)]

    vocabulary_size = len(word_count)  # loop invariant, hoisted
    loss_sum = 0.0

    for update in range(1, max_updates + 1):
        # Stochastic gradient descent: pick one training example at random.
        chosen = random.randint(0, len(x) - 1)
        actual_x = x[chosen]
        actual_y = y[chosen]

        # Prediction: bias plus a weighted log-frequency feature per word.
        # Every word of actual_x is in the vocabulary here, but .get() keeps
        # the expression identical to the one used at predict time.
        y_predicted = weights[0]
        for word in actual_x:
            y_predicted += weights[word_to_index_mapping.get(word, 0)] * log(word_count.get(word, 0) / vocabulary_size + 1)

        # Squared-error loss, accumulated so we can report a running average.
        loss_sum += (y_predicted - actual_y) ** 2.0
        if update % report_every == 0:
            print(str(update) + " " + str(loss_sum / report_every))
            loss_sum = 0.0

        # Gradient step on the bias and on every word that appeared.
        delta = (y_predicted - actual_y) * learning_rate
        weights[0] -= delta
        for word in actual_x:
            if word in word_to_index_mapping:
                weights[word_to_index_mapping[word]] -= log(word_count[word] / vocabulary_size + 1) * delta

    # Persist only what prediction needs; close the file deterministically
    # instead of leaking the handle as the previous pickle.dump(open(...)) did.
    model = (weights, word_to_index_mapping, word_count)
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)


if __name__ == "__main__":
    train()
|
579158
train/expected.tsv
579158
train/expected.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue