Compare commits

...

7 Commits

Author SHA1 Message Date
Th3NiKo db398db388 Linear regression higher F1.0 lower accuracy 2020-04-06 14:01:32 +02:00
Th3NiKo d7040c9bc6 Linear regression first try 2020-04-06 13:07:14 +02:00
Th3NiKo d6158fa514 Linear regression 1 2020-04-04 19:02:51 +02:00
Th3NiKo 14432fab2d Linear try 2020-04-02 15:45:53 +02:00
Filip Gralinski abba594b01 Update README.md 2020-03-30 18:29:13 +02:00
Filip Gralinski 73a1b8862f Switching to O/1 2020-03-30 18:28:23 +02:00
Filip Gralinski f17f86149c Fix unwanted spaces 2020-03-30 12:30:04 +02:00
8 changed files with 305449 additions and 294864 deletions

README.md

@@ -1,13 +1,13 @@
 Skeptic vs paranormal subreddits
 ================================
 
 Classify a reddit as either from Skeptic subreddit or one of the
 "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
 Glitch-in-the-Matrix, conspiracytheories).
 
-Output label is `S` and `P`.
+Output label is 0 (for skeptic) and 1 (for paranormal).
 
 Sources
 -------
 
 Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.

File diff suppressed because it is too large

5272 dev-0/out.tsv Normal file

File diff suppressed because it is too large

30 predict.py Normal file

@@ -0,0 +1,30 @@
#!/usr/bin/python3
import sys
import pickle
from math import log, exp
from tokenizer import tokenize

# Load the pickled model produced by train.py
model = pickle.load(open("model.pkl", "rb"))
weights, word_to_index_mapping, word_count = model

score_sum = 0
counter = 0
for line in sys.stdin:
    document = line.rstrip()
    fields = document.split('\t')
    document = fields[0]
    terms = tokenize(document)
    # Bias term plus one weighted, log-scaled count feature per token
    y_predicted = weights[0]
    for word in terms:
        y_predicted += weights[word_to_index_mapping.get(word, 0)] * log(word_count.get(word, 0) / len(word_count) + 1)
    score_sum += y_predicted
    counter += 1
    if y_predicted <= 0:
        print(0)
    else:
        print(1)
#print(score_sum / counter)
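
The first commit message mentions a higher F1 at the cost of accuracy. A quick way to see that trade-off is to score dev-0/out.tsv against the gold labels. This is a minimal sketch, assuming the usual challenge layout where dev-0/expected.tsv holds the gold 0/1 labels (that path is an assumption, not part of this diff):

#!/usr/bin/python3
# Hypothetical evaluation sketch: compares predictions in dev-0/out.tsv
# with gold labels in dev-0/expected.tsv (assumed path) and reports
# accuracy and F1 for the positive label "1".
def evaluate(expected_path="dev-0/expected.tsv", predicted_path="dev-0/out.tsv"):
    with open(expected_path) as e, open(predicted_path) as p:
        pairs = [(gold.strip(), out.strip()) for gold, out in zip(e, p)]
    tp = sum(1 for gold, out in pairs if gold == out == "1")
    fp = sum(1 for gold, out in pairs if gold == "0" and out == "1")
    fn = sum(1 for gold, out in pairs if gold == "1" and out == "0")
    accuracy = sum(1 for gold, out in pairs if gold == out) / len(pairs)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return accuracy, f1

print(evaluate())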

5152 test-A/out.tsv Normal file

File diff suppressed because it is too large

21 tokenizer.py Normal file

@@ -0,0 +1,21 @@
#!/usr/bin/python3
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import re
import string

stop_words = set(stopwords.words('english'))
printable = set(string.printable)

def tokenize(d):
    # Replace every URL with a single placeholder token
    d = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'thereisasimplelinkinside', d, flags=re.MULTILINE)
    # Drop literal "\n" sequences and simple markup characters
    d = re.sub(r'\\n', ' ', d)
    d = re.sub(r'\*|\'|\"|\/|~|_|=|-', ' ', d)
    # Keep printable ASCII only, then tokenize, lowercase, and drop stopwords
    d = ''.join(filter(lambda x: x in printable, d))
    tokenized = word_tokenize(d)
    lower = [w.lower() for w in tokenized]
    words = [w for w in lower if w not in stop_words]
    return words
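
A minimal usage sketch for the tokenizer (the sample sentence is made up; the printed list is what the rules above would produce, and the NLTK punkt and stopwords data must be downloaded once beforehand):

#!/usr/bin/python3
# One-time NLTK setup, if the data is not yet present:
#   import nltk; nltk.download('punkt'); nltk.download('stopwords')
from tokenizer import tokenize

print(tokenize("I saw *this* on https://example.com/ufo yesterday!"))
# The URL collapses to the placeholder token and stopwords drop out:
# ['saw', 'thereisasimplelinkinside', 'yesterday', '!']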

110 train.py Normal file

@@ -0,0 +1,110 @@
#!/usr/bin/python3
'''
Linear regression for the paranormal vs. sceptic challenge (2.0.0).

train.py expects two tab-separated columns on stdin:
    label<TAB>document
Commands used to prepare the input: xzcat, paste
'''
import sys
import pickle
import random
from math import log, exp
import collections
from tokenizer import tokenize

def train():
    # Prepare
    vocabulary = set()
    word_to_index_mapping = {}
    index_to_word_mapping = {}
    word_count = collections.defaultdict(int)
    # Arrays x, y used later in the training process
    x = []
    y = []
    learning_rate = 0.000001
    # Read examples from stdin
    for line in sys.stdin:
        line = line.rstrip()
        fields = line.split('\t')
        label = fields[0]
        document = fields[1]
        terms = tokenize(document)
        # Add the document's words to x and its label to y
        # (labels are remapped: P becomes 1, S becomes 0)
        x.append(terms)
        if label == "P":
            y.append(1)
        else:
            y.append(0)
        # Update the vocabulary and count how often each word appears
        for t in terms:
            word_count[t] += 1
            vocabulary.add(t)
    # Index the vocabulary: each word gets its own number
    ix = 1
    for w in vocabulary:
        word_to_index_mapping[w] = ix
        index_to_word_mapping[ix] = w
        ix += 1
    # Initialize weights with random floats from -1.0 to 1.0
    weights = []
    for ix in range(0, len(vocabulary) + 1):
        weights.append(random.uniform(-1.00, 1.00))
    Loss_sum = 0.0
    Loss_sum_counter = 1
    while True:
        choose_random_example = random.randint(0, len(x) - 1)
        actual_x = x[choose_random_example]  # list of words
        actual_y = y[choose_random_example]  # label for this set of words
        # Predict the result
        y_predicted = weights[0]
        # Iterate over all words in the randomly chosen example.
        # Using .get() avoids KeyErrors on missing words; the fallback
        # weight index does not matter, because word_count returns 0 there.
        for word in actual_x:
            y_predicted += weights[word_to_index_mapping.get(word, 0)] * log(word_count.get(word, 0) / len(word_count) + 1)
        # Compute the loss: how good was the prediction?
        Loss = (y_predicted - actual_y) ** 2.0
        # Sum the loss so the running average is easier to follow
        Loss_sum += Loss
        # Report the average loss every 10000 steps
        if Loss_sum_counter % 10000 == 0:
            print(str(Loss_sum_counter) + " " + str(Loss_sum / 10000))
            Loss_sum = 0.0
        Loss_sum_counter += 1
        # Update the weights
        delta = (y_predicted - actual_y) * learning_rate
        weights[0] = weights[0] - delta
        for word in actual_x:
            if word in word_to_index_mapping:
                weights[word_to_index_mapping[word]] -= (log(word_count[word] / len(word_count) + 1) * delta)
        # Stop after a fixed number of iterations
        if Loss_sum_counter > 7000000:
            break
    # Save only what is needed for prediction
    model = (weights, word_to_index_mapping, word_count)
    pickle.dump(model, open("model.pkl", "wb"))

train()
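
The loop above is plain stochastic gradient descent on the squared error. Below is a compact sketch of the same update step, assuming features arrive as (index, value) pairs; sgd_step is a hypothetical helper for illustration, not part of this commit:

# For a linear model with squared error Loss = (y_pred - y)**2:
#   dLoss/dw_0 = 2 * (y_pred - y)          (bias)
#   dLoss/dw_j = 2 * (y_pred - y) * x_j    (feature weight)
# where x_j = log(word_count[w] / len(word_count) + 1); the constant
# factor 2 is folded into the learning rate, exactly as in train.py.
def sgd_step(weights, features, y, y_pred, learning_rate=0.000001):
    # features: list of (index, x_j) pairs for the words of one example
    delta = (y_pred - y) * learning_rate
    weights[0] -= delta
    for j, x_j in features:
        weights[j] -= x_j * delta
    return weights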

File diff suppressed because it is too large