Linear regression 1
commit d6158fa514
README.md (26 lines changed)
@@ -1,13 +1,13 @@
 Skeptic vs paranormal subreddits
 ================================
 
 Classify a reddit as either from the Skeptic subreddit or one of the
 "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
 Glitch-in-the-Matrix, conspiracytheories).
 
-Output label is `S` and `P`.
+Output label is 0 (for skeptic) and 1 (for paranormal).
 
 Sources
 -------
 
 Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
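The relabeling from `S`/`P` to 0/1 touches every expected/out file in the repo. A quick sanity check one might run after such a change (a sketch only; it assumes expected.tsv holds one label per line, which the train/dev-0/test-A layout suggests):

```python
#!/usr/bin/python3
# Sanity check for the new 0/1 labeling scheme (a sketch; assumes one
# label per line in dev-0/expected.tsv, as the repo layout suggests).
from collections import Counter

with open('dev-0/expected.tsv') as f:
    labels = [line.strip() for line in f]

counts = Counter(labels)
bad = set(counts) - {'0', '1'}
assert not bad, f"unexpected labels: {bad}"
print(f"skeptic (0): {counts['0']}, paranormal (1): {counts['1']}")
```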
dev-0/expected.tsv (10544 lines): file diff suppressed because it is too large
dev-0/out.tsv (8352 lines): file diff suppressed because it is too large
@@ -20,7 +20,7 @@ for line in sys.stdin:
         y_predicted += weights[word_to_index_mapping.get(word,0)] * (word_count.get(word,0) / len(word_count))
 
 
-    if y_predicted <= 0.5:
+    if y_predicted <= 0.63:
         print(0)
     else:
         print(1)
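This hunk (the enclosing file name is not shown in the rendered diff) raises the decision cut-off from 0.5 to 0.63, so the model answers 1 (paranormal) less often. For orientation, a sketch of a prediction loop consistent with the visible lines; everything outside the scoring expression and the threshold (model loading, tokenization, bias handling) is an assumption:

```python
#!/usr/bin/python3
# Sketch of the prediction step implied by the hunk above: a linear model
# scores each document as a weighted, length-normalized bag of words, then
# thresholds the score. Loading `weights` and `word_to_index_mapping` from
# a pickled model file is a hypothetical detail, not shown in the commit.
import sys
import pickle

with open('model.pkl', 'rb') as f:       # hypothetical model file
    weights, word_to_index_mapping = pickle.load(f)

for line in sys.stdin:
    words = line.split()                 # the repo's tokenize() would go here
    word_count = {}
    for word in words:
        word_count[word] = word_count.get(word, 0) + 1

    y_predicted = 0.0                    # any bias term is not visible in the hunk
    for word in word_count:
        y_predicted += weights[word_to_index_mapping.get(word, 0)] * \
            (word_count.get(word, 0) / len(word_count))

    # The commit raises this cut-off from 0.5 to 0.63.
    if y_predicted <= 0.63:
        print(0)
    else:
        print(1)
```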
test-A/out.tsv (5152 lines, new file): file diff suppressed because it is too large
tokenizer.py (12 lines changed)
@@ -1,11 +1,21 @@
 #!/usr/bin/python3
 
 from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
 import nltk
 import re
+import string
+
+
+stop_words = set(stopwords.words('english'))
+printable = set(string.printable)
 
 def tokenize(d):
     d = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'thereisasimplelinkinside', d, flags=re.MULTILINE)
     d = re.sub(r'\\n',' ',d)
-    words = word_tokenize(d)
+    d = re.sub(r'\*|\'|\"|\/|~|_|=|-',' ',d)
+    d = ''.join(filter(lambda x: x in printable, d))
+    tokenized = word_tokenize(d)
+    lower = [w.lower() for w in tokenized]
+    words = [w for w in lower if not w in stop_words]
     return words
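A usage sketch for the reworked tokenize(): URLs collapse to the sentinel token, text is lowercased, and stopwords are dropped. The sample comment below is invented, and nltk's punkt and stopwords resources must be downloaded first:

```python
#!/usr/bin/python3
# Illustrative call to tokenize() from tokenizer.py; the input string is
# invented. Requires: nltk.download('punkt'); nltk.download('stopwords')
from tokenizer import tokenize

doc = "I saw a REAL ghost https://example.com/ghost.jpg \\n trust me"
print(tokenize(doc))
# Likely output (modulo the exact nltk stopword list):
# ['saw', 'real', 'ghost', 'thereisasimplelinkinside', 'trust']
```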
train.py (6 lines changed)
@@ -75,8 +75,8 @@ def train():
         Loss_sum += Loss
         # We will stop after the loss reaches some value
 
-        if Loss_sum_counter % 1000 == 0:
-            print(Loss_sum / 1000)
+        if Loss_sum_counter % 10000 == 0:
+            print(Loss_sum / 10000)
             Loss_sum = 0.0
         Loss_sum_counter += 1
 
@@ -87,7 +87,7 @@ def train():
             if word in word_to_index_mapping:
                 weights[word_to_index_mapping[word]] -= ((word_count[word] / len(word_count)) * delta)
 
-        if Loss_sum_counter > 1000000:
+        if Loss_sum_counter > 50000000:
             break
 
train/expected.tsv (579158 lines): file diff suppressed because it is too large