Linear regression 1
commit d6158fa514
@@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
 "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
 ,Glitch-in-the-Matrix, conspiracytheories).
 
-Output label is `S` and `P`.
+Output label is 0 (for skeptic) and 1 (for paranormal).
 
 Sources
 -------
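With the new scheme, each line of an expected.tsv or out.tsv file carries a single 0/1 label for the corresponding input document. An illustrative (not actual) excerpt:

0
1
0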
10544   dev-0/expected.tsv
        File diff suppressed because it is too large
8352    dev-0/out.tsv
        File diff suppressed because it is too large
@@ -20,7 +20,7 @@ for line in sys.stdin:
         y_predicted += weights[word_to_index_mapping.get(word,0)] * (word_count.get(word,0) / len(word_count))
 
 
-    if y_predicted <= 0.5:
+    if y_predicted <= 0.63:
         print(0)
     else:
         print(1)
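For context, a minimal, self-contained sketch of the kind of scoring loop this hunk sits in. Only the y_predicted update and the 0.63 threshold come from the diff; the model file name, the bias at index 0, and the empty-document guard are assumptions:

#!/usr/bin/python3
# Hypothetical reconstruction around the changed hunk; see the note above
# for which names are real and which are assumed.
import sys
import pickle

from tokenizer import tokenize

# Assumed: train.py serialized the model like this (file name is a guess).
with open('model.pkl', 'rb') as f:
    weights, word_to_index_mapping = pickle.load(f)

for line in sys.stdin:
    # Bag-of-words features: relative frequency of each token in the document.
    word_count = {}
    for word in tokenize(line):
        word_count[word] = word_count.get(word, 0) + 1
    if not word_count:
        print(0)  # guard against empty documents (assumption)
        continue

    y_predicted = weights[0]  # assumed bias term at index 0
    for word in word_count:
        y_predicted += weights[word_to_index_mapping.get(word, 0)] * (word_count.get(word, 0) / len(word_count))

    # The commit raises the decision threshold from 0.5 to 0.63, trading
    # paranormal-class recall for precision, presumably tuned on dev-0.
    if y_predicted <= 0.63:
        print(0)
    else:
        print(1)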
5152    test-A/out.tsv (new file)
        File diff suppressed because it is too large
12      tokenizer.py
@@ -1,11 +1,21 @@
 #!/usr/bin/python3
 
 from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
 import nltk
 import re
 import string
 
+
+stop_words = set(stopwords.words('english'))
+printable = set(string.printable)
+
 def tokenize(d):
+    d = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'thereisasimplelinkinside', d, flags=re.MULTILINE)
     d = re.sub(r'\\n',' ',d)
-    words = word_tokenize(d)
+    d = re.sub(r'\*|\'|\"|\/|~|_|=|-',' ',d)
+    d = ''.join(filter(lambda x: x in printable, d))
+    tokenized = word_tokenize(d)
+    lower = [w.lower() for w in tokenized]
+    words = [w for w in lower if not w in stop_words]
     return words
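A quick sanity check of the updated pipeline on an illustrative input (the expected output assumes NLTK's punkt tokenizer and its standard English stopword list):

# Illustrative use of the new tokenize(); requires one-time downloads:
#   nltk.download('punkt'); nltk.download('stopwords')
from tokenizer import tokenize

print(tokenize("I saw a ghost at https://example.com/story - it was REAL"))
# The URL collapses to the placeholder token, characters from the stripped
# set become whitespace, everything is lowercased, and stopwords drop out:
# ['saw', 'ghost', 'thereisasimplelinkinside', 'real']

Note the ordering matters: the URL is replaced before '/' and '-' are stripped, so links are collapsed into one token instead of being shredded into fragments.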
6       train.py
@@ -75,8 +75,8 @@ def train():
         Loss_sum += Loss
         #We will stop after loss reach some value
 
-        if Loss_sum_counter % 1000 == 0:
-            print(Loss_sum / 1000)
+        if Loss_sum_counter % 10000 == 0:
+            print(Loss_sum / 10000)
             Loss_sum = 0.0
         Loss_sum_counter += 1
 
@@ -87,7 +87,7 @@ def train():
         if word in word_to_index_mapping:
             weights[word_to_index_mapping[word]] -= ((word_count[word] / len(word_count)) * delta)
 
-        if Loss_sum_counter > 1000000:
+        if Loss_sum_counter > 50000000:
             break
 
 
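For orientation, a self-contained sketch of the SGD loop these hunks sit in. Only Loss_sum, Loss_sum_counter, delta, weights, word_count and word_to_index_mapping appear in the actual diff; the toy data, the squared-error loss and the learning rate are assumptions:

#!/usr/bin/python3
# Sketch of the training loop; see the note above for which names are
# real and which are assumed.
import random

docs = [("ghost sighting last night", 1), ("peer reviewed debunking", 0)]  # toy data (assumed)
vocab = sorted({w for d, _ in docs for w in d.split()})
word_to_index_mapping = {w: i + 1 for i, w in enumerate(vocab)}  # index 0 kept for bias/unknown
weights = [random.uniform(-0.01, 0.01) for _ in range(len(vocab) + 1)]
learning_rate = 0.01  # assumed

def train():
    Loss_sum = 0.0
    Loss_sum_counter = 0
    while True:
        d, y = random.choice(docs)
        word_count = {}
        for w in d.split():
            word_count[w] = word_count.get(w, 0) + 1

        # Forward pass: the same dot product the prediction script computes.
        y_predicted = weights[0]
        for word in word_count:
            y_predicted += weights[word_to_index_mapping.get(word, 0)] * (word_count[word] / len(word_count))

        Loss = (y_predicted - y) ** 2             # squared error (assumed)
        delta = (y_predicted - y) * learning_rate

        Loss_sum += Loss
        if Loss_sum_counter % 10000 == 0:         # report cadence from this commit
            print(Loss_sum / 10000)
            Loss_sum = 0.0
        Loss_sum_counter += 1

        # SGD step from the second hunk: each seen word's weight moves
        # against its relative frequency times delta.
        for word in word_count:
            if word in word_to_index_mapping:
                weights[word_to_index_mapping[word]] -= ((word_count[word] / len(word_count)) * delta)

        if Loss_sum_counter > 50000000:           # iteration cap raised by this commit
            break

train()

The commit coarsens the loss report (average over 10000 steps instead of 1000) and raises the iteration cap from 1e6 to 5e7, i.e. it simply lets SGD run much longer.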
579158  train/expected.tsv
        File diff suppressed because it is too large