Linear regression 1

This commit is contained in:
Th3NiKo 2020-04-04 19:02:51 +02:00
commit d6158fa514
8 changed files with 304207 additions and 299045 deletions

View File

@@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts, "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
,Glitch-in-the-Matrix, conspiracytheories). ,Glitch-in-the-Matrix, conspiracytheories).
Output label is `S` and `P`. Output label is 0 (for skeptic) and 1 (for paranormal).
Sources Sources
------- -------

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -20,7 +20,7 @@ for line in sys.stdin:
y_predicted += weights[word_to_index_mapping.get(word,0)] * (word_count.get(word,0) / len(word_count)) y_predicted += weights[word_to_index_mapping.get(word,0)] * (word_count.get(word,0) / len(word_count))
if y_predicted <= 0.5: if y_predicted <= 0.63:
print(0) print(0)
else: else:
print(1) print(1)

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,11 +1,21 @@
#!/usr/bin/python3 #!/usr/bin/python3
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk import nltk
import re import re
import string import string
stop_words = set(stopwords.words('english'))
printable = set(string.printable)
def tokenize(d): def tokenize(d):
d = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'thereisasimplelinkinside', d, flags=re.MULTILINE)
d = re.sub(r'\\n',' ',d) d = re.sub(r'\\n',' ',d)
words = word_tokenize(d) d = re.sub(r'\*|\'|\"|\/|~|_|=|-',' ',d)
d = ''.join(filter(lambda x: x in printable, d))
tokenized = word_tokenize(d)
lower = [w.lower() for w in tokenized]
words = [w for w in lower if not w in stop_words]
return words return words

View File

@@ -75,8 +75,8 @@ def train():
Loss_sum += Loss Loss_sum += Loss
#We will stop after loss reach some value #We will stop after loss reach some value
if Loss_sum_counter % 1000 == 0: if Loss_sum_counter % 10000 == 0:
print(Loss_sum / 1000) print(Loss_sum / 10000)
Loss_sum = 0.0 Loss_sum = 0.0
Loss_sum_counter += 1 Loss_sum_counter += 1
@@ -87,7 +87,7 @@ def train():
if word in word_to_index_mapping: if word in word_to_index_mapping:
weights[word_to_index_mapping[word]] -= ((word_count[word] / len(word_count)) * delta) weights[word_to_index_mapping[word]] -= ((word_count[word] / len(word_count)) * delta)
if Loss_sum_counter > 1000000: if Loss_sum_counter > 50000000:
break break

File diff suppressed because it is too large Load Diff