Linear regression 1
commit d6158fa514
README.md (26 lines changed)
@@ -1,13 +1,13 @@
 Skeptic vs paranormal subreddits
 ================================
 
 Classify a reddit as either from the Skeptic subreddit or one of the
 "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
 Glitch-in-the-Matrix, conspiracytheories).
 
-Output label is `S` and `P`.
+Output label is 0 (for skeptic) and 1 (for paranormal).
 
 Sources
 -------
 
 Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
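The relabeling from `S`/`P` to 0/1 touches every expected/out file in the repo. A quick sanity check one might run after such a change (a sketch only; it assumes expected.tsv holds one label per line, which the train/dev-0/test-A layout suggests):

```python
#!/usr/bin/python3
# Sanity check for the new 0/1 labeling scheme (a sketch; assumes one
# label per line in dev-0/expected.tsv, as the repo layout suggests).
from collections import Counter

with open('dev-0/expected.tsv') as f:
    labels = [line.strip() for line in f]

counts = Counter(labels)
bad = set(counts) - {'0', '1'}
assert not bad, f"unexpected labels: {bad}"
print(f"skeptic (0): {counts['0']}, paranormal (1): {counts['1']}")
```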
dev-0/expected.tsv (10544 lines): file diff suppressed because it is too large
dev-0/out.tsv (8352 lines): file diff suppressed because it is too large
@@ -20,7 +20,7 @@ for line in sys.stdin:
         y_predicted += weights[word_to_index_mapping.get(word,0)] * (word_count.get(word,0) / len(word_count))
 
 
-    if y_predicted <= 0.5:
+    if y_predicted <= 0.63:
         print(0)
     else:
         print(1)
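This hunk (the enclosing file name is not shown in the rendered diff) raises the decision cut-off from 0.5 to 0.63, so the model answers 1 (paranormal) less often. For orientation, a sketch of a prediction loop consistent with the visible lines; everything outside the scoring expression and the threshold (model loading, tokenization, bias handling) is an assumption:

```python
#!/usr/bin/python3
# Sketch of the prediction step implied by the hunk above: a linear model
# scores each document as a weighted, length-normalized bag of words, then
# thresholds the score. Loading `weights` and `word_to_index_mapping` from
# a pickled model file is a hypothetical detail, not shown in the commit.
import sys
import pickle

with open('model.pkl', 'rb') as f:       # hypothetical model file
    weights, word_to_index_mapping = pickle.load(f)

for line in sys.stdin:
    words = line.split()                 # the repo's tokenize() would go here
    word_count = {}
    for word in words:
        word_count[word] = word_count.get(word, 0) + 1

    y_predicted = 0.0                    # any bias term is not visible in the hunk
    for word in word_count:
        y_predicted += weights[word_to_index_mapping.get(word, 0)] * \
            (word_count.get(word, 0) / len(word_count))

    # The commit raises this cut-off from 0.5 to 0.63.
    if y_predicted <= 0.63:
        print(0)
    else:
        print(1)
```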
test-A/out.tsv (5152 lines, new file): file diff suppressed because it is too large
tokenizer.py (12 lines changed)
@@ -1,11 +1,21 @@
 #!/usr/bin/python3
 
 from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
 import nltk
 import re
+import string
+
+
+stop_words = set(stopwords.words('english'))
+printable = set(string.printable)
 
 def tokenize(d):
     d = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'thereisasimplelinkinside', d, flags=re.MULTILINE)
     d = re.sub(r'\\n',' ',d)
-    words = word_tokenize(d)
+    d = re.sub(r'\*|\'|\"|\/|~|_|=|-',' ',d)
+    d = ''.join(filter(lambda x: x in printable, d))
+    tokenized = word_tokenize(d)
+    lower = [w.lower() for w in tokenized]
+    words = [w for w in lower if not w in stop_words]
     return words
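A usage sketch for the reworked tokenize(): URLs collapse to the sentinel token, text is lowercased, and stopwords are dropped. The sample comment below is invented, and nltk's punkt and stopwords resources must be downloaded first:

```python
#!/usr/bin/python3
# Illustrative call to tokenize() from tokenizer.py; the input string is
# invented. Requires: nltk.download('punkt'); nltk.download('stopwords')
from tokenizer import tokenize

doc = "I saw a REAL ghost https://example.com/ghost.jpg \\n trust me"
print(tokenize(doc))
# Likely output (modulo the exact nltk stopword list):
# ['saw', 'real', 'ghost', 'thereisasimplelinkinside', 'trust']
```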
train.py (6 lines changed)
@@ -75,8 +75,8 @@ def train():
         Loss_sum += Loss
         # We will stop after the loss reaches some value
 
-        if Loss_sum_counter % 1000 == 0:
-            print(Loss_sum / 1000)
+        if Loss_sum_counter % 10000 == 0:
+            print(Loss_sum / 10000)
             Loss_sum = 0.0
         Loss_sum_counter += 1
 
@@ -87,7 +87,7 @@ def train():
             if word in word_to_index_mapping:
                 weights[word_to_index_mapping[word]] -= ((word_count[word] / len(word_count)) * delta)
 
-        if Loss_sum_counter > 1000000:
+        if Loss_sum_counter > 50000000:
             break
 
train/expected.tsv (579158 lines): file diff suppressed because it is too large