2020-04-02 18:29:06 +02:00
|
|
|
from collections import defaultdict
|
|
|
|
import math
|
|
|
|
import pickle
|
|
|
|
import re
|
|
|
|
|
2020-04-04 19:55:07 +02:00
|
|
|
from pip._vendor.msgpack.fallback import xrange
|
|
|
|
import random
|
|
|
|
|
2020-04-02 18:29:06 +02:00
|
|
|
# Module-level placeholder list. main() assigns to its own local variable
# named `vocabulary` (there is no `global` statement), so this object is not
# rebound by main().
vocabulary=[]
|
2020-04-04 19:55:07 +02:00
|
|
|
#word_to_index_mapping=[]
|
|
|
|
#index_to_word_mapping=[]
|
|
|
|
|
|
|
|
#file_to_save=open("test.tsv","w",encoding='utf-8')
|
|
|
|
#def define_vocabulary(file_to_learn_new_words,expected_path):
|
|
|
|
# word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
|
|
|
|
# with open(file_to_learn_new_words, encoding='utf-8') as in_file, open(expected_path, encoding='utf-8') as expected_file:
|
|
|
|
# for line, exp in zip(in_file, expected_file):
|
|
|
|
# class_ = exp.rstrip('\n').replace(' ', '')
|
|
|
|
# text, timestamp = line.rstrip('\n').split('\t')
|
|
|
|
# tokens = text.lower().split(' ')
|
|
|
|
# for token in tokens:
|
|
|
|
# if class_ == 'P':
|
|
|
|
# word_counts['paranormal'][token] += 1
|
|
|
|
# elif class_ == 'S':
|
|
|
|
# word_counts['skeptic'][token] += 1
|
|
|
|
# return word_counts
|
2020-04-02 18:29:06 +02:00
|
|
|
|
2020-04-02 20:01:33 +02:00
|
|
|
# Opens test.tsv for writing (mode "w" truncates an existing file) as soon as
# this module is loaded.
# NOTE(review): nothing in the code visible here writes to or closes this
# handle -- confirm it is used and closed elsewhere, or remove it.
file_to_save=open("test.tsv","w",encoding='utf-8')
|
2020-04-04 19:55:07 +02:00
|
|
|
def define_vocabulary(file_to_learn_new_words):
    """Count how often each token occurs in a tab-separated text file.

    The text is taken from the first tab-separated column of every line,
    lower-cased and split on single spaces. Because the split is on a single
    space, consecutive spaces yield empty-string tokens, which are counted
    like any other token.

    Args:
        file_to_learn_new_words: Path to a UTF-8 encoded TSV file.

    Returns:
        A dict with the single key ``'count'`` whose value is a
        ``defaultdict(int)`` mapping each token to its number of occurrences.

    Raises:
        OSError: If the file cannot be opened.
    """
    word_counts = {'count': defaultdict(int)}
    counts = word_counts['count']
    with open(file_to_learn_new_words, encoding='utf-8') as in_file:
        for line in in_file:
            # Only the first column is used. Taking it by index (instead of
            # unpacking exactly two fields) keeps a line with a missing or
            # extra column from raising ValueError and aborting the run.
            text = line.rstrip('\n').split('\t')[0]
            for token in text.lower().split(' '):
                counts[token] += 1
    return word_counts
|
2020-04-02 18:29:06 +02:00
|
|
|
|
2020-04-05 00:34:05 +02:00
|
|
|
def read_input(file_path):
    """Read a tab-separated input file and tally its tokens.

    For every line, everything before the first tab is treated as the text;
    it is lower-cased and split on single spaces (so runs of spaces produce
    empty-string tokens, which are tallied too).

    Args:
        file_path: Path to a UTF-8 encoded TSV file.

    Returns:
        A dict with the single key ``'count'`` whose value is a
        ``defaultdict(int)`` mapping each token to its number of occurrences.

    Raises:
        OSError: If the file cannot be opened.
    """
    word_counts = {'count': defaultdict(int)}
    with open(file_path, encoding='utf-8') as in_file:
        for line in in_file:
            # partition() never fails: a line without a tab is used whole,
            # and any columns after the first tab are ignored. Unpacking
            # exactly two fields would raise ValueError on such lines.
            text, _, _ = line.rstrip('\n').partition('\t')
            for token in text.lower().split(' '):
                word_counts['count'][token] += 1
    return word_counts
|
|
|
|
|
2020-04-02 18:29:06 +02:00
|
|
|
def main():
    """Set up the state for a gradient-descent style training run.

    Builds the token counts from ``train/in.tsv``, creates a list of random
    integer weights and defines the hyper-parameters. The optimisation loop
    itself is not implemented yet (see the commented-out ``while`` at the
    end), so the function currently has no result and returns ``None``.

    Raises:
        OSError: If ``train/in.tsv`` cannot be opened.
    """
    # --------------- initialization ---------------------------------
    # Local name; it does not rebind the module-level `vocabulary`.
    vocabulary = define_vocabulary('train/in.tsv')
    vocabulary_size = len(vocabulary['count'])

    # vocabulary_size + 1 weights, each a random integer in
    # [0, vocabulary_size]. NOTE(review): the extra slot is presumably a
    # bias term -- confirm.
    weights = [
        random.randrange(0, vocabulary_size + 1)
        for _ in range(vocabulary_size + 1)
    ]

    # Hyper-parameters and loop state for the not-yet-written training loop
    # below; they are intentionally kept although nothing reads them yet.
    precision = 0.00001
    learning_rate = 0.001
    prev_step_size = 1
    max_iterations = vocabulary_size
    current_iteration = 0
    readed_words = read_input("train/in.tsv")

    # --------------- prediction -------------------------------------
    # TODO: implement the loop sketched below.
    #while (prev_step_size>precision and current_iteration<max_iterations):
|
2020-04-02 20:01:33 +02:00
|
|
|
|
2020-04-02 18:29:06 +02:00
|
|
|
|
|
|
|
# Run the entry point only when this file is executed as a script, so that
# importing the module (e.g. to reuse its helpers) does not start reading
# the training data.
if __name__ == "__main__":
    main()
|
|
|
|
|