# paranormal-or-skeptic/code_regression.py

import random
import re
from _collections import defaultdict

def define_vocabulary(file_to_learn_new_words):
    """Count space-separated, lower-cased tokens in a tab-separated file.

    Only the first tab-separated column of each line is read. It is
    lower-cased and split on single spaces, so punctuation stays attached to
    the words and consecutive spaces yield empty-string tokens.

    Args:
        file_to_learn_new_words: Path of a UTF-8 text file whose lines carry
            the text in the first tab-separated column.

    Returns:
        A dict with the single key ``'count'`` mapping to a
        ``defaultdict(int)`` of token -> number of occurrences.
    """
    word_counts = {'count': defaultdict(int)}
    counts = word_counts['count']
    with open(file_to_learn_new_words, encoding='utf-8') as in_file:
        for line in in_file:
            # Take the first column only, so lines with no tab or with more
            # than one tab are still counted instead of raising ValueError.
            text = line.rstrip('\n').split('\t', 1)[0]
            for token in text.lower().split(' '):
                counts[token] += 1
    # The with-block has already closed the file; no explicit close() needed.
    return word_counts

def tokenize_list(string_input):
    """Turn a raw line of text into a list of lower-case word tokens.

    Processing steps, in order:

    1. Literal backslash-n sequences (two characters) become spaces and the
       text is lower-cased.
    2. URLs with a scheme (``xxx://...``) are removed, as is anything else
       starting with ``http``.
    3. Single-letter path markers such as ``/r/`` are removed.
    4. Every character outside ``a-z`` becomes a space and runs of
       whitespace are collapsed.
    5. Words of one to three letters that are delimited on both sides (or
       that start the text) are removed. Matches cannot overlap, so the
       second of two adjacent short words survives this step.
    6. The text is split on whitespace; single-letter words and words that
       *start with* ``http`` or ``org`` are dropped.

    Args:
        string_input: The text to tokenize.

    Returns:
        A list of tokens consisting only of the letters ``a-z``.
    """
    # Lower-case before the a-z filter below; otherwise upper-case letters
    # would be replaced by spaces ("Hello" -> "ello") instead of being kept.
    text = string_input.replace('\\n', ' ').lower()
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'\/[a-z]\/', ' ', text)
    text = re.sub(r'[^a-z]', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', ' ', text)
    # From here on the text holds only a-z and spaces, so a plain whitespace
    # split is enough and never yields empty strings.
    unwanted = re.compile(r'http|^[a-zA-Z]$|org')
    return [word for word in text.split() if not unwanted.match(word)]

def read_words(input_path):
    """Count the tokens that ``tokenize_list`` extracts from a text file.

    Args:
        input_path: Path of a UTF-8 text file; every line is tokenized with
            ``tokenize_list``.

    Returns:
        A dict with the single key ``'count'`` mapping to a
        ``defaultdict(int)`` of token -> number of occurrences.
    """
    vocabulary = {'count': defaultdict(int)}
    # Count into the 'count' table that is actually created above; indexing
    # any other key of the outer dict raises KeyError.
    counts = vocabulary['count']
    with open(input_path, encoding='utf-8') as infile:
        for line in infile:
            for token in tokenize_list(line):
                counts[token] += 1
    # The with-block has already closed the file; no explicit close() needed.
    return vocabulary

def train(vocabulary, input_train, expected_train, learning_rate=0.00001,
          learning_precision=0.00001, max_iteration=None):
    """Fit per-token weights of a linear model with stochastic gradient descent.

    A training line is represented by the occurrence counts of its tokens
    (as produced by ``tokenize_list``) that are present in the vocabulary.
    The model output is the count-weighted sum of the token weights and the
    loss is the squared difference to the integer label.

    Each step draws one random training line, computes the prediction and
    moves every involved weight against the gradient. The mean loss is
    evaluated over windows of 1000 steps: the weights of the best window are
    kept, and training stops early once the mean loss changes by less than
    ``learning_precision`` between two consecutive windows.

    Args:
        vocabulary: Dict whose ``'count'`` entry maps every known token to
            its frequency; one weight is created per token.
        input_train: Path of the UTF-8 file with one training text per line.
        expected_train: Path of the UTF-8 file with one integer label per
            line, aligned with ``input_train``.
        learning_rate: Step size of the gradient update.
        learning_precision: Minimal change of the windowed mean loss that
            still counts as progress.
        max_iteration: Upper bound on the number of update steps; defaults
            to the number of vocabulary tokens.

    Returns:
        A tuple ``(weights, vocabulary)``: the token -> weight dict of the
        best loss window (or the latest weights if no window was completed)
        and the vocabulary that was passed in.

    Raises:
        ValueError: If a label line cannot be parsed as an integer.
    """
    with open(input_train, encoding='utf-8') as input_file, \
            open(expected_train, encoding='utf-8') as expected_file:
        # Keyed by the raw line, so a duplicated line keeps its last label.
        labelled_lines = {
            line: int(expected)
            for line, expected in zip(input_file, expected_file)
        }

    weights = {
        token: random.uniform(-0.01, 0.01) for token in vocabulary['count']
    }
    # Materialise the samples once instead of on every iteration.
    samples = list(labelled_lines.items())
    if not samples:
        # Nothing to learn from; random.choice would fail on an empty list.
        return weights, vocabulary
    if max_iteration is None:
        # len(vocabulary) would be the number of top-level keys (always 1).
        max_iteration = len(weights)

    loss_window = 1000
    window_loss = 0.0
    best_loss = float('inf')
    best_weights = None
    previous_mean_loss = None

    for iteration in range(1, max_iteration + 1):
        line, y = random.choice(samples)

        # Count each known token once, so a repeated token contributes
        # weight * count (not weight * count ** 2); this matches how
        # prediction() adds a token's weight once per occurrence.
        counts = defaultdict(int)
        for token in tokenize_list(line):
            if token in weights:
                counts[token] += 1

        y_hat = sum(weights[token] * count for token, count in counts.items())
        residual = y_hat - y
        delta = residual * learning_rate
        for token, count in counts.items():
            weights[token] -= count * delta

        window_loss += residual ** 2.0
        if iteration % loss_window == 0:
            mean_loss = window_loss / loss_window
            window_loss = 0.0
            if mean_loss < best_loss:
                best_loss = mean_loss
                # Copy: the live dict keeps changing on later steps.
                best_weights = dict(weights)
            # Convergence is judged on the windowed mean loss; a single
            # sample's signed step is far too noisy to be a stop signal.
            if (previous_mean_loss is not None
                    and abs(previous_mean_loss - mean_loss) < learning_precision):
                break
            previous_mean_loss = mean_loss

    if best_weights is None:
        # Fewer than one full window was run; return the latest weights
        # rather than an empty dict.
        best_weights = weights
    return best_weights, vocabulary

def prediction(input, output, weights, vocabulary, threshold=0.0):
    """Classify every line of a file and write one label per line.

    The score of a line is the sum of the weights of its tokens (as produced
    by ``tokenize_list``) that are present in the vocabulary, added once per
    occurrence. ``'1'`` is written when the score exceeds ``threshold``,
    otherwise ``'0'``.

    Args:
        input: Path of the UTF-8 file with one text per line.
        output: Path of the file to (over)write with the predicted labels.
        weights: Dict mapping token -> weight.
        vocabulary: Dict whose ``'count'`` entry holds the known tokens.
        threshold: Decision boundary on the score. Defaults to 0.0.
            NOTE(review): if the training labels are 0/1, 0.5 is probably
            the intended boundary - confirm against the data.
    """
    known_tokens = vocabulary['count']
    with open(input, encoding='utf-8') as input_file, \
            open(output, 'w+', encoding='utf-8') as output_file:
        for line in input_file:
            y_hat = 0
            for token in tokenize_list(line):
                if token in known_tokens:
                    # A vocabulary token without a weight contributes
                    # nothing instead of raising KeyError.
                    y_hat += weights.get(token, 0.0)
            output_file.write('1\n' if y_hat > threshold else '0\n')
    # Both files are closed by the with-block.
def main():
    """Build the vocabulary, train the model and write both prediction files.

    The vocabulary and the weights come from ``train/in.tsv`` and
    ``train/expected.tsv``; predictions are written for the ``dev-0`` and
    ``test-A`` inputs, in that order.
    """
    vocabulary = define_vocabulary('train/in.tsv')
    weights, words = train(vocabulary, 'train/in.tsv', 'train/expected.tsv')
    jobs = (
        ('dev-0/in.tsv', 'dev-0/out.tsv'),
        ('test-A/in.tsv', 'test-A/out.tsv'),
    )
    for source_path, target_path in jobs:
        prediction(source_path, target_path, weights, words)

# Run the pipeline only when this file is executed as a script, so that
# importing the module does not start a training run.
if __name__ == "__main__":
    main()


# from collections import defaultdict
# import math
# import pickle
# import re
#
# from pip._vendor.msgpack.fallback import xrange
# import random
#
# vocabulary = []
#
# file_to_save = open("test.tsv", "w", encoding='utf-8')
#
#
# def define_vocabulary(file_to_learn_new_words):
#     word_counts = {'count': defaultdict(int)}
#     with open(file_to_learn_new_words, encoding='utf-8') as in_file:
#         for line in in_file:
#             text, timestamp = line.rstrip('\n').split('\t')
#             tokens = text.lower().split(' ')
#             for token in tokens:
#                 word_counts['count'][token] += 1
#     return word_counts
#
#
# def read_input(file_path):
#     read_word_counts = {'count': defaultdict(int)}
#     with open(file_path, encoding='utf-8') as in_file:
#         for line in in_file:
#             text, timestamp = line.rstrip('\n').split('\t')
#             tokens = text.lower().split(' ')
#             for token in tokens:
#                 read_word_counts['count'][token] += 1
#     return read_word_counts
#
#
# def training(vocabulary, read_input, expected):
#     file_to_write = open(expected, 'w+', encoding='utf8')
#     file_to_write2 = open('out_y_hat.tsv', 'w+', encoding='utf8')
#     learning_rate = 0.00001
#     learning_precision = 0.0001
#     weights = []
#     iteration = 0
#     loss_sum = 0.0
#     ix = 1
#     readed_words_values = []
#     for word in read_input['count']:
#         if word not in vocabulary['count']:
#             read_input['count'][word] = 0
#         readed_words_values.append(read_input['count'][word])
#     for ix in range(0, len(vocabulary['count']) + 1):
#         weights.append(random.uniform(-0.001, 0.001))
#     # max_iteration=len(vocabulary['count'])+1
#     max_iteration = 10000
#     delta = 1
#     while delta>learning_precision and iteration<max_iteration:
#         d, y = random.choice(list(read_input['count'].items()))  # d-word, y-value of
#         y_hat = weights[0]
#         i = 0
#         for word_d in d:
#             if word_d in vocabulary['count'].keys():
#                 # print(vocabulary['count'][d])
#                 y_hat += weights[vocabulary['count'][word_d]] * readed_words_values[i]
#                 i += 1
#             print(f'Y_hat: {y_hat}')
#             file_to_write2.write(f'Y_hat: {y_hat}\n')
#             if y_hat > 0.5:
#                 file_to_write.write('1\n')
#             else:
#                 file_to_write.write('0\n')
#         i = 0
#         delta = (y_hat - y) * learning_rate
#         weights[0] = weights[0] - delta
#         for word_w in d:
#             if word_w in vocabulary['count'].keys():
#                 weights[vocabulary['count'][word_w]] -= readed_words_values[i] * delta
#                 i += 1
#         # print(weights)
#         #print(f'Y: {y}')
#         loss = (y_hat - y) ** 2.0
#         # loss=(y_hat-y)*(y_hat-y)
#         loss_sum += loss
#         if (iteration % 1000 == 0):
#             #print(loss_sum / 1000)
#             iteration = 0
#             loss_sum = 0.0
#         iteration += 1
#         file_to_write.close
#
# def main():
#     vocabulary = define_vocabulary('train/in.tsv')
#     readed_words = read_input('dev-0/in.tsv')
#     readed_words_test_a = read_input('test-A/in.tsv/in.tsv')
#     training(vocabulary, readed_words, 'dev-0/out.tsv')
#     training(vocabulary, readed_words_test_a, 'test-A/out.tsv')
#
#
# # def cost_function(y_hat,y):
# #    loss=(y_hat-y)**2.0
# #    loss_sum+=loss
# #    if loss_counter%1000==0:
# #        print(loss_sum/1000)
# #        loss_counter=0
# #        loss_sum=0.0
#
#
# # def main():
# # --------------- initialization ---------------------------------
# #    vocabulary = define_vocabulary('train/in.tsv')
# #    readed_words=read_input('dev-0/in.tsv')
# #    i=1;
# #    weights=[]
# #    readed_words_values=[]
# #    rangeVocabulary=len(vocabulary['count'])+1
# #    for i in range(rangeVocabulary):
# #        weights.append(random.randrange(0,len(vocabulary['count'])+1))
# #    for word in readed_words['count']:
# #        if word not in vocabulary['count']:
# #            readed_words['count'][word]=0
# #        readed_words_values.append(readed_words['count'][word])
# #    precision=0.00001
# #    learning_rate=0.00001
# #    delta=1
# #    max_iterations=len(vocabulary['count'])+1
# #    current_iteration=0
# #    rangeReadedValues=len(readed_words['count'])+1
# # --------------- prediction -------------------------------------
# #    while (delta>precision and current_iteration<max_iterations):
# #        y=random.choice(readed_words_values)
# #        y_hat=weights[0]
# #        i=0
# #        j=0
# #        for i in range(rangeReadedValues):
# #            y_hat+=weights[i]*y
# #            delta=abs(y_hat-y)*learning_rate
# #            weights[0]=weights[0]-delta
# #        for j in range(rangeVocabulary):
# #            weights[j]-=y*delta
# #        print(delta)
# #        current_iteration+=1
#
#
# main()