From 2f46e045b149397d79875378330cb21559e3ee0e Mon Sep 17 00:00:00 2001
From: s152483
Date: Fri, 10 Apr 2020 22:16:59 +0000
Subject: [PATCH] =?UTF-8?q?Usu=C5=84=20'program=5Flr.py'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 program_lr.py | 98 ---------------------------------------------------
 1 file changed, 98 deletions(-)
 delete mode 100644 program_lr.py

diff --git a/program_lr.py b/program_lr.py
deleted file mode 100644
index 7c938ee..0000000
--- a/program_lr.py
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/usr/bin/env python3
-import pickle
-import fileinput
-import random
-import math
-import random
-import re
-from _collections import defaultdict
-
-def define_vocabulary(file_to_learn_new_words):
-    word_counts = {'count': defaultdict(int)}
-    with open(file_to_learn_new_words, encoding='utf-8') as in_file:
-        for line in in_file:
-            text, timestamp = line.rstrip('\n').split('\t')
-            tokens = text.lower().split(' ')
-            for token in tokens:
-                word_counts['count'][token] += 1
-    in_file.close()
-    return word_counts
-
-def tokenize_list(string_input):
-    words=[]
-    string=string_input.replace('\\n',' ')
-    text=re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', string)
-    string=''
-    for word in text:
-        string+=word
-    words=re.split(';+|,+|\*+|\n+| +|\_+|\%+|\t+|\[+|\]+|\.+|\(+|\)+|\++|\\+|\/+|[0-9]+|\#+|\'+|\"+|\-+|\=+|\&+|\:+|\?+|\!+|\^+|\·+',string)
-    regex=re.compile(r'http|^[a-zA-Z]$|org')
-    filtered_values=[
-        word
-        for word in words if not regex.match(word)
-    ]
-    filtered_values[:] = (
-        value.lower()
-        for value in filtered_values if len(value)!=0
-    )
-    return filtered_values
-
-def train(vocabulary,input_train,expected_train):
-    learning_rate=0.001
-    learning_precision=0.00001
-    words_vocabulary={}
-    with open(input_train,encoding='utf-8') as input_file, open(expected_train,encoding='utf-8') as expected_file:
-        for line, exp in zip(input_file,expected_file):
-            words_vocabulary[line]=int(exp)
-    weights={}
-    weight={}
-    delta=1
-    iteration=0
-    loss_sum=0.0
-    error=10.0
-    max_iteration=len(vocabulary)
-    for i in vocabulary['count'].keys():
-        weights[i]=random.uniform(-0.01,0.01)
-    while delta>learning_precision and iteration<max_iteration:
-        if(error>(loss_sum/1000)):
-            weight=weights
-            error=loss_sum/1000
-        loss_sum=0.0
-        iteration += 1
-    input_file.close()
-    expected_file.close()
-    return weight, vocabulary
-
-def prediction(input,output,weights,vocabulary):
-    with open(input,encoding='utf-8') as input_file, open(output,'w+',encoding='utf-8') as output:
-        for line in input_file:
-            y_hat=0
-            tokens=tokenize_list(line)
-            for token in tokens:
-                if token in vocabulary['count'].keys():
-                    y_hat += weights[token] * (token.count(token))
-            if y_hat>0.0:
-                output.write('1\n')
-            else:
-                output.write('0\n')
-    output.close()
-    input_file.close()
-
-vocabulary=define_vocabulary('train/in.tsv');
-weights, words = train(vocabulary,'train/in.tsv','train/expected.tsv')
-prediction('dev-0/in.tsv','dev-0/out.tsv',weights,words)
-prediction('test-A/in.tsv','test-A/out.tsv',weights,words)