#!/usr/bin/env python3
"""Bag-of-words linear classifier for tab-separated text files.

Pipeline: build a vocabulary from the training text, fit one weight per
vocabulary word with stochastic gradient descent, then write a ``0``/``1``
prediction for every line of the dev and test inputs.
"""
import fileinput
import math
import pickle
import random
import re
from collections import defaultdict

# Matches "scheme://host[.more][/path...]" so links are removed before splitting.
_URL_RE = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')

# Characters that separate tokens; digits are added in the pattern below.
# Two fixes relative to the previous pattern:
#   * the backslash is now really a delimiter (the old non-raw '\\+' compiled
#     to the regex '\+', i.e. a second alternative for the plus sign);
#   * the mis-encoded two-character sequence that stood where a middle dot
#     belongs is restored to U+00B7.
#     NOTE(review): assumes the garbled characters were a mis-decoded U+00B7.
_DELIMITER_CHARS = ";,*\n _%\t[].()+\\/#'\"-=&:?!^\u00b7"
_DELIMITER_RE = re.compile('[0-9' + re.escape(_DELIMITER_CHARS) + ']+')

# Used with .match(), so it is anchored at the start of each piece: drops
# pieces beginning with "http" or "org" and pieces that are one ASCII letter.
_NOISE_RE = re.compile(r'http|^[a-zA-Z]$|org')

# Number of SGD steps over which the squared error is averaged.
_LOSS_WINDOW = 1000


def define_vocabulary(file_to_learn_new_words):
    """Count token occurrences in the text column of a TSV file.

    Each line is expected to hold the text in its first tab-separated column;
    any further columns are ignored. Tokens are produced by
    ``tokenize_list`` so that the vocabulary keys are the same strings that
    ``train`` and ``prediction`` later look up.

    Returns:
        ``{'count': defaultdict(int)}`` mapping token -> number of occurrences.
    """
    word_counts = {'count': defaultdict(int)}
    counts = word_counts['count']
    with open(file_to_learn_new_words, encoding='utf-8') as in_file:
        for line in in_file:
            # Only the first column is text; tolerate lines with no tab or
            # with extra columns instead of failing on tuple unpacking.
            text = line.rstrip('\n').split('\t')[0]
            for token in tokenize_list(text):
                counts[token] += 1
    return word_counts


def tokenize_list(string_input):
    """Split raw text into lower-cased word tokens.

    Steps: turn literal backslash-n sequences into spaces, strip URLs, split
    on punctuation/whitespace/digits, drop empty pieces and noise pieces
    (see ``_NOISE_RE``), and lower-case what is left.

    Returns:
        A list of tokens in order of appearance (duplicates preserved).
    """
    text = string_input.replace('\\n', ' ')
    text = _URL_RE.sub('', text)
    return [
        piece.lower()
        for piece in _DELIMITER_RE.split(text)
        # The noise filter runs before lower-casing, so it is case-sensitive.
        if piece and not _NOISE_RE.match(piece)
    ]


def train(vocabulary, input_train, expected_train, *, learning_rate=0.001,
          learning_precision=0.00001, max_iteration=None):
    """Fit one weight per vocabulary word with stochastic gradient descent.

    Args:
        vocabulary: mapping as returned by ``define_vocabulary``.
        input_train: path to the training input, one example per line.
        expected_train: path to the integer labels, aligned line by line
            with ``input_train``.
        learning_rate: step size of each update.
        learning_precision: training stops once the magnitude of the last
            update step is not greater than this value.
        max_iteration: upper bound on the number of updates; defaults to the
            number of vocabulary words.

    Returns:
        ``(weights, vocabulary)`` where ``weights`` maps every vocabulary
        word to a float and ``vocabulary`` is the object passed in.

    Raises:
        ValueError: if a label line is not an integer.
    """
    known_words = vocabulary['count']

    # Every vocabulary word starts with a small random weight.
    weights = {word: random.uniform(-0.01, 0.01) for word in known_words}

    # Tokenise once up front; examples with no known token carry no gradient
    # for this model and are skipped.
    examples = []
    with open(input_train, encoding='utf-8') as input_file, \
            open(expected_train, encoding='utf-8') as expected_file:
        for line, expected in zip(input_file, expected_file):
            tokens = [t for t in tokenize_list(line) if t in weights]
            if tokens:
                examples.append((tokens, int(expected)))

    if max_iteration is None:
        # The previous code used len(vocabulary), which is the number of
        # top-level keys of the mapping (always 1), not the vocabulary size.
        max_iteration = len(known_words)

    if not examples:
        return weights, vocabulary

    best_weights = None
    best_loss = 10.0
    loss_sum = 0.0
    delta = 1
    iteration = 0

    # NOTE(review): the body of this loop was lost in the source (the text
    # between "iteration<" and ">(loss_sum/1000)" is missing). The update
    # below is a reconstructed least-squares SGD step; confirm it against
    # the original implementation.
    # NOTE(review): prediction() thresholds the score at 0.0 while the labels
    # here are used as given; confirm the intended label encoding.
    while delta > learning_precision and iteration < max_iteration:
        tokens, label = random.choice(examples)
        y_hat = sum(weights[token] for token in tokens)
        residual = y_hat - label
        step = learning_rate * residual
        # A token occurring k times is visited k times, which yields the
        # k * step update its feature count calls for.
        for token in tokens:
            weights[token] -= step
        delta = abs(step)
        loss_sum += residual ** 2
        iteration += 1

        if iteration % _LOSS_WINDOW == 0:
            mean_loss = loss_sum / _LOSS_WINDOW
            if mean_loss < best_loss:
                # Copy: the previous code stored a reference, so the "best"
                # weights silently followed every later update.
                best_weights = dict(weights)
                best_loss = mean_loss
            loss_sum = 0.0

    # With fewer than _LOSS_WINDOW updates no snapshot exists yet; fall back
    # to the current weights instead of returning an empty mapping.
    if best_weights is None:
        best_weights = weights
    return best_weights, vocabulary


def prediction(input, output, weights, vocabulary):
    """Write one ``0``/``1`` label per line of ``input`` to ``output``.

    The score of a line is the sum of the weights of its known tokens, one
    term per occurrence; a score above 0.0 yields ``1``, anything else ``0``.

    Args:
        input: path of the file to classify, one example per line.
        output: path of the file to (over)write with the labels.
        weights: mapping token -> weight, as returned by ``train``.
        vocabulary: mapping as returned by ``define_vocabulary``.
    """
    known_words = vocabulary['count']
    with open(input, encoding='utf-8') as input_file, \
            open(output, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            # The previous code multiplied each term by token.count(token),
            # which is always 1; summing per occurrence already weights
            # repeated tokens by their count.
            y_hat = sum(
                weights.get(token, 0.0)
                for token in tokenize_list(line)
                if token in known_words
            )
            output_file.write('1\n' if y_hat > 0.0 else '0\n')


def main():
    """Train on ``train/`` and write predictions for ``dev-0`` and ``test-A``."""
    vocabulary = define_vocabulary('train/in.tsv')
    weights, words = train(vocabulary, 'train/in.tsv', 'train/expected.tsv')
    prediction('dev-0/in.tsv', 'dev-0/out.tsv', weights, words)
    prediction('test-A/in.tsv', 'test-A/out.tsv', weights, words)


if __name__ == '__main__':
    main()