import random
import re
from collections import defaultdict


def define_vocabulary(file_to_learn_new_words):
    # Count word occurrences in the training file (one "text<TAB>timestamp" per line).
    # Note: train() and prediction() tokenize with tokenize_list, so only tokens
    # that both appear here and survive that cleanup ever receive a weight.
    word_counts = {'count': defaultdict(int)}
    with open(file_to_learn_new_words, encoding='utf-8') as in_file:
        for line in in_file:
            text, timestamp = line.rstrip('\n').split('\t')
            for token in text.lower().split(' '):
                word_counts['count'][token] += 1
    return word_counts


def tokenize_list(string_input):
    # Normalize one line of text and split it into lowercase word tokens.
    text = string_input.replace('\\n', ' ')       # literal "\n" sequences in the data
    text = re.sub(r'http\S+', ' ', text)          # URLs
    text = re.sub(r'/[a-z]/', ' ', text)          # /x/-style path fragments
    text = re.sub(r'[^a-z]', ' ', text)           # keep lowercase letters only
    text = re.sub(r'\s{2,}', ' ', text)           # collapse whitespace runs
    text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', ' ', text)  # drop words of 1-3 letters
    text = text.lstrip()
    # After the [^a-z] pass only letters and spaces remain, so this split is
    # effectively a whitespace split; the full separator list from the original
    # cleanup pipeline is kept for safety.
    words = re.split(r';+|,+|\*+|\n+| +|_+|%+|\t+|\[+|\]+|\.+|\(+|\)+|\++|\\+|/+'
                     r'|[0-9]+|#+|\'+|"+|-+|=+|&+|:+|\?+|!+|\^+|·+', text)
    regex = re.compile(r'http|^[a-zA-Z]$|org')
    filtered_values = [word for word in words if not regex.match(word)]
    filtered_values[:] = (value.lower() for value in filtered_values if len(value) != 0)
    return filtered_values


def read_words(input_path):
    # Alternative vocabulary builder based on tokenize_list (not used by main()).
    vocabulary = {'count': defaultdict(int)}
    with open(input_path, encoding='utf-8') as infile:
        for line in infile:
            for token in tokenize_list(line):
                vocabulary['count'][token] += 1
    return vocabulary


def train(vocabulary, input_train, expected_train):
    # Fit one weight per vocabulary word with stochastic gradient descent on
    # the squared error between the weighted token sum and the 0/1 label.
    learning_rate = 0.00001
    words_vocabulary = {}
    with open(input_train, encoding='utf-8') as input_file, \
            open(expected_train, encoding='utf-8') as expected_file:
        for line, exp in zip(input_file, expected_file):
            words_vocabulary[line] = int(exp)

    weights = {}
    for i in vocabulary['count'].keys():
        weights[i] = random.uniform(-0.01, 0.01)
    weight = dict(weights)  # best weights seen so far
    error = 10.0            # best scaled loss seen so far
    loss_sum = 0.0
    iteration = 0
    # vocabulary has the single key 'count', so this caps training at 1001 epochs
    max_iteration = len(vocabulary) + 1000

    while iteration < max_iteration:
        # One SGD pass over the training set.
        for line, expected in words_vocabulary.items():
            tokens = tokenize_list(line)
            y_hat = 0.0
            for token in tokens:
                if token in vocabulary['count']:
                    y_hat += weights[token] * tokens.count(token)
            # Nudge each seen token's weight against the prediction error.
            delta = (y_hat - expected) * learning_rate
            for token in tokens:
                if token in vocabulary['count']:
                    weights[token] -= tokens.count(token) * delta
            loss_sum += (y_hat - expected) ** 2
        # Keep the weights from the epoch with the lowest scaled loss.
        if error > (loss_sum / 1000):
            weight = dict(weights)
            error = loss_sum / 1000
        loss_sum = 0.0
        iteration += 1
    return weight, vocabulary


def prediction(input_path, output_path, weights, vocabulary):
    # Label each input line 1 if the weighted token sum is positive, else 0.
    with open(input_path, encoding='utf-8') as input_file, \
            open(output_path, 'w+', encoding='utf-8') as output_file:
        for line in input_file:
            tokens = tokenize_list(line)
            y_hat = 0.0
            for token in tokens:
                if token in vocabulary['count']:
                    y_hat += weights[token] * tokens.count(token)
            output_file.write('1\n' if y_hat > 0.0 else '0\n')


def main():
    vocabulary = define_vocabulary('train/in.tsv')
    weights, words = train(vocabulary, 'train/in.tsv', 'train/expected.tsv')
    prediction('dev-0/in.tsv', 'dev-0/out.tsv', weights, words)
    prediction('test-A/in.tsv', 'test-A/out.tsv', weights, words)


if __name__ == '__main__':
    main()