"""Score tab-separated text lines with a pickled linear model.

Each input line is cleaned with ``tokenize_list``, turned into token counts
with ``calculate_words``, scored against the loaded weights, and written out
as ``1`` or ``0`` (one prediction per input line).
"""
import pickle
import re

# Default location of the pickled ``(weights, word, vocabulary)`` triple.
MODEL_PATH = 'model_linear_reg.pkl'

# Ordered clean-up passes used by ``tokenize_list``.  The order matters: each
# pass runs on the output of the previous one.
_CLEANUP_STEPS = (
    # ``scheme://host/path`` style URLs are removed outright.
    (re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'), ''),
    # Runs of the two-character sequence backslash + "n".
    (re.compile(r'\\n+'), " "),
    # Anything still starting with "http" up to the next whitespace.
    (re.compile(r'http\S+'), " "),
    # A single lowercase letter between slashes, e.g. "/r/".
    (re.compile(r'\/[a-z]\/'), " "),
    # Every character that is not a lowercase ASCII letter.
    (re.compile(r'[^a-z]'), " "),
    # Collapse whitespace runs into one space.
    (re.compile(r'\s{2,}'), " "),
    # Words of 1-3 characters delimited by non-word characters (or at the
    # very start).  Matches cannot overlap, so a short word that directly
    # follows a removed one survives.
    (re.compile(r'\W\w{1,3}\W|\A\w{1,3}\W'), " "),
    # Drop a single leading whitespace character.
    (re.compile(r'^\s'), ""),
)


def calculate_words(linetxt):
    """Count how often each space-separated token occurs in *linetxt*.

    Args:
        linetxt: Text to split on single spaces.

    Returns:
        A dict mapping token -> occurrence count.  The empty-string key is
        always present and always has the value 1, regardless of how many
        empty tokens the split produced.
    """
    word_counts = {}
    for token in linetxt.split(' '):
        word_counts[token] = word_counts.get(token, 0) + 1
    # The empty token is pinned to a count of 1 (kept from the original
    # implementation so scores stay identical).
    word_counts[''] = 1
    return word_counts


def tokenize_list(string_input):
    """Normalise raw text into a space-separated string of lowercase words.

    Literal backslash-n sequences become spaces, URLs are removed, every
    character outside ``a-z`` becomes a space, whitespace is collapsed and
    short (1-3 character) words are dropped.  Callers are expected to
    lowercase the text first; uppercase letters are otherwise replaced by
    spaces like any other non ``a-z`` character.

    Args:
        string_input: Text to clean.

    Returns:
        The cleaned text as a single string.
    """
    text = string_input.replace('\\n', ' ')
    for pattern, replacement in _CLEANUP_STEPS:
        text = pattern.sub(replacement, text)
    return text


def prediction(input, output, model_path=MODEL_PATH):
    """Write a 0/1 prediction for every line of *input* to *output*.

    Args:
        input: Path to a UTF-8 text file; the text to score is everything
            before the first tab on each line.
        output: Path of the file to (over)write with one ``1`` or ``0`` per
            input line.
        model_path: Path to the pickled ``(weights, word, vocabulary)``
            triple.  Defaults to ``model_linear_reg.pkl`` in the current
            working directory.

    Raises:
        OSError: If any of the three files cannot be opened.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load model
    # files that come from a trusted source.
    with open(model_path, 'rb') as model_f:
        weights, word, vocabulary = pickle.load(model_f)

    # Output is opened first (as before) so it is created/truncated even if
    # the input turns out to be missing; both files are closed on any exit.
    with open(output, 'w') as output_f, \
            open(input, encoding='utf-8') as input_f:
        for line in input_f:
            # Only the first column is scored.  partition() keeps blank
            # lines and lines with zero or several tabs from aborting the
            # run, so every input line still yields an output line.
            text = line.rstrip('\n').partition('\t')[0]
            cleaned = tokenize_list(text.lower())
            line_vocabulary = calculate_words(cleaned)

            y_hat = weights[0]  # weights[0] acts as the intercept
            # NOTE(review): this iterates over every occurrence, so a token
            # seen k times contributes k * count * weight.  Left unchanged
            # because the pickled model was presumably fitted with the same
            # featurisation -- confirm against the training script.
            for token in cleaned.split(' '):
                if token in vocabulary:
                    y_hat += weights[word[token]] * line_vocabulary[token]

            output_f.write("1\n" if y_hat > 0.5 else "0\n")


def main():
    """Run predictions for the dev-0 and test-A data sets."""
    prediction("dev-0/in.tsv", "dev-0/out.tsv")
    prediction("test-A/in.tsv", "test-A/out.tsv")


if __name__ == "__main__":
    main()