#!/usr/bin/env python3
"""An LSTM language model trained on sentence pairs.

Only command-line parsing and data loading are implemented here so far;
no model is built or trained by this module yet.
"""
# File path according to the originating patch: src/train.py

import argparse
import re
from collections import Counter
from typing import Iterable, List, Tuple

# Matches every character that is neither a lowercase ASCII letter nor a
# space. Compiled once because it is applied to every line of both inputs.
_NON_LETTER_RE = re.compile(r"[^a-z ]")


def clear_line(string: str, target: str) -> Tuple[str, str]:
    """Normalize a source/target pair of lines.

    Both strings are lowercased and every character other than ``a-z`` and
    the space is removed. That includes digits, punctuation and the trailing
    newline of a line read from a file.

    Args:
        string: Raw source line.
        target: Raw target line.

    Returns:
        A ``(cleaned_string, cleaned_target)`` tuple.
    """
    return (
        _NON_LETTER_RE.sub("", string.lower()),
        _NON_LETTER_RE.sub("", target.lower()),
    )


def read_clear_data(
    in_file_path: str, exptected_file_path: str
) -> Tuple[List[str], List[str]]:
    """Read two line-aligned text files and return their cleaned lines.

    The files are consumed in lockstep; line *i* of one file is paired with
    line *i* of the other and both are passed through :func:`clear_line`.
    If the files differ in length, the surplus lines of the longer file are
    ignored (``zip`` stops at the shorter input).

    Args:
        in_file_path: Path of the file holding the source lines.
        exptected_file_path: Path of the file holding the target lines.
            (The misspelling is kept so keyword callers keep working.)

    Returns:
        A ``(source_data, target_data)`` tuple of equally long lists of
        cleaned strings.

    Raises:
        OSError: If either file cannot be opened.
    """
    print("Reading data")
    source_data = []
    target_data = []
    with open(in_file_path) as in_file, open(exptected_file_path) as exp_file:
        for string, target in zip(in_file, exp_file):
            string, target = clear_line(string, target)
            source_data.append(string)
            target_data.append(target)
    return source_data, target_data


def create_dict(data: Iterable[str]) -> Counter:
    """Count how often each whitespace-separated token occurs in *data*.

    NOTE(review): the loop body was missing in the original code, so
    word-level counting (and returning the ``Counter``) is an assumption
    based on the cleaned text containing only letters and spaces. Confirm
    the intended granularity before relying on it.

    Args:
        data: Iterable of text lines.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    counter = Counter()
    for line in data:
        counter.update(line.split())
    return counter


def main() -> None:
    """Parse the command line and load the cleaned source/target data."""
    parser = argparse.ArgumentParser()
    # Both paths are opened unconditionally below, so fail early with a
    # usage message instead of a TypeError from open(None).
    parser.add_argument('--in_f', required=True)
    parser.add_argument('--exp', required=True)
    # Parsed but not used by any code in this module yet.
    parser.add_argument("--vocab")
    args = parser.parse_args()

    source_data, target_data = read_clear_data(args.in_f, args.exp)


if __name__ == "__main__":
    main()