55 lines
1.6 KiB
Python
55 lines
1.6 KiB
Python
|
import pandas
|
||
|
import regex as re
|
||
|
import argparse, sys
|
||
|
|
||
|
# --- CLI setup --------------------------------------------------------------
# Parse the single --filepath argument naming the CSV file to clean.
parser = argparse.ArgumentParser(
    description="Clean the 'act_raw_text' column of a CSV into paragraphs."
)
# required=True: previously a missing --filepath silently left FILE_PATH as
# None and the script only failed later, deep inside pandas.read_csv, with a
# cryptic error.  Failing at parse time gives a clear usage message instead.
parser.add_argument(
    "--filepath",
    required=True,
    help="Path to the input CSV file (must contain an 'act_raw_text' column).",
)
args = parser.parse_args()

# Path of the CSV file being cleaned; read by the main loop at the bottom.
FILE_PATH = args.filepath
|
||
|
|
||
|
def is_letter_sentence(text):
    """Heuristic: does *text* look like natural-language prose?

    Returns True when the number of Unicode letters is more than twice the
    number of non-letter, non-whitespace characters (punctuation, digits,
    symbols).  Relies on the third-party ``regex`` module (imported as ``re``
    at file level) for the ``\\p{L}`` Unicode property class.
    """
    # Raw strings: the originals relied on "\p" and "\s" surviving as literal
    # backslashes inside plain strings, which raises an invalid-escape
    # SyntaxWarning on Python 3.12+ (and will eventually be an error).
    letters = len(re.findall(r"\p{L}", text))
    non_letters = len(re.findall(r"[^\p{L}\s]", text))
    return letters > non_letters * 2
|
||
|
|
||
|
def is_asci(text):
    """Return True when *text* has fewer than 5 non-printable-ASCII chars.

    "Printable ASCII" means the range U+0020 (space) through U+007E (tilde),
    exactly what the original character class ``[^ -~]`` excluded.
    """
    outside_ascii = sum(1 for ch in text if not (" " <= ch <= "~"))
    return outside_ascii < 5
|
||
|
|
||
|
def filter_line(line):
    """Decide whether *line* is worth keeping as a cleaned paragraph.

    A line passes when it is not None, is longer than 30 characters, looks
    like natural-language text (is_letter_sentence) and is close to pure
    ASCII (is_asci).
    """
    if line is None:
        return False
    if len(line) <= 30:
        return False
    return is_letter_sentence(line) and is_asci(line)
|
||
|
|
||
|
def clean_with_regex(text):
    """Split one raw act text into a list of cleaned paragraph strings.

    Steps:
      1. Strip all non-ASCII characters.
      2. Split on whitespace that follows a sentence end ("..") and precedes
         a numbered marker: "(1)", "2.", or "Article N".
      3. Keep only fragments that look like real sentences (filter_line).
      4. Strip leading list markers such as "(3) " or "1. " from each fragment.
      5. Drop the final fragment (trailing boilerplate).

    Returns a (possibly empty) list of paragraph strings; [] if the split
    fails on a non-string value.
    """
    # Round-trip through ASCII to discard any non-ASCII characters.  The
    # original decoded as "utf-8", which works only because the bytes are
    # already pure ASCII; decoding as "ascii" says what is actually meant.
    text = str(text).encode("ascii", "ignore").decode("ascii")

    # Raw strings avoid invalid-escape SyntaxWarnings (\. \s \d \( are not
    # string escapes).  Each alternative matches the whitespace between a
    # sentence end and one kind of numbered marker.
    split_pattern = (
        r"(?<=..\.)(\s+)(?=\(\d+\))"
        r"|(?<=..\.)(\s+)(?=\d\.)"
        r"|(?<=..\.)(\s+)(?=Article \d+)"
    )
    try:
        fragments = re.split(split_pattern, text)
    except TypeError:
        # Defensive: a non-string value that somehow survived str() above.
        return []

    # re.split with capturing groups interleaves the matched separators (or
    # None for non-participating groups) into the result; filter_line drops
    # those along with short or non-prose fragments.
    fragments = [frag for frag in fragments if filter_line(frag)]

    # Strip leading numbering markers like "(2) " or "1. " from each kept
    # paragraph.
    marker_pattern = r"(?<=\d)(\(\d+\))(?=\s+)|(\(\d+\)\s+)|(\d+\.)+\s"
    fragments = [re.sub(marker_pattern, "", frag) for frag in fragments]

    # The last fragment is unconditionally dropped (trailing boilerplate);
    # pop() is the idiomatic form of pop(len(out) - 1).
    if fragments:
        fragments.pop()
    return fragments
|
||
|
|
||
|
def print_text(text, sort=False):
    """Print every line of *text*, each preceded by a numbered header.

    When *sort* is True, lines are printed shortest-first by word count.
    """
    lines = sorted(text, key=lambda s: len(s.split(" "))) if sort else text
    for idx, line in enumerate(lines):
        word_count = len(line.split(" "))
        header = (
            f"-----------------LINE {idx}, words: {word_count}, "
            f"length: {len(line)}-----------------"
        )
        print(header)
        print(line)
|
||
|
|
||
|
def save_to_file(paragraph_list, file_name):
    """Append each paragraph to *file_name*, one stripped line per paragraph.

    The file is opened in append mode so repeated calls (one per directive
    in the main loop) accumulate into the same output file.
    """
    # The context manager closes the file on exit; the original's explicit
    # f.close() inside the with-block was redundant.
    with open(file_name, "a") as f:
        for line in paragraph_list:
            f.write("%s\n" % line.strip())
|
||
|
|
||
|
|
||
|
# --- Main -------------------------------------------------------------------
# Read the CSV named on the command line, clean each 'act_raw_text' entry
# into paragraphs, and append the de-duplicated paragraphs to out-merged.txt.
print(f"Cleaning file: {FILE_PATH}")

csv_file = pandas.read_csv(FILE_PATH)
file_directives = csv_file['act_raw_text']

for directive in file_directives:  # fixed typo: was 'direcrive'
    paragraphs = clean_with_regex(directive)
    # De-duplicate while preserving paragraph order.  The original used
    # [*set(...)], whose ordering varies run to run under string-hash
    # randomization; dict.fromkeys keeps first-seen order deterministically.
    paragraphs = list(dict.fromkeys(paragraphs))
    # Plain literal: the original's f-prefix had no placeholders.
    save_to_file(paragraphs, 'out-merged.txt')
|