"""Clean raw legal-act text from a CSV into paragraph lines for training."""

import argparse

import pandas
import regex as re

# Command-line interface: the only option is the path of the CSV to clean.
parser = argparse.ArgumentParser()
parser.add_argument("--filepath")
args = parser.parse_args()

# Input CSV path, taken straight from the command line.
FILE_PATH = args.filepath
|
|
|
|
|
|
|
|
def is_letter_sentence(text):
    """Return True when *text* is dominated by letters.

    Counts Unicode letters versus non-letter, non-whitespace characters
    (punctuation, digits, symbols); the letters must outnumber the other
    characters by more than a factor of two.  Relies on the third-party
    ``regex`` module for Unicode-property classes.
    """
    # NOTE: the patterns must be raw strings — "\p" is an invalid escape in a
    # plain string literal (SyntaxWarning now, a SyntaxError in future Python).
    letters = len(re.findall(r"\p{L}", text))
    non_letters = len(re.findall(r"[^\p{L}\s]", text))
    return letters > non_letters * 2
|
|
|
|
|
|
|
|
def is_asci(text):
    """Return True when *text* has fewer than 5 characters outside printable ASCII.

    Printable ASCII is the range from space (0x20) through tilde (0x7E);
    everything else — control characters included — counts against the limit.
    """
    outside_printable = sum(1 for ch in text if not (" " <= ch <= "~"))
    return outside_printable < 5
|
|
|
|
|
|
|
|
def filter_line(line):
    """Decide whether a split paragraph is worth keeping.

    A line passes only when it exists, is longer than 30 characters,
    is letter-dominated, and is (almost entirely) printable ASCII.
    """
    if line is None:
        return False
    if len(line) <= 30:
        return False
    return is_letter_sentence(line) and is_asci(line)
|
|
|
|
|
|
|
|
def clean_with_regex(text):
    """Split a raw act text into paragraph candidates and scrub numbering.

    The text is split where a sentence ends and a new numbered clause,
    paragraph, or "Article N" heading begins.  Fragments that fail
    ``filter_line`` are dropped, clause/paragraph numbering is stripped,
    stray non-ASCII and exotic punctuation is blanked out, and the final
    (usually truncated) fragment is discarded.

    Returns a list of cleaned paragraph strings; an empty list when
    *text* is not a string (e.g. NaN from pandas).
    """
    split_pattern = r"(?<=..\.)(\s+)(?=\(\d+\))|(?<=..\.)(\s+)(?=\d\.)|(?<=..\.)(\s+)(?=Article \d+)"

    try:
        fragments = re.split(split_pattern, text)
    except TypeError:
        # Non-string input (typically float NaN from an empty CSV cell).
        return []

    # Keep only substantive, letter-heavy, mostly-ASCII fragments.
    kept = [fragment for fragment in fragments if filter_line(fragment)]

    # Strip clause numbers like "(1) " and leading "1. " style numbering.
    kept = [re.sub(r"(?<=\d)(\(\d+\))(?=\s+)|(\(\d+\)\s+)|(\d+\.)+\s", " ", fragment) for fragment in kept]

    # Blank out exotic punctuation, any non-ASCII byte, and "ex123" markers.
    kept = [re.sub(r"[^\w\d\s\\\)\(\/-]|[^\x00-\x7F]|ex\d+", " ", fragment) for fragment in kept]

    # The last fragment is usually a truncated tail — drop it.
    if kept:
        kept.pop()

    return kept
|
|
|
|
|
|
|
|
def print_text(text, sort=False):
    """Print every paragraph with a separator header for manual inspection.

    When *sort* is true, paragraphs are printed shortest-first by word count.
    """
    lines = sorted(text, key=lambda item: len(item.split(" "))) if sort else text

    for idx, line in enumerate(lines):
        print(f'-----------------LINE {idx}, words: {len(line.split(" "))}, length: {len(line)}-----------------')
        print(line)
|
|
|
|
|
|
|
|
def save_to_file(paragraph_list, file_name):
    """Append each paragraph to *file_name*, one per line, stripped and lower-cased.

    Opens in append mode so repeated calls accumulate lines across acts.
    The original called ``f.close()`` after the ``with`` block, which is
    redundant (the context manager already closed the file); it also relied
    on the locale-dependent default encoding — now fixed to UTF-8.
    """
    with open(file_name, 'a', encoding="utf-8") as f:
        for line in paragraph_list:
            f.write("%s\n" % line.strip().lower())
|
|
|
|
|
|
|
|
|
|
|
|
print(f"Cleaning file: {FILE_PATH}")
|
|
|
|
csv_file = pandas.read_csv(FILE_PATH)
|
|
|
|
file_directives = csv_file['act_raw_text']
|
|
|
|
for direcrive in file_directives:
|
|
|
|
paragraphs = clean_with_regex(direcrive)
|
|
|
|
paragraphs = [*set(paragraphs)]
|
|
|
|
save_to_file(paragraphs, f'out-merged.txt')
|