s444507-modelowanie-jezyka/Lab1/clean.py

56 lines
1.7 KiB
Python
Raw Permalink Normal View History

2023-03-16 02:29:34 +01:00
import pandas
import regex as re
2023-03-29 04:08:59 +02:00
import argparse
2023-03-16 02:29:34 +01:00
# Command-line interface: the CSV file to clean is passed via --filepath.
parser = argparse.ArgumentParser(
    description="Clean the 'act_raw_text' column of a CSV file into text paragraphs."
)
parser.add_argument(
    "--filepath",
    # Required: without it FILE_PATH would be None and the failure would only
    # surface later, when the CSV is read.
    required=True,
    help="Path to the input CSV file containing an 'act_raw_text' column.",
)
args = parser.parse_args()
FILE_PATH = args.filepath
def is_letter_sentence(text):
    """Return True when letters clearly dominate *text*.

    The text passes when the count of Unicode letters is strictly greater
    than twice the count of characters that are neither letters nor
    whitespace (digits, punctuation, symbols).

    Note: the Unicode letter property class used here is a feature of the
    ``regex`` package, which this file imports under the name ``re``.
    """
    # Raw strings are required: "\p" and "\s" are not valid Python string
    # escapes and trigger a SyntaxWarning on recent interpreters otherwise.
    letter_count = len(re.findall(r"\p{L}", text))
    other_count = len(re.findall(r"[^\p{L}\s]", text))
    return letter_count > other_count * 2
def is_asci(text):
    """Tell whether *text* is (almost) entirely printable ASCII.

    Returns True when fewer than five characters fall outside the range
    from space to tilde; control characters such as newlines count as
    outside that range.
    """
    outside_printable = sum(1 for _ in re.finditer("[^ -~]", text))
    return outside_printable < 5
def filter_line(line):
    """Decide whether a text fragment should be kept.

    A fragment is kept only when it is not None, is longer than 30
    characters, and passes both is_letter_sentence and is_asci.
    """
    if line is None:
        return False
    if len(line) <= 30:
        return False
    return is_letter_sentence(line) and is_asci(line)
def clean_with_regex(text):
    """Split a raw text into cleaned fragments.

    The text is split at whitespace that follows a period (itself preceded
    by two characters) and precedes one of: a parenthesised number, a digit
    followed by a period, or "Article <number>". Because the split pattern
    uses capturing groups, the split result also contains the captured
    whitespace and None placeholders; filter_line is relied upon to discard
    those along with any fragment it rejects.

    Each surviving fragment then has parenthesised numbers and dotted
    numbering replaced by a space, followed by replacing with a space every
    character that is not a word character, whitespace, backslash,
    parenthesis, slash or hyphen, every non-ASCII character, and every
    "ex<digits>" token.

    The last surviving fragment is dropped before returning.

    Returns an empty list when *text* is not something the regex engine can
    split (TypeError), e.g. presumably a missing value from the CSV.
    """
    boundary = r"(?<=..\.)(\s+)(?=\(\d+\))|(?<=..\.)(\s+)(?=\d\.)|(?<=..\.)(\s+)(?=Article \d+)"
    try:
        pieces = re.split(boundary, text)
    except TypeError:
        return []

    cleaned = []
    for piece in pieces:
        if not filter_line(piece):
            continue
        # First pass: strip numbering markers such as "(12) " or "3. ".
        piece = re.sub(r"(?<=\d)(\(\d+\))(?=\s+)|(\(\d+\)\s+)|(\d+\.)+\s", " ", piece)
        # Second pass: blank out disallowed symbols, non-ASCII and "ex<digits>".
        piece = re.sub(r"[^\w\d\s\\\)\(\/-]|[^\x00-\x7F]|ex\d+", " ", piece)
        cleaned.append(piece)

    # Discard the final fragment (no-op on an empty list).
    del cleaned[-1:]
    return cleaned
def print_text(text, sort=False):
    """Print each entry of *text* under a banner with its index and size.

    The banner shows the entry's position, its word count (split on single
    spaces) and its character length. When *sort* is true the entries are
    printed in ascending order of word count; otherwise in the given order.
    """
    def word_count(entry):
        return len(entry.split(" "))

    entries = sorted(text, key=word_count) if sort else text
    for i, line in enumerate(entries):
        print(f'-----------------LINE {i}, words: {len(line.split(" "))}, length: {len(line)}-----------------')
        print(line)
def save_to_file(paragraph_list, file_name):
    """Append every paragraph to *file_name*, one per line.

    Each entry is stripped of surrounding whitespace and lower-cased before
    being written. The file is opened in append mode, so repeated calls
    accumulate output instead of overwriting it.
    """
    # The with-block closes the file on exit, so no explicit close() is
    # needed. The encoding is pinned so output does not depend on the
    # platform's locale default.
    with open(file_name, 'a', encoding='utf-8') as f:
        for line in paragraph_list:
            f.write(f"{line.strip().lower()}\n")
# Driver: read the CSV, clean every raw act text and append the resulting
# paragraphs to the merged output file.
print(f"Cleaning file: {FILE_PATH}")
csv_file = pandas.read_csv(FILE_PATH)
file_directives = csv_file['act_raw_text']
for directive in file_directives:
    # dict.fromkeys drops duplicate paragraphs while keeping their original
    # order; a set would make the output order vary between runs.
    paragraphs = list(dict.fromkeys(clean_with_regex(directive)))
    save_to_file(paragraphs, 'out-merged.txt')