s444507-modelowanie-jezyka/Lab1/clean.py

import pandas
import regex as re
import argparse

# Command-line interface: the script needs the path of the CSV file to clean.
parser = argparse.ArgumentParser()
# Without a path the script cannot do anything useful, so make argparse reject
# a missing --filepath up front instead of failing later when the CSV is read.
parser.add_argument(
  "--filepath",
  required=True,
  help="path to the input CSV file (must contain an 'act_raw_text' column)",
)
args = parser.parse_args()

# Path of the CSV file processed by the script.
FILE_PATH = args.filepath

def is_letter_sentence(text):
  """Tell whether `text` is dominated by letters.

  Counts Unicode letters (`\\p{L}`) and characters that are neither letters
  nor whitespace, and returns True only when the letter count is strictly
  greater than twice the count of those other characters.

  NOTE: `\\p{L}` is a Unicode property class; it relies on `re` being the
  third-party `regex` module, as imported at the top of this file.
  """
  # Raw strings: "\p" and "\s" are not valid Python string escapes, so the
  # non-raw form triggers an invalid-escape warning on modern interpreters.
  letters = len(re.findall(r"\p{L}", text))
  others = len(re.findall(r"[^\p{L}\s]", text))
  return letters > others * 2

def is_asci(text):
  """Return True when `text` has fewer than five characters outside ' '..'~'.

  Every character that is not in the printable ASCII range (space through
  tilde) is counted, which includes control characters such as newlines
  as well as non-ASCII characters.
  """
  outside_printable = sum(1 for ch in text if not " " <= ch <= "~")
  return outside_printable < 5

def filter_line(line):
  """Decide whether a text fragment should be kept.

  A fragment is kept only when it is not None, is longer than 30
  characters, passes `is_letter_sentence` and passes `is_asci`. The checks
  run in that order and stop at the first one that fails.
  """
  if line is None:
    return False
  if len(line) <= 30:
    return False
  if not is_letter_sentence(line):
    return False
  return is_asci(line)

def clean_with_regex(text):
  """Split raw text into fragments, filter them and strip numbering/symbols.

  The text is split on whitespace that follows "<two chars>." and precedes
  either "(<digits>)", "<digit>." or "Article <digits>". Fragments rejected
  by `filter_line` are discarded (this also removes the captured separators
  and the None placeholders that the split produces for its groups). In the
  remaining fragments, numbering markers and then disallowed characters are
  replaced by a single space. The last remaining fragment is dropped.

  Returns a list of cleaned fragments, or an empty list when `text` is not
  something the split accepts (a TypeError is raised, e.g. for None or a
  float).
  """
  split_pattern = r"(?<=..\.)(\s+)(?=\(\d+\))|(?<=..\.)(\s+)(?=\d\.)|(?<=..\.)(\s+)(?=Article \d+)"
  # "(n)" markers and dotted numbering such as "1." or "1.2." before whitespace.
  numbering_pattern = r"(?<=\d)(\(\d+\))(?=\s+)|(\(\d+\)\s+)|(\d+\.)+\s"
  # Anything other than word characters, whitespace, backslash, parentheses,
  # slash and hyphen; any non-ASCII character; and "ex" followed by digits.
  junk_pattern = r"[^\w\d\s\\\)\(\/-]|[^\x00-\x7F]|ex\d+"

  try:
    pieces = re.split(split_pattern, text)
  except TypeError:
    return []

  cleaned = [
    re.sub(junk_pattern, " ", re.sub(numbering_pattern, " ", piece))
    for piece in pieces
    if filter_line(piece)
  ]
  # Everything except the final fragment is returned (an empty list stays empty).
  return cleaned[:-1]

def print_text(text, sort=False):
  """Print each line preceded by a banner with its index, word count and length.

  text: iterable of strings to print.
  sort: when true, the lines are printed in ascending order of their word
        count (words being the pieces obtained by splitting on a single
        space); otherwise the given order is used.
  """
  def word_count(entry):
    return len(entry.split(" "))

  ordered = sorted(text, key=word_count) if sort else text
  for i, line in enumerate(ordered):
    print(f'-----------------LINE {i}, words: {word_count(line)}, length: {len(line)}-----------------')
    print(line)

def save_to_file(paragraph_list, file_name):
  """Append paragraphs to `file_name`, one per line.

  Each paragraph is stripped of surrounding whitespace and lower-cased
  before being written. The file is opened in append mode, so existing
  content is preserved and repeated calls keep adding lines.

  paragraph_list: iterable of strings to write.
  file_name: path of the output file (created if it does not exist).
  """
  # Explicit UTF-8 keeps the output independent of the platform's default
  # encoding; the `with` block closes the file, so no manual close is needed.
  with open(file_name, 'a', encoding='utf-8') as f:
    f.writelines(f"{line.strip().lower()}\n" for line in paragraph_list)


# Script body: read the CSV, clean every entry of its 'act_raw_text' column
# and append the resulting paragraphs to 'out-merged.txt'.
print(f"Cleaning file: {FILE_PATH}")
csv_file = pandas.read_csv(FILE_PATH)
file_directives = csv_file['act_raw_text']
for directive in file_directives:
  paragraphs = clean_with_regex(directive)
  # Remove duplicates within this entry while keeping first-seen order, so the
  # output file is identical from run to run (a set would reorder the lines
  # differently on each run because string hashing is randomised).
  paragraphs = list(dict.fromkeys(paragraphs))
  save_to_file(paragraphs, 'out-merged.txt')
lab1 2023-03-16 02:29:34 +01:00			`import pandas`
			`import regex as re`
lab3 2023-03-29 04:08:59 +02:00			`import argparse`
lab1 2023-03-16 02:29:34 +01:00
			`parser=argparse.ArgumentParser()`
			`parser.add_argument("--filepath",)`
			`args=parser.parse_args()`

			`FILE_PATH = args.filepath`

			`def is_letter_sentence(text):`
			`return len(re.findall("\p{L}", text)) > len(re.findall("[^\p{L}\s]", text))*2`

			`def is_asci(text):`
			`nonasci = len(re.findall("[^ -~]", text))`
			`return nonasci < 5`

			`def filter_line(line):`
			`return line is not None and len(line) > 30 and is_letter_sentence(line) and is_asci(line)`

			`def clean_with_regex(text):`
lab3 2023-03-29 04:08:59 +02:00			`# text = str(text).encode("ascii", "ignore").decode("utf-8")`
			`regex_pattern = r"(?<=..\.)(\s+)(?=\(\d+\))\|(?<=..\.)(\s+)(?=\d\.)\|(?<=..\.)(\s+)(?=Article \d+)"`
lab1 2023-03-16 02:29:34 +01:00			`try:`
			`out = re.split(regex_pattern, text)`
			`except TypeError as e:`
			`return []`
			`out = list(filter(lambda item: filter_line(item), out))`
lab3 2023-03-29 04:08:59 +02:00			`out = list(map(lambda item: re.sub(r"(?<=\d)(\(\d+\))(?=\s+)\|(\(\d+\)\s+)\|(\d+\.)+\s", " ", item), out))`
			`out = list(map(lambda item: re.sub(r"[^\w\d\s\\\)\(\/-]\|[^\x00-\x7F]\|ex\d+", " ", item), out))`
lab1 2023-03-16 02:29:34 +01:00			`if out:`
			`out.pop(len(out)-1)`
			`return out`

			`def print_text(text, sort=False):`
			`if sort:`
			`text = sorted(text, key=lambda item: len(item.split(" ")))`
			`for i, line in enumerate(text):`
			`print(f'-----------------LINE {i}, words: {len(line.split(" "))}, length: {len(line)}-----------------')`
			`print(line)`

			`def save_to_file(paragraph_list, file_name):`
			`with open(file_name, 'a') as f:`
			`for line in paragraph_list:`
statistics script 2023-03-22 04:32:34 +01:00			`f.write("%s\n" % line.strip().lower())`
lab1 2023-03-16 02:29:34 +01:00			`f.close()`


			`print(f"Cleaning file: {FILE_PATH}")`
			`csv_file = pandas.read_csv(FILE_PATH)`
			`file_directives = csv_file['act_raw_text']`
			`for direcrive in file_directives:`
			`paragraphs = clean_with_regex(direcrive)`
			`paragraphs = [*set(paragraphs)]`
			`save_to_file(paragraphs, f'out-merged.txt')`