This commit is contained in:
Bartosz Karwacki 2022-04-03 20:07:45 +02:00
parent 2be2a96fe1
commit e49b8826cb
3 changed files with 17700 additions and 17712 deletions

File diff suppressed because it is too large Load Diff

16
run.py
View File

@ -30,20 +30,8 @@ model = defaultdict(lambda: defaultdict(lambda: 0))
def clean(text): def clean(text):
text = str(text) text = str(text).lower().replace("-\\n", "").replace("\\n", " ")
text = ( return re.sub(r"\p{P}", "", text)
unicodedata.normalize("NFKD", text)
.encode("ascii", "ignore")
.decode("utf-8", "ignore")
)
text = re.sub("<.*?>", " ", text)
text = text.translate(str.maketrans(" ", " ", string.punctuation))
text = re.sub("[^a-zA-Z]", " ", text)
text = re.sub("\n", " ", text)
text = text.lower()
text = " ".join(text.split())
return text
def train_model(data): def train_model(data):
for _, row in data.iterrows(): for _, row in data.iterrows():

File diff suppressed because it is too large Load Diff