diff --git a/Dockerfile b/Dockerfile
index 680f6aa..d0097ad 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:20.04
+FROM ubuntu:latest
 
 RUN apt update && apt install -y python3 python3-pip
 
diff --git a/main.py b/main.py
index 37887bb..dc82408 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,37 @@
 import string
 import pandas as pd
 from sklearn.model_selection import train_test_split
+import nltk
+from nltk.corpus import stopwords
+
+# Download the stopword corpus only when it is missing, so repeated runs
+# (and offline runs with the corpus already installed) skip the network call.
+try:
+    nltk.data.find("corpora/stopwords")
+except LookupError:
+    nltk.download("stopwords")
+
+# Translation table that deletes every ASCII punctuation character; built
+# once at import time instead of on every remove_punct() call.
+_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
+
+# English stopwords as a set for O(1) membership tests.
+stop = set(stopwords.words("english"))
+
+
+def remove_punct(text):
+    """Return ``text`` with all ``string.punctuation`` characters removed."""
+    return text.translate(_PUNCT_TABLE)
+
+
+def remove_stopwords(text):
+    """Return ``text`` lower-cased, without English stopwords.
+
+    The input is split on whitespace and the surviving words are re-joined
+    with single spaces, so the original spacing is not preserved.
+    """
+    lowered = (word.lower() for word in text.split())
+    return " ".join(word for word in lowered if word not in stop)
 
 
 def main():