From a587a380124bf7e2667675258ee909d4081f8c4c Mon Sep 17 00:00:00 2001 From: "sadurska@trui.pl" Date: Sun, 16 May 2021 22:03:44 +0200 Subject: [PATCH] Multipipeline #wip --- Dockerfile | 2 +- main.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 680f6aa..d0097ad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:20.04 +FROM ubuntu:latest RUN apt update && apt install -y python3 python3-pip diff --git a/main.py b/main.py index 37887bb..dc82408 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,20 @@ import string import pandas as pd from sklearn.model_selection import train_test_split +import nltk +nltk.download('stopwords') +from nltk.corpus import stopwords + + +def remove_punct(text): + translator = str.maketrans("", "", string.punctuation) + return text.translate(translator) + + +stop = set(stopwords.words("english")) +def remove_stopwords(text): + filtered_words = [word.lower() for word in text.split() if word.lower() not in stop] + return " ".join(filtered_words) def main():