From e4786e20272138945772a8a2a14bf422f39367e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=9Acigacz?= Date: Mon, 29 May 2023 23:41:36 +0200 Subject: [PATCH] add preprocessing of data before predictions --- application/functions/sentiment.py | 8 +++++++- application/services/sentiment_service.py | 7 ++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/application/functions/sentiment.py b/application/functions/sentiment.py index 5166186..eec19e6 100644 --- a/application/functions/sentiment.py +++ b/application/functions/sentiment.py @@ -1,9 +1,9 @@ from transformers import AutoTokenizer from transformers import pipeline +import re model = 'application/models/sentiment_model' tokenizer = AutoTokenizer.from_pretrained('application/tokenizers/sentiment_tokenizer') -# tokenizer = AutoTokenizer.from_pretrained("sdadas/polish-gpt2-small") def sentiment_prediction(data): pipe = pipeline('text-classification', model=model, tokenizer = tokenizer) @@ -11,6 +11,12 @@ def sentiment_prediction(data): return result +def clear_data(data): + data = [re.sub(r"[^A-Za-zżźćńółęąśŻŹĆĄŚĘŁÓŃ ']+", r"", i) for i in data['sentences']] + data = [i.strip() for i in data] + data = [i.lower() for i in data] + return data + def count_predictions(predictions): l0 = 0 l1 = 0 diff --git a/application/services/sentiment_service.py b/application/services/sentiment_service.py index e86e504..3fda909 100644 --- a/application/services/sentiment_service.py +++ b/application/services/sentiment_service.py @@ -3,15 +3,16 @@ from flask import( jsonify, Blueprint, ) -from application.functions.sentiment import sentiment_prediction, count_predictions +from application.functions.sentiment import sentiment_prediction, count_predictions, clear_data sentiment_service = Blueprint("sentiment_service", __name__) @sentiment_service.route("/get_sentiment_data", methods=['POST']) def get_data(): data = request.get_json() - predicitons = sentiment_prediction(data['sentences']) #predykcje - 
count_labels = count_predictions(predicitons) #dane do wykresu + data_clear = clear_data(data) #czyszczenie danych wejsciowych + predicitons = sentiment_prediction(data_clear) #predykcje + count_labels = count_predictions(predicitons) #dane do wykresu for i in range(0, len(predicitons)): predicitons[i]['sentence'] = data['sentences'][i]