add preprocessing of data before predictions

2023-05-29 23:41:36 +02:00 · 2023-05-29 23:41:36 +02:00 · e4786e2027
commit e4786e2027
parent e43a18241c
2 changed files with 11 additions and 4 deletions
--- a/application/functions/sentiment.py
+++ b/application/functions/sentiment.py
@ -1,9 +1,9 @@
 from transformers import AutoTokenizer
 from transformers import pipeline
+import re

 model = 'application/models/sentiment_model'
 tokenizer = AutoTokenizer.from_pretrained('application/tokenizers/sentiment_tokenizer')
-# tokenizer = AutoTokenizer.from_pretrained("sdadas/polish-gpt2-small")

 def sentiment_prediction(data):
    pipe = pipeline('text-classification', model=model, tokenizer = tokenizer)
@ -11,6 +11,12 @@ def sentiment_prediction(data):
    
    return result

+def clear_data(data):  # Clean the sentences found under data['sentences'] and return them as a new list of strings.
+    data = [re.sub(r"[^A-Za-zżźćńółęąśŻŹĆĄŚĘŁÓŃ ']+", r"", i) for i in data['sentences']]  # delete every run of characters other than ASCII/Polish letters, spaces and apostrophes (digits and punctuation are removed, not replaced by a space)
+    data = [i.strip() for i in data]  # trim leading and trailing whitespace; `data` is now the list, no longer the input mapping
+    data = [i.lower() for i in data]  # lowercase every sentence
+    return data  # one cleaned string per input sentence, in the original order
+
 def count_predictions(predictions):
    l0 = 0
    l1 = 0
--- a/application/services/sentiment_service.py
+++ b/application/services/sentiment_service.py
@ -3,14 +3,15 @@ from flask import(
    jsonify, 
    Blueprint,
    )
-from application.functions.sentiment import sentiment_prediction, count_predictions
+from application.functions.sentiment import sentiment_prediction, count_predictions, clear_data

 sentiment_service = Blueprint("sentiment_service", __name__)    

@sentiment_service.route("/get_sentiment_data", methods=['POST'])
 def get_data():
    data = request.get_json()
-    predicitons = sentiment_prediction(data['sentences']) #predykcje
+    data_clear = clear_data(data)                       #czyszczenie danych wejsciowych
+    predicitons = sentiment_prediction(data_clear)      #predykcje
    count_labels = count_predictions(predicitons)       #dane do wykresu

    for i in range(0, len(predicitons)):