commit 6a24c1a4bc3e5b895432d7c82b3fff06d3fa71ee Author: Mateusz Date: Sun Jun 9 16:41:14 2024 +0200 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..accd1fd --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +twitter_training.csv +twitter_validation.csv +twitter-entity-sentiment-analysis.zip \ No newline at end of file diff --git a/464913.docx b/464913.docx new file mode 100644 index 0000000..665fe82 Binary files /dev/null and b/464913.docx differ diff --git a/6_Projekt.ipynb b/6_Projekt.ipynb new file mode 100644 index 0000000..96d86ae --- /dev/null +++ b/6_Projekt.ipynb @@ -0,0 +1,1576 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "5K66MlDpZDnE" + }, + "source": [ + "## Analiza sentymentu w opiniach z Twitter'a\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OaqaYfQFZDnH" + }, + "source": [ + "### Download dataset and prepare data\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "23k83t7RNCJa" + }, + "source": [ + "#### Installation of packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "I5pSpk6PNCJb", + "outputId": "3f30ecd9-104a-496a-fd52-447f4d64e814" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (2.0.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.4)\n", + "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.1)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.25.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", + "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (1.2.2)\n", + "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.25.2)\n", + "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.11.4)\n", + "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.4.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (3.5.0)\n", + "Requirement already satisfied: emoji in /usr/local/lib/python3.10/dist-packages (2.12.1)\n", + "Requirement already satisfied: typing-extensions>=4.7.0 in /usr/local/lib/python3.10/dist-packages (from emoji) (4.12.1)\n", + "Requirement already satisfied: gensim in /usr/local/lib/python3.10/dist-packages (4.3.2)\n", + "Requirement already satisfied: numpy>=1.18.5 in /usr/local/lib/python3.10/dist-packages (from gensim) (1.25.2)\n", + "Requirement already satisfied: scipy>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from gensim) (1.11.4)\n", + "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from gensim) (6.4.0)\n" + ] + } + ], + "source": [ + "%pip install pandas\n", + "%pip install scikit-learn\n", + "%pip install emoji\n", + "%pip install gensim" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FA_aZGAkNCJd" + }, + "source": [ + "#### Importing libraries\n" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "id": "yQvOCaX2NCJd" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "import emoji\n", + "from gensim.utils import simple_preprocess" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gp8ITdbPNCJe" + }, + "source": [ + "#### Download the dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DlcNiu4UNCJe", + "outputId": "015b3ad1-6b9d-4845-dd98-0b0c085b12c9" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Dataset URL: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis\n", + "License(s): CC0-1.0\n", + "twitter-entity-sentiment-analysis.zip: Skipping, found more recently modified local copy (use --force to force download)\n" + ] + } + ], + "source": [ + "!kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yU4XFDrUNCJf" + }, + "source": [ + "#### Unzip the dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "G2gaml-MNCJf", + "outputId": "e327c071-a0cd-480f-92d3-66388fd4dfcb" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Archive: twitter-entity-sentiment-analysis.zip\n", + " inflating: twitter_training.csv \n", + " inflating: twitter_validation.csv \n" + ] + } + ], + "source": [ + "!unzip -o twitter-entity-sentiment-analysis.zip" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bBO6YHwyNCJg" + }, + "source": [ + "#### Load the dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "id": "9KlnXJTtNCJg" + }, + "outputs": [], + "source": [ + "cols = [\"tweetid\", \"entity\", \"sentiment\", \"content\"]\n", + "twitter_training = pd.read_csv(\"twitter_training.csv\", names=cols)\n", + "twitter_validation = pd.read_csv(\"twitter_validation.csv\", names=cols)\n", + "dataset = pd.concat([twitter_training, twitter_validation])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XIslo9YQNCJg" + }, + "source": [ + "#### Info about the dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rnh5-0SZNCJh", + "outputId": "99319b5c-f4e2-4aee-e963-13e8d2e938ee" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Index: 75682 entries, 0 to 999\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 tweetid 75682 non-null int64 \n", + " 1 entity 75682 non-null object\n", + " 2 sentiment 75682 non-null object\n", + " 3 content 74996 non-null object\n", + "dtypes: int64(1), object(3)\n", + "memory usage: 2.9+ MB\n" + ] + } + ], + "source": [ + "dataset.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rpHNMU57NCJh", + "outputId": "576fba81-c5fc-47ee-aae9-4f1734081e97" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(75682, 4)" + ] + }, + "metadata": {}, + "execution_count": 104 + } + ], + "source": [ + "dataset.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eylMuu0GNCJj", + "outputId": "d04a8e0a-42ac-4f70-f277-5b9300e97016" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "sentiment\n", + "Negative 22808\n", + "Positive 21109\n", + "Neutral 18603\n", + "Irrelevant 13162\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 105 + } + ], + "source": [ + "dataset[\"sentiment\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fm7H57JINCJj", + "outputId": "6af989a7-c3e7-4666-afef-c2859265d027" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "tweetid 0\n", + "entity 0\n", + "sentiment 0\n", + "content 686\n", + "dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 106 + } + ], + "source": [ + "dataset.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AF_ZNH6pNCJk", + "outputId": "f3191e1e-1176-4c08-9c3e-31eee38020d8" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "3217" + ] + }, + "metadata": {}, + "execution_count": 107 + } + ], + "source": [ + "dataset.duplicated().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LeIs8ceHNCJl" + }, + "source": [ + "#### Prepare the dataset\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GJfxQkbWNCJl" + }, + "source": [ + "##### Drop tweetid and entity columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": { + "id": "X3GwAqSQNCJl" + }, + "outputs": [], + "source": [ + "dataset = dataset.drop(columns=[\"tweetid\", \"entity\"], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WpBDzbx6NCJm" + }, + "source": [ + "##### Drop null values\n" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": { + "id": "ixlmP6cwNCJm" + }, + "outputs": [], + "source": [ + "dataset.dropna(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Z_0UNES2NCJm" + }, + "source": [ + "##### Remove emojis\n" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": { + "id": "I9Mr8rAQNCJm" + }, + "outputs": [], + "source": [ + "dataset[\"content\"] = dataset[\"content\"].apply(\n", + " lambda x: emoji.replace_emoji(x, replace=\"\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wLptSqrzNCJm" + }, + "source": [ + "##### Simple Preprocess\n" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": { + "id": "gw8HC9XBNCJm" + }, + "outputs": [], + "source": [ + "dataset[\"content\"] = dataset[\"content\"].apply(lambda x: \" \".join(simple_preprocess(x)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vayfwdnkNCJm" + }, + "source": [ + "##### Drop null values\n" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": { + "id": "6r1_Hk1JNCJn" + }, + "outputs": [], + "source": [ + "dataset.dropna(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k2aDiHxrNCJn" + }, + "source": [ + "##### Drop duplicates\n" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": { + "id": "56YaoLvjNCJn" + }, + "outputs": [], + "source": [ + "dataset.drop_duplicates(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "exgWXEmNNCJn" + }, + "source": [ + "#### Info about the dataset after cleaning\n" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1oaFMRqANCJn", + "outputId": "05560ac6-9dd5-4397-8730-59239af28fc6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Index: 65839 entries, 0 to 991\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sentiment 65839 non-null object\n", + " 1 content 65839 non-null object\n", + "dtypes: object(2)\n", + "memory usage: 1.5+ MB\n" + ] + } + ], + "source": [ + "dataset.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "92f8IAAINCJo", + "outputId": "383826e9-6f8b-4e66-c7f9-2efacb8a5c96" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(65839, 2)" + ] + }, + "metadata": {}, + "execution_count": 115 + } + ], + "source": [ + "dataset.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "F7a05XcCNCJo", + "outputId": "4b189aa3-8df7-44be-aa20-9066d4cde04a" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "sentiment\n", + "Negative 20147\n", + "Positive 17868\n", + "Neutral 16193\n", + "Irrelevant 11631\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 116 + } + ], + "source": [ + "dataset[\"sentiment\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GG3Qgk44NCJo", + "outputId": "4959a695-513d-47e0-cbe9-b05d149478cf" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "sentiment 0\n", + "content 0\n", + "dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 117 + } + ], + "source": [ + "dataset.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "u5g9cVa1NCJo", + "outputId": "214e37d0-71b0-4616-fb37-2b47e365ee14" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0" + ] + }, + "metadata": {}, + "execution_count": 118 + } + ], + "source": [ + "dataset.duplicated().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eVZuSi-SNCJo" + }, + "source": [ + "#### Split the dataset into training and testing sets\n" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": { + "id": "BCy_q1GHNCJp" + }, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(\n", + " dataset[\"content\"], dataset[\"sentiment\"], test_size=0.2, random_state=0\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-qSKofcjNCJt", + "outputId": "d2e09e82-4174-4e87-a0be-5e9158668733" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "((52671,), (13168,), (52671,), (13168,))" + ] + }, + "metadata": {}, + "execution_count": 120 + } + ], + "source": [ + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UmMEl5AYNCJt" + }, + "source": [ + "### TD-IDF - Logistic Regression\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2_scDhXqZDnJ" + }, + "source": [ + "#### Importing libraries\n" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": { + "id": "ugm_fVSiZDnK" + }, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "X8DY5eStNCJu" + }, + "source": [ + "#### Text Vectorization Using TF-IDF\n" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": { + "id": "IBAy8zjcNCJu" + }, + "outputs": [], + "source": [ + "vectorizer = TfidfVectorizer()\n", + "X_train_tfidf = vectorizer.fit_transform(X_train)\n", + "X_test_tfidf = vectorizer.transform(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rS5pptZINCJu" + }, + "source": [ + "#### Training a Logistic Regression model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 75 + }, + "id": "m3tmiTWVNCJu", + "outputId": "e9372b78-9ea9-4a4a-8289-f3e9ee6ba511" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "LogisticRegression(max_iter=1000)" + ], + "text/html": [ + "
LogisticRegression(max_iter=1000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ] + }, + "metadata": {}, + "execution_count": 123 + } + ], + "source": [ + "model = LogisticRegression(solver=\"lbfgs\", penalty=\"l2\", max_iter=1000)\n", + "model.fit(X_train_tfidf, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GiY_P6PKNCJu" + }, + "source": [ + "#### Predicting\n" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": { + "id": "CJ_9qh6ONCJu" + }, + "outputs": [], + "source": [ + "y_pred = model.predict(X_test_tfidf)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "koeb78PsNCJu" + }, + "source": [ + "#### Classification report\n" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hsABx8mJNCJv", + "outputId": "c4c23ca6-c88a-4db9-fe66-36a7febd3594" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " Irrelevant 0.82 0.70 0.75 2304\n", + " Negative 0.80 0.86 0.83 4024\n", + " Neutral 0.79 0.74 0.77 3169\n", + " Positive 0.78 0.82 0.80 3671\n", + "\n", + " accuracy 0.79 13168\n", + " macro avg 0.80 0.78 0.79 13168\n", + "weighted avg 0.79 0.79 0.79 13168\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y17ccTy1NCJv" + }, + "source": [ + "### TD-IDF - Random Forest Classifier\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yyk_baF-NCJv" + }, + "source": [ + "#### Importing libraries\n" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": { + "id": "-xjXLHpQNCJv" + }, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Tl6mOx92NCJw" + }, + "source": [ + "#### Text Vectorization Using TF-IDF\n" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": { + "id": "bE9h15BcNCJw" + }, + "outputs": [], + "source": [ + "vectorizer = TfidfVectorizer()\n", + "X_train_tfidf = vectorizer.fit_transform(X_train)\n", + "X_test_tfidf = vectorizer.transform(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cNUGJWXINCJw" + }, + "source": [ + "#### Training a Random Forest Classifier model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 75 + }, + "id": "WTrPtycbNCJw", + "outputId": "e97b690c-f698-414a-cc40-4843d12e2073" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "RandomForestClassifier()" + ], + "text/html": [ + "
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ] + }, + "metadata": {}, + "execution_count": 128 + } + ], + "source": [ + "model = RandomForestClassifier(criterion=\"gini\")\n", + "model.fit(X_train_tfidf, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HPlAbp8PNCJx" + }, + "source": [ + "#### Predicting\n" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": { + "id": "0ePAr1uZNCJx" + }, + "outputs": [], + "source": [ + "y_pred = model.predict(X_test_tfidf)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oPnSaSB-NCJx" + }, + "source": [ + "#### Classification report\n" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gqRJRLcKNCJx", + "outputId": "b4b1bbfb-5b76-4936-cb74-e200dc72e1c6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " Irrelevant 0.95 0.87 0.91 2304\n", + " Negative 0.92 0.95 0.93 4024\n", + " Neutral 0.94 0.91 0.93 3169\n", + " Positive 0.90 0.94 0.92 3671\n", + "\n", + " accuracy 0.93 13168\n", + " macro avg 0.93 0.92 0.92 13168\n", + "weighted avg 0.93 0.93 0.92 13168\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "18jz3yhuNCJy" + }, + "source": [ + "### Word2Vec - LSTM\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0gizZeVCNCJy" + }, + "source": [ + "#### Installation of packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Sy0x-OwPNCJy", + "outputId": "9815f9df-920a-48c0-f8c3-c174b2544ee4" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: tensorflow in /usr/local/lib/python3.10/dist-packages (2.15.0)\n", + "Requirement already satisfied: absl-py>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.4.0)\n", + "Requirement already satisfied: astunparse>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.6.3)\n", + "Requirement already satisfied: flatbuffers>=23.5.26 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (24.3.25)\n", + "Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.5.4)\n", + "Requirement already satisfied: google-pasta>=0.1.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.2.0)\n", + "Requirement already satisfied: h5py>=2.9.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.9.0)\n", + "Requirement already satisfied: libclang>=13.0.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (18.1.1)\n", + "Requirement already satisfied: ml-dtypes~=0.2.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.2.0)\n", + "Requirement already satisfied: numpy<2.0.0,>=1.23.5 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.25.2)\n", + "Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.3.0)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from tensorflow) (24.0)\n", + "Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (3.20.3)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from tensorflow) (67.7.2)\n", + "Requirement already satisfied: six>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.16.0)\n", + "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.4.0)\n", + "Requirement already satisfied: typing-extensions>=3.6.6 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (4.12.1)\n", + "Requirement already satisfied: wrapt<1.15,>=1.11.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.14.1)\n", + "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (0.37.0)\n", + "Requirement already satisfied: grpcio<2.0,>=1.24.3 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (1.64.1)\n", + "Requirement already satisfied: tensorboard<2.16,>=2.15 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.15.2)\n", + "Requirement already satisfied: tensorflow-estimator<2.16,>=2.15.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.15.0)\n", + "Requirement already satisfied: keras<2.16,>=2.15.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow) (2.15.0)\n", + "Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from astunparse>=1.6.0->tensorflow) (0.43.0)\n", + "Requirement already satisfied: google-auth<3,>=1.6.3 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow) (2.27.0)\n", + "Requirement already satisfied: google-auth-oauthlib<2,>=0.5 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow) (1.2.0)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow) (3.6)\n", + "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow) (2.31.0)\n", + "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow) (0.7.2)\n", + "Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.16,>=2.15->tensorflow) (3.0.3)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow) (5.3.3)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow) (0.4.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow) (4.9)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from google-auth-oauthlib<2,>=0.5->tensorboard<2.16,>=2.15->tensorflow) (1.3.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorboard<2.16,>=2.15->tensorflow) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorboard<2.16,>=2.15->tensorflow) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorboard<2.16,>=2.15->tensorflow) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorboard<2.16,>=2.15->tensorflow) (2024.6.2)\n", + "Requirement already satisfied: MarkupSafe>=2.1.1 in /usr/local/lib/python3.10/dist-packages (from werkzeug>=1.0.1->tensorboard<2.16,>=2.15->tensorflow) (2.1.5)\n", + "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow) (0.6.0)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.10/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<2,>=0.5->tensorboard<2.16,>=2.15->tensorflow) (3.2.2)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (1.25.2)\n" + ] + } + ], + "source": [ + "%pip install tensorflow\n", + "%pip install numpy" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nyh33SHPNCJy" + }, + "source": [ + "#### Importing libraries\n" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": { + "id": "WGINcl6pNCJy" + }, + "outputs": [], + "source": [ + "from gensim.models import Word2Vec\n", + "import numpy as np\n", + "import tensorflow as tf\n", + "from sklearn.calibration import LabelEncoder" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JrQ66Il0NCJy" + }, + "source": [ + "#### Function to convert text to Word2Vec vectors\n" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": { + "id": "3MEhNRL0NCJz" + }, + "outputs": [], + "source": [ + "def text_to_vector(text, word2vec, vector_size):\n", + " words = simple_preprocess(text)\n", + " text_vector = np.zeros(vector_size)\n", + " word_count = 0\n", + " for word in words:\n", + " if word in word2vec.wv:\n", + " text_vector += word2vec.wv[word]\n", + " word_count += 1\n", + " if word_count > 0:\n", + " text_vector /= word_count\n", + " return text_vector" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JH_t6_ZtNCJz" + }, + "source": [ + "#### Tokenize texts\n" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": { + "id": "KVzBCEbWNCJz" + }, + "outputs": [], + "source": [ + "tokenized_text = dataset[\"content\"].apply(lambda x: x.split())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o_fLSc_uNCJz" + }, + "source": [ + "#### Vector size parameter\n" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": { + "id": "sLY4J1nTNCJ0" + }, + "outputs": [], + "source": [ + "vector_size = 100" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XIMHHrRqNCJ0" + }, + "source": [ + "#### Train Word2Vec model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": { + "id": "UysosPtiNCJ1" + }, + "outputs": [], + "source": [ + "model_word2vec = Word2Vec(\n", + " tokenized_text, window=5, min_count=2, workers=4, vector_size=vector_size, epochs=20\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xufmoLDlNCJ1" + }, + "source": [ + "#### Convert texts to Word2Vec vectors\n" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": { + "id": "QrY3vXcXNCJ1" + }, + "outputs": [], + "source": [ + "train_vectors = np.array(\n", + " [text_to_vector(text, model_word2vec, vector_size) for text in X_train]\n", + ")\n", + "\n", + "test_vectors = np.array(\n", + " [text_to_vector(text, model_word2vec, vector_size) for text in X_test]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-3hER130NCJ2" + }, + "source": [ + "#### Find the maximum sequence length in the training set\n" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": { + "id": "gcxbUr4lNCJ2" + }, + "outputs": [], + "source": [ + "max_len = max(len(seq) for seq in train_vectors)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ezJ1OadFNCJ3" + }, + "source": [ + "#### Pad sequences to the same length\n" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": { + "id": "1oGWbRZtNCJ3" + }, + "outputs": [], + "source": [ + "X_train_emb = tf.keras.preprocessing.sequence.pad_sequences(\n", + " train_vectors, maxlen=max_len, dtype=\"float32\", padding=\"post\"\n", + ")\n", + "X_test_emb = tf.keras.preprocessing.sequence.pad_sequences(\n", + " test_vectors, maxlen=max_len, dtype=\"float32\", padding=\"post\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LaBGGtm3NCJ4" + }, + "source": [ + "#### Encode labels\n" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": { + "id": "Xe96PgKtNCJ4" + }, + "outputs": [], + "source": [ + "label_encoder = LabelEncoder()\n", + "y_train_enc = label_encoder.fit_transform(y_train)\n", + "y_test_enc = label_encoder.transform(y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P_fI-cZHNCJ4" + }, + "source": [ + "#### Define LSTM model\n" + ] + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "sEUnVQJEP-hy" + } + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": { + "id": "pF5HZSRKNCJ4" + }, + "outputs": [], + "source": [ + "model = tf.keras.Sequential(\n", + " [\n", + " tf.keras.layers.Embedding(input_dim=X_train_emb.shape[1], output_dim=100),\n", + " tf.keras.layers.LSTM(128),\n", + " tf.keras.layers.Dense(64, activation=\"relu\"),\n", + " tf.keras.layers.Dense(32, activation=\"relu\"),\n", + " tf.keras.layers.Dense(4, activation=\"softmax\"),\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YNk8m5lnNCJ4" + }, + "source": [ + "#### Compile the model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": { + "id": "IqDLE1FuNCJ5" + }, + "outputs": [], + "source": [ + "model.compile(\n", + " optimizer=tf.optimizers.Adam(learning_rate=1e-3),\n", + " loss=\"sparse_categorical_crossentropy\",\n", + " metrics=[\"accuracy\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cIRnkLT0NCJ5" + }, + "source": [ + "#### Train the model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QFpStkX9NCJ5", + "outputId": "af188e61-04a4-45e7-e6f3-ef50cba478da" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 1/50\n", + "823/823 [==============================] - 10s 9ms/step - loss: 1.3439 - accuracy: 0.3438\n", + "Epoch 2/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.3261 - accuracy: 0.3678\n", + "Epoch 3/50\n", + "823/823 [==============================] - 6s 8ms/step - loss: 1.3163 - accuracy: 0.3774\n", + "Epoch 4/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.3020 - accuracy: 0.3975\n", + "Epoch 5/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.2904 - accuracy: 0.4119\n", + "Epoch 6/50\n", + "823/823 [==============================] - 8s 9ms/step - loss: 1.2814 - accuracy: 0.4186\n", + "Epoch 7/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.2741 - accuracy: 0.4262\n", + "Epoch 8/50\n", + "823/823 [==============================] - 8s 9ms/step - loss: 1.2667 - accuracy: 0.4325\n", + "Epoch 9/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.2588 - accuracy: 0.4372\n", + "Epoch 10/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.2513 - accuracy: 0.4407\n", + "Epoch 11/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.2451 - accuracy: 0.4450\n", + "Epoch 12/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 1.2365 - accuracy: 0.4491\n", + "Epoch 13/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.2291 - accuracy: 0.4560\n", + "Epoch 14/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.2218 - accuracy: 0.4593\n", + "Epoch 15/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.2144 - accuracy: 0.4636\n", + "Epoch 16/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.2066 - accuracy: 0.4669\n", + "Epoch 17/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.1989 - accuracy: 0.4707\n", + "Epoch 18/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.1887 - accuracy: 0.4759\n", + "Epoch 19/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.1810 - accuracy: 0.4803\n", + "Epoch 20/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.1717 - accuracy: 0.4846\n", + "Epoch 21/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.1631 - accuracy: 0.4883\n", + "Epoch 22/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 1.1533 - accuracy: 0.4948\n", + "Epoch 23/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.1426 - accuracy: 0.4983\n", + "Epoch 24/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.1338 - accuracy: 0.5040\n", + "Epoch 25/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.1229 - accuracy: 0.5075\n", + "Epoch 26/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 1.1126 - accuracy: 0.5125\n", + "Epoch 27/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.1042 - accuracy: 0.5167\n", + "Epoch 28/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 1.0920 - accuracy: 0.5237\n", + "Epoch 29/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.0809 - accuracy: 0.5266\n", + "Epoch 30/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 1.0730 - accuracy: 0.5307\n", + "Epoch 31/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.0628 - accuracy: 0.5357\n", + "Epoch 32/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.0536 - accuracy: 0.5422\n", + "Epoch 33/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.0399 - accuracy: 0.5480\n", + "Epoch 34/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.0350 - accuracy: 0.5503\n", + "Epoch 35/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.0237 - accuracy: 0.5553\n", + "Epoch 36/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 1.0217 - accuracy: 0.5550\n", + "Epoch 37/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 1.0073 - accuracy: 0.5633\n", + "Epoch 38/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 0.9927 - accuracy: 0.5703\n", + "Epoch 39/50\n", + "823/823 [==============================] - 6s 8ms/step - loss: 0.9848 - accuracy: 0.5732\n", + "Epoch 40/50\n", + "823/823 [==============================] - 6s 8ms/step - loss: 0.9786 - accuracy: 0.5748\n", + "Epoch 41/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 0.9735 - accuracy: 0.5774\n", + "Epoch 42/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 0.9633 - accuracy: 0.5839\n", + "Epoch 43/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 0.9530 - accuracy: 0.5873\n", + "Epoch 44/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 0.9506 - accuracy: 0.5893\n", + "Epoch 45/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 0.9364 - accuracy: 0.5958\n", + "Epoch 46/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 0.9260 - accuracy: 0.6006\n", + "Epoch 47/50\n", + "823/823 [==============================] - 7s 9ms/step - loss: 0.9257 - accuracy: 0.6008\n", + "Epoch 48/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 0.9155 - accuracy: 0.6048\n", + "Epoch 49/50\n", + "823/823 [==============================] - 7s 8ms/step - loss: 0.9103 - accuracy: 0.6066\n", + "Epoch 50/50\n", + "823/823 [==============================] - 6s 7ms/step - loss: 0.8999 - accuracy: 0.6122\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 155 + } + ], + "source": [ + "model.fit(X_train_emb, y_train_enc, epochs=50, batch_size=64)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CAoGoGZ7NCJ5" + }, + "source": [ + "#### Predicting\n" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LCtJlNP9NCJ5", + "outputId": "15942b31-270a-4817-9b55-f5a48663dbed" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "412/412 [==============================] - 2s 4ms/step\n" + ] + } + ], + "source": [ + "y_pred = model.predict(X_test_emb)\n", + "\n", + "y_preds_argmax = []\n", + "for i in range(len(y_pred)):\n", + " y_preds_argmax.append(y_pred[i].argmax())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ee3GIUHJNCJ6" + }, + "source": [ + "#### Classification report\n" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MMCmZDLgNCJ6", + "outputId": "67ad7bb2-386c-432f-dd95-0c3105a13f0c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.32 0.20 0.25 2304\n", + " 1 0.46 0.62 0.53 4024\n", + " 2 0.44 0.43 0.44 3169\n", + " 3 0.45 0.39 0.42 3671\n", + "\n", + " accuracy 0.44 13168\n", + " macro avg 0.42 0.41 0.41 13168\n", + "weighted avg 0.43 0.44 0.42 13168\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_test_enc, y_preds_argmax))" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file