# %%
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

# %%
# Configuration: everything a reader might want to tune lives here.
TRAIN_PATH = 'train.csv'
TEST_PATH = 'test.csv'
RAW_COLUMNS = ['polarity', 'title', 'text']  # the CSV files have no header row
N_TRAIN = 40000
N_TEST = 10000
SEED = 1


# %%
def load_reviews(path, n_samples: int, seed: int = SEED) -> pd.DataFrame:
    """Read a headerless review CSV and return a prepared random sample.

    The whole preparation is one chain that starts from the file, so
    re-running the cell always yields the same frame. The previous
    cell-by-cell, in-place version shifted the labels a second time when
    its cell was re-run and failed once ``title`` had been dropped.

    Parameters
    ----------
    path : str or path-like
        CSV file whose columns are, in order, polarity, title, text.
    n_samples : int
        Number of rows to draw (without replacement) from the file.
    seed : int
        ``random_state`` passed to ``DataFrame.sample``.

    Returns
    -------
    pandas.DataFrame
        Columns ``polarity`` (raw label minus 1) and ``text`` (title and
        body joined by one space, missing values treated as empty). The
        original row index is preserved.
    """
    return (
        pd.read_csv(path, header=None, names=RAW_COLUMNS)
        .sample(n=n_samples, random_state=seed)
        .assign(
            # Raw labels appear to be 1/2; shifting gives the 0/1 coding the
            # binary metrics below expect (checked by the assert after loading).
            polarity=lambda df: df['polarity'] - 1,
            text=lambda df: df['title'].fillna('') + ' ' + df['text'].fillna(''),
        )
        .drop(columns=['title'])
    )


# %%
train_df = load_reviews(TRAIN_PATH, N_TRAIN)
test_df = load_reviews(TEST_PATH, N_TEST)

# Fail early if the label coding is not what the rest of the notebook assumes.
for _name, _frame in (('train', train_df), ('test', test_df)):
    assert set(_frame['polarity'].unique()).issubset({0, 1}), (
        f"unexpected polarity labels in {_name} sample"
    )

# %%
train_df.head()
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
polaritytext
32813281Excellent home help for parents Volume 1 of Do...
26627210Stay far, far away. I made it through about 6,...
16005440Lost Woods Lost WoodsI didn't really understan...
8152460Renaissance -12, Women's brown suead shoes Rec...
12541781Best Novel I've Read This Year Intrigued by th...
.........
11320081Pleasant, eclectic mix of coffee-shop favorite...
17129541A Valuable Text This is not light reading. It ...
31918270NOT GOOD TO USE A WORKOUT TO MAKE ADVERTICING ...
16923421Good Read David Wellington brings a new twist ...
19447521Edge of Paradise: America in Micronesia B.C. h...
\n", "

40000 rows × 2 columns

# %%
# TF-IDF features: vocabulary and IDF weights are learned on the training
# texts only and then applied unchanged to the test texts.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['text'])


# %%
# Word2Vec features
def tokenize(text: str) -> list:
    """Split ``text`` on runs of whitespace and return the list of tokens.

    NOTE(review): case and attached punctuation are kept, whereas
    TfidfVectorizer lowercases and drops punctuation by default, so the two
    feature pipelines do not see the same vocabulary. Consider aligning them
    before comparing their scores.
    """
    return text.split()


def get_avg_w2v(tokens, model) -> np.ndarray:
    """Return the mean embedding of the tokens known to ``model``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    model : gensim.models.Word2Vec
        Trained model; ``model.wv`` is used for the lookups.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``. If none of the tokens is
        in the model's vocabulary, a zero vector is returned.
    """
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        # Use the model's own dtype: a default float64 zero vector would make
        # the dtype of the stacked feature matrix depend on whether any
        # document happens to contain no in-vocabulary token.
        return np.zeros(model.vector_size, dtype=model.wv.vectors.dtype)
    return np.mean(vectors, axis=0)


# %%
train_df['tokens'] = train_df['text'].apply(tokenize)
test_df['tokens'] = test_df['text'].apply(tokenize)

# The embedding is trained on the training sample only.
# seed=1 is gensim's default, written out so the choice is visible. Exact
# run-to-run reproducibility would additionally need workers=1 and a fixed
# PYTHONHASHSEED (see the gensim Word2Vec documentation).
w2v_model = Word2Vec(
    sentences=train_df['tokens'],
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    seed=1,
)

X_train_w2v = np.array([get_avg_w2v(tokens, w2v_model) for tokens in train_df['tokens']])
X_test_w2v = np.array([get_avg_w2v(tokens, w2v_model) for tokens in test_df['tokens']])

# %%
# Classifiers
log_reg = LogisticRegression(max_iter=1000)
# n_jobs=-1 builds the trees on all cores; with a fixed random_state the
# resulting forest is the same as with a single job.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=1, n_jobs=-1)
# %%
# Imported here so this section is self-contained; it belongs with the
# other imports at the top of the notebook.
from sklearn.base import clone


def display_metrics(y_true, y_pred, name):
    """Print accuracy, precision, recall and F1-score for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    name : str
        Label printed in the header line of the report.

    Returns
    -------
    None
        The four scores are written to stdout with four decimals, followed
        by a blank line.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f"{name} Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print()


# %%
# Every (features, classifier) pair gets its own fitted copy of the estimator.
# Refitting the shared log_reg / rf_clf instances, as before, silently
# replaced the TF-IDF models with the Word2Vec ones and made the meaning of
# those two names depend on which cell had been run last.
feature_sets = {
    'TF-IDF': (X_train_tfidf, X_test_tfidf),
    'Word2Vec': (X_train_w2v, X_test_w2v),
}
classifiers = {
    'Logistic Regression': log_reg,
    'Random Forest': rf_clf,
}

fitted_models = {}
predictions = {}
for feature_name, (X_train, X_test) in feature_sets.items():
    for clf_name, template in classifiers.items():
        # clone() returns an unfitted estimator with the same parameters.
        model = clone(template).fit(X_train, train_df['polarity'])
        fitted_models[(feature_name, clf_name)] = model
        predictions[(feature_name, clf_name)] = model.predict(X_test)

# The original variable names are kept for any code that still refers to them.
y_pred_log_reg_tfidf = predictions[('TF-IDF', 'Logistic Regression')]
y_pred_rf_clf_tfidf = predictions[('TF-IDF', 'Random Forest')]
y_pred_log_reg_w2v = predictions[('Word2Vec', 'Logistic Regression')]
y_pred_rf_clf_w2v = predictions[('Word2Vec', 'Random Forest')]

# %%
# Reports are printed in insertion order: TF-IDF + Logistic Regression,
# TF-IDF + Random Forest, Word2Vec + Logistic Regression, Word2Vec + Random Forest.
for (feature_name, clf_name), y_pred in predictions.items():
    display_metrics(test_df['polarity'], y_pred, f"{feature_name} + {clf_name}")