{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-06-10T19:23:48.168059Z",
     "start_time": "2024-06-10T19:23:48.163587Z"
    }
   },
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "import gensim\n",
    "from gensim.models import Word2Vec\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score"
   ],
   "outputs": [],
   "execution_count": 17
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-10T19:14:28.230207Z",
     "start_time": "2024-06-10T19:14:10.971636Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# The files are read with header=None, so the column names are supplied explicitly.\n",
    "COLUMNS = ['polarity', 'title', 'text']\n",
    "\n",
    "# Keep the loaded frames under *_raw names and never modify them, so every\n",
    "# later cell can be re-run without re-reading the CSVs.\n",
    "train_raw = pd.read_csv('train.csv', header=None, names=COLUMNS)\n",
    "test_raw = pd.read_csv('test.csv', header=None, names=COLUMNS)"
   ],
   "id": "6b5f49c11e98d496",
   "outputs": [],
   "execution_count": 2
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-10T19:14:28.820334Z",
     "start_time": "2024-06-10T19:14:28.231165Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Fixed-seed subsample sizes for the train and test splits.\n",
    "N_TRAIN, N_TEST, SEED = 40000, 10000, 1\n",
    "\n",
    "# Sample from the untouched *_raw frames instead of overwriting them, so\n",
    "# re-running this cell always draws the same rows from the full data.\n",
    "# The raw frames stay in memory; `del train_raw, test_raw` once the sample\n",
    "# is final if RAM is tight.\n",
    "train_sample = train_raw.sample(n=N_TRAIN, random_state=SEED)\n",
    "test_sample = test_raw.sample(n=N_TEST, random_state=SEED)"
   ],
   "id": "d318521180cd6b02",
   "outputs": [],
   "execution_count": 3
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-10T19:14:28.992617Z",
     "start_time": "2024-06-10T19:14:28.820334Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def merge_title_into_text(reviews):\n",
    "    \"\"\"Return a new frame whose 'text' is '<title> <text>' and which has no 'title' column.\n",
    "\n",
    "    Missing titles or texts are treated as empty strings. The input frame is\n",
    "    not modified, so the calling cell can be re-run safely.\n",
    "    \"\"\"\n",
    "    merged = reviews['title'].fillna('') + ' ' + reviews['text'].fillna('')\n",
    "    # assign() overwrites 'text' in its existing position, so the resulting\n",
    "    # column order is ['polarity', 'text'].\n",
    "    return reviews.assign(text=merged).drop(columns=['title'])\n",
    "\n",
    "\n",
    "train_merged = merge_title_into_text(train_sample)\n",
    "test_merged = merge_title_into_text(test_sample)"
   ],
   "id": "cc04ff0b71bc8ea4",
   "outputs": [],
   "execution_count": 4
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-10T19:14:28.998503Z",
     "start_time": "2024-06-10T19:14:28.993619Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Shift the labels down by one (the preview below shows the result as 0/1).\n",
    "# The frames are built from *_merged rather than updated in place, so\n",
    "# re-running this cell cannot shift the labels a second time.\n",
    "train_df = train_merged.assign(polarity=train_merged['polarity'] - 1)\n",
    "test_df = test_merged.assign(polarity=test_merged['polarity'] - 1)"
   ],
   "id": "6422603a3655706f",
   "outputs": [],
   "execution_count": 5
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-10T19:14:29.010697Z",
     "start_time": "2024-06-10T19:14:28.999510Z"
    }
   },
   "cell_type": "code",
   "source": "train_df",
   "id": "730673ebe3d09e3f",
   "outputs": [
    {
     "data": {
      "text/plain": [
       " polarity text\n",
       "3281328 1 Excellent home help for parents Volume 1 of Do...\n",
       "2662721 0 Stay far, far away. I made it through about 6,...\n",
       "1600544 0 Lost Woods Lost WoodsI didn't really understan...\n",
       "815246 0 Renaissance -12, Women's brown suead shoes Rec...\n",
       "1254178 1 Best Novel I've Read This Year Intrigued by th...\n",
       "... ... ...\n",
       "1132008 1 Pleasant, eclectic mix of coffee-shop favorite...\n",
       "1712954 1 A Valuable Text This is not light reading. It ...\n",
       "3191827 0 NOT GOOD TO USE A WORKOUT TO MAKE ADVERTICING ...\n",
       "1692342 1 Good Read David Wellington brings a new twist ...\n",
       "1944752 1 Edge of Paradise: America in Micronesia B.C. h...\n",
       "\n",
       "[40000 rows x 2 columns]"
      ],
      "text/html": [ "
\n", " | polarity | \n", "text | \n", "
---|---|---|
3281328 | \n", "1 | \n", "Excellent home help for parents Volume 1 of Do... | \n", "
2662721 | \n", "0 | \n", "Stay far, far away. I made it through about 6,... | \n", "
1600544 | \n", "0 | \n", "Lost Woods Lost WoodsI didn't really understan... | \n", "
815246 | \n", "0 | \n", "Renaissance -12, Women's brown suead shoes Rec... | \n", "
1254178 | \n", "1 | \n", "Best Novel I've Read This Year Intrigued by th... | \n", "
... | \n", "... | \n", "... | \n", "
1132008 | \n", "1 | \n", "Pleasant, eclectic mix of coffee-shop favorite... | \n", "
1712954 | \n", "1 | \n", "A Valuable Text This is not light reading. It ... | \n", "
3191827 | \n", "0 | \n", "NOT GOOD TO USE A WORKOUT TO MAKE ADVERTICING ... | \n", "
1692342 | \n", "1 | \n", "Good Read David Wellington brings a new twist ... | \n", "
1944752 | \n", "1 | \n", "Edge of Paradise: America in Micronesia B.C. h... | \n", "
40000 rows × 2 columns
\n", "