DL_projekt/projekt.ipynb
2024-06-10 21:53:20 +02:00

435 lines
13 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-06-10T19:23:48.168059Z",
"start_time": "2024-06-10T19:23:48.163587Z"
}
},
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"import gensim\n",
"from gensim.models import Word2Vec\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score"
],
"outputs": [],
"execution_count": 17
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-10T19:14:28.230207Z",
"start_time": "2024-06-10T19:14:10.971636Z"
}
},
"cell_type": "code",
"source": [
"# Read both splits with the same headerless three-column layout\n",
"train_df, test_df = (\n",
"    pd.read_csv(csv_path, header=None, names=['polarity', 'title', 'text'])\n",
"    for csv_path in ('train.csv', 'test.csv')\n",
")"
],
"id": "6b5f49c11e98d496",
"outputs": [],
"execution_count": 2
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-10T19:14:28.820334Z",
"start_time": "2024-06-10T19:14:28.231165Z"
}
},
"cell_type": "code",
"source": [
"# Down-sample each split to a fixed number of rows; random_state pins the draw\n",
"train_df, test_df = (\n",
"    frame.sample(n=size, random_state=1)\n",
"    for frame, size in ((train_df, 40000), (test_df, 10000))\n",
")"
],
"id": "d318521180cd6b02",
"outputs": [],
"execution_count": 3
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-10T19:14:28.992617Z",
"start_time": "2024-06-10T19:14:28.820334Z"
}
},
"cell_type": "code",
"source": [
"def merge_title_into_text(frame):\n",
"    \"\"\"Return a new frame whose 'text' is title + ' ' + text, with 'title' removed.\n",
"\n",
"    Missing titles or texts are treated as empty strings. A frame that no\n",
"    longer has a 'title' column is returned unchanged, so executing this cell\n",
"    again is a no-op instead of a KeyError.\n",
"    \"\"\"\n",
"    if 'title' not in frame.columns:\n",
"        return frame\n",
"    combined = frame['title'].fillna('') + ' ' + frame['text'].fillna('')\n",
"    # assign() keeps 'text' in its existing column position and returns a copy\n",
"    return frame.assign(text=combined).drop(columns=['title'])\n",
"\n",
"train_df = merge_title_into_text(train_df)\n",
"test_df = merge_title_into_text(test_df)"
],
"id": "cc04ff0b71bc8ea4",
"outputs": [],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-10T19:14:28.998503Z",
"start_time": "2024-06-10T19:14:28.993619Z"
}
},
"cell_type": "code",
"source": [
"def shift_polarity(frame):\n",
"    \"\"\"Return frame with 'polarity' decremented by one (raw 1/2 labels -> 0/1).\n",
"\n",
"    The shift is applied only while the raw label 2 is still present, so\n",
"    executing this cell a second time leaves the labels at 0/1 instead of\n",
"    pushing them to -1/0.\n",
"\n",
"    NOTE(review): assumes the raw labels are exactly 1 and 2; a frame that\n",
"    contains no 2 at all is returned unchanged.\n",
"    \"\"\"\n",
"    if not frame['polarity'].eq(2).any():\n",
"        return frame\n",
"    return frame.assign(polarity=frame['polarity'] - 1)\n",
"\n",
"train_df = shift_polarity(train_df)\n",
"test_df = shift_polarity(test_df)"
],
"id": "6422603a3655706f",
"outputs": [],
"execution_count": 5
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-10T19:14:29.010697Z",
"start_time": "2024-06-10T19:14:28.999510Z"
}
},
"cell_type": "code",
"source": "train_df",
"id": "730673ebe3d09e3f",
"outputs": [
{
"data": {
"text/plain": [
" polarity text\n",
"3281328 1 Excellent home help for parents Volume 1 of Do...\n",
"2662721 0 Stay far, far away. I made it through about 6,...\n",
"1600544 0 Lost Woods Lost WoodsI didn't really understan...\n",
"815246 0 Renaissance -12, Women's brown suead shoes Rec...\n",
"1254178 1 Best Novel I've Read This Year Intrigued by th...\n",
"... ... ...\n",
"1132008 1 Pleasant, eclectic mix of coffee-shop favorite...\n",
"1712954 1 A Valuable Text This is not light reading. It ...\n",
"3191827 0 NOT GOOD TO USE A WORKOUT TO MAKE ADVERTICING ...\n",
"1692342 1 Good Read David Wellington brings a new twist ...\n",
"1944752 1 Edge of Paradise: America in Micronesia B.C. h...\n",
"\n",
"[40000 rows x 2 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>polarity</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3281328</th>\n",
" <td>1</td>\n",
" <td>Excellent home help for parents Volume 1 of Do...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2662721</th>\n",
" <td>0</td>\n",
" <td>Stay far, far away. I made it through about 6,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1600544</th>\n",
" <td>0</td>\n",
" <td>Lost Woods Lost WoodsI didn't really understan...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>815246</th>\n",
" <td>0</td>\n",
" <td>Renaissance -12, Women's brown suead shoes Rec...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1254178</th>\n",
" <td>1</td>\n",
" <td>Best Novel I've Read This Year Intrigued by th...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1132008</th>\n",
" <td>1</td>\n",
" <td>Pleasant, eclectic mix of coffee-shop favorite...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1712954</th>\n",
" <td>1</td>\n",
" <td>A Valuable Text This is not light reading. It ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3191827</th>\n",
" <td>0</td>\n",
" <td>NOT GOOD TO USE A WORKOUT TO MAKE ADVERTICING ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1692342</th>\n",
" <td>1</td>\n",
" <td>Good Read David Wellington brings a new twist ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1944752</th>\n",
" <td>1</td>\n",
" <td>Edge of Paradise: America in Micronesia B.C. h...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>40000 rows × 2 columns</p>\n",
"</div>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 6
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-10T19:14:31.756539Z",
"start_time": "2024-06-10T19:14:29.011700Z"
}
},
"cell_type": "code",
"source": [
"# TF-IDF features, capped at 10000 vocabulary entries\n",
"tfidf_vectorizer = TfidfVectorizer(max_features=10000)\n",
"\n",
"# The vectorizer is fitted on the training texts only ...\n",
"X_train_tfidf = tfidf_vectorizer.fit_transform(train_df.loc[:, 'text'])\n",
"# ... and the already-fitted vectorizer is then applied to the test texts\n",
"X_test_tfidf = tfidf_vectorizer.transform(test_df.loc[:, 'text'])"
],
"id": "97c3ff76588ab69c",
"outputs": [],
"execution_count": 7
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-10T19:14:43.754163Z",
"start_time": "2024-06-10T19:14:31.756932Z"
}
},
"cell_type": "code",
"source": [
"# Word2Vec\n",
"def tokenize(text):\n",
"    \"\"\"Split text on runs of whitespace (no lowercasing, punctuation stays attached).\"\"\"\n",
"    return text.split()\n",
"\n",
"def get_avg_w2v(tokens, model):\n",
"    \"\"\"Average the embeddings of the tokens found in the model vocabulary.\n",
"\n",
"    tokens: iterable of string tokens for one document.\n",
"    model: trained Word2Vec model (its .wv and .vector_size are read).\n",
"    Returns a 1-D array of length model.vector_size; all zeros when none of\n",
"    the tokens is in the vocabulary.\n",
"    \"\"\"\n",
"    vectors = [model.wv[token] for token in tokens if token in model.wv]\n",
"    if not vectors:\n",
"        return np.zeros(model.vector_size)\n",
"    return np.mean(vectors, axis=0)\n",
"\n",
"train_df['tokens'] = train_df['text'].apply(tokenize)\n",
"test_df['tokens'] = test_df['text'].apply(tokenize)\n",
"\n",
"# Embeddings are trained on the training split only.\n",
"# workers=1 + explicit seed: gensim documents that multi-threaded training is not\n",
"# reproducible between runs because of thread-scheduling jitter; every other\n",
"# stochastic step in this notebook is pinned with random_state=1.\n",
"# NOTE: per the gensim docs, reproducibility across interpreter launches\n",
"# additionally requires PYTHONHASHSEED to be set.\n",
"w2v_model = Word2Vec(sentences=train_df['tokens'], vector_size=100, window=5, min_count=5, workers=1, seed=1)\n",
"\n",
"X_train_w2v = np.array([get_avg_w2v(tokens, w2v_model) for tokens in train_df['tokens']])\n",
"X_test_w2v = np.array([get_avg_w2v(tokens, w2v_model) for tokens in test_df['tokens']])"
],
"id": "d5d352229d623c1a",
"outputs": [],
"execution_count": 8
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-10T19:14:43.760194Z",
"start_time": "2024-06-10T19:14:43.755159Z"
}
},
"cell_type": "code",
"source": [
"# Classifiers (the same two estimator objects are re-fitted for each feature set)\n",
"log_reg = LogisticRegression(max_iter=1000)\n",
"# n_jobs=-1 builds the trees on all available cores; random_state still pins the forest\n",
"rf_clf = RandomForestClassifier(n_estimators=100, random_state=1, n_jobs=-1)"
],
"id": "e3879b5ac92d0ad",
"outputs": [],
"execution_count": 9
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-10T19:16:03.926892Z",
"start_time": "2024-06-10T19:14:43.761179Z"
}
},
"cell_type": "code",
"source": [
"# TF-IDF features: fit each classifier on the training matrix, then predict the test matrix\n",
"# (fit() returns the estimator itself, so the calls can be chained)\n",
"y_pred_log_reg_tfidf = log_reg.fit(X_train_tfidf, train_df['polarity']).predict(X_test_tfidf)\n",
"y_pred_rf_clf_tfidf = rf_clf.fit(X_train_tfidf, train_df['polarity']).predict(X_test_tfidf)"
],
"id": "163ddf27adbeecdf",
"outputs": [],
"execution_count": 10
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-10T19:16:46.644728Z",
"start_time": "2024-06-10T19:16:03.928399Z"
}
},
"cell_type": "code",
"source": [
"# Word2Vec features: re-fit the same two estimators on the averaged embeddings\n",
"# (fit() returns the estimator itself, so the calls can be chained)\n",
"y_pred_log_reg_w2v = log_reg.fit(X_train_w2v, train_df['polarity']).predict(X_test_w2v)\n",
"y_pred_rf_clf_w2v = rf_clf.fit(X_train_w2v, train_df['polarity']).predict(X_test_w2v)"
],
"id": "5f3ed725746bfeee",
"outputs": [],
"execution_count": 11
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-10T19:23:29.664360Z",
"start_time": "2024-06-10T19:23:29.659366Z"
}
},
"cell_type": "code",
"source": [
"def display_metrics(y_true, y_pred, name):\n",
"    \"\"\"Print accuracy, precision, recall and F1 for one set of predictions.\n",
"\n",
"    y_true: ground-truth labels.\n",
"    y_pred: predicted labels, aligned with y_true.\n",
"    name: heading printed above the four scores.\n",
"    \"\"\"\n",
"    scorers = (\n",
"        (\"Accuracy\", accuracy_score),\n",
"        (\"Precision\", precision_score),\n",
"        (\"Recall\", recall_score),\n",
"        (\"F1-score\", f1_score),\n",
"    )\n",
"    # Compute every score before printing so a failing metric leaves no partial report\n",
"    results = [(label, scorer(y_true, y_pred)) for label, scorer in scorers]\n",
"\n",
"    print(f\"{name} Metrics:\")\n",
"    for label, value in results:\n",
"        print(f\"{label}: {value:.4f}\")\n",
"    print()\n"
],
"id": "427a5c4c145ab33f",
"outputs": [],
"execution_count": 15
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-10T19:23:51.473027Z",
"start_time": "2024-06-10T19:23:51.420072Z"
}
},
"cell_type": "code",
"source": [
"# One report per feature-set / classifier combination, in a fixed order\n",
"for predictions, title in (\n",
"    (y_pred_log_reg_tfidf, \"TF-IDF + Logistic Regression\"),\n",
"    (y_pred_rf_clf_tfidf, \"TF-IDF + Random Forest\"),\n",
"    (y_pred_log_reg_w2v, \"Word2Vec + Logistic Regression\"),\n",
"    (y_pred_rf_clf_w2v, \"Word2Vec + Random Forest\"),\n",
"):\n",
"    display_metrics(test_df['polarity'], predictions, title)"
],
"id": "118be083db9e3ed2",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TF-IDF + Logistic Regression Metrics:\n",
"Accuracy: 0.8865\n",
"Precision: 0.8851\n",
"Recall: 0.8898\n",
"F1-score: 0.8874\n",
"\n",
"TF-IDF + Random Forest Metrics:\n",
"Accuracy: 0.8504\n",
"Precision: 0.8668\n",
"Recall: 0.8300\n",
"F1-score: 0.8480\n",
"\n",
"Word2Vec + Logistic Regression Metrics:\n",
"Accuracy: 0.7906\n",
"Precision: 0.7964\n",
"Recall: 0.7840\n",
"F1-score: 0.7901\n",
"\n",
"Word2Vec + Random Forest Metrics:\n",
"Accuracy: 0.7546\n",
"Precision: 0.7643\n",
"Recall: 0.7403\n",
"F1-score: 0.7521\n",
"\n"
]
}
],
"execution_count": 18
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "",
"id": "572452dcd6ba100b"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}