{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from IPython.display import display,Markdown\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [
  "TRUE_NEWS_PATH = \"data/True.csv\"\n",
  "FAKE_NEWS_PATH = \"data/Fake.csv\"\n",
  "\n",
  "# Load the two raw CSV files; both paths are relative to the working directory.\n",
  "true_news = pd.read_csv(TRUE_NEWS_PATH)\n",
  "fake_news = pd.read_csv(FAKE_NEWS_PATH)\n"
 ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
  "# Remove the columns that are not needed further down (only `text` is kept).\n",
  "# errors='ignore' keeps this cell re-runnable: on a second execution the\n",
  "# columns are already gone and a strict drop would raise KeyError.\n",
  "UNUSED_COLUMNS = ['title', 'subject', 'date']\n",
  "true_news = true_news.drop(columns=UNUSED_COLUMNS, errors='ignore')\n",
  "fake_news = fake_news.drop(columns=UNUSED_COLUMNS, errors='ignore')"
 ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Setting binary classification values\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### True news" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 21417 entries, 0 to 21416\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 text 21417 non-null object\n", " 1 Value 21417 non-null int64 \n", "dtypes: int64(1), object(1)\n", "memory usage: 334.8+ KB\n" ] }, { "data": { "text/plain": [ "None" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textValue
0WASHINGTON (Reuters) - The head of a conservat...1
1WASHINGTON (Reuters) - Transgender people will...1
2WASHINGTON (Reuters) - The special counsel inv...1
3WASHINGTON (Reuters) - Trump campaign adviser ...1
4SEATTLE/WASHINGTON (Reuters) - President Donal...1
5WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...1
6WEST PALM BEACH, Fla (Reuters) - President Don...1
7The following statements were posted to the ve...1
8The following statements were posted to the ve...1
9WASHINGTON (Reuters) - Alabama Secretary of St...1
\n", "
" ], "text/plain": [ " text Value\n", "0 WASHINGTON (Reuters) - The head of a conservat... 1\n", "1 WASHINGTON (Reuters) - Transgender people will... 1\n", "2 WASHINGTON (Reuters) - The special counsel inv... 1\n", "3 WASHINGTON (Reuters) - Trump campaign adviser ... 1\n", "4 SEATTLE/WASHINGTON (Reuters) - President Donal... 1\n", "5 WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T... 1\n", "6 WEST PALM BEACH, Fla (Reuters) - President Don... 1\n", "7 The following statements were posted to the ve... 1\n", "8 The following statements were posted to the ve... 1\n", "9 WASHINGTON (Reuters) - Alabama Secretary of St... 1" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "### Fake news" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 23481 entries, 0 to 23480\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 text 23481 non-null object\n", " 1 Value 23481 non-null int64 \n", "dtypes: int64(1), object(1)\n", "memory usage: 367.0+ KB\n" ] }, { "data": { "text/plain": [ "None" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textValue
0Donald Trump just couldn t wish all Americans ...0
1House Intelligence Committee Chairman Devin Nu...0
2On Friday, it was revealed that former Milwauk...0
3On Christmas day, Donald Trump announced that ...0
4Pope Francis used his annual Christmas Day mes...0
5The number of cases of cops brutalizing and ki...0
6Donald Trump spent a good portion of his day a...0
7In the wake of yet another court decision that...0
8Many people have raised the alarm regarding th...0
9Just when you might have thought we d get a br...0
\n", "
" ], "text/plain": [ " text Value\n", "0 Donald Trump just couldn t wish all Americans ... 0\n", "1 House Intelligence Committee Chairman Devin Nu... 0\n", "2 On Friday, it was revealed that former Milwauk... 0\n", "3 On Christmas day, Donald Trump announced that ... 0\n", "4 Pope Francis used his annual Christmas Day mes... 0\n", "5 The number of cases of cops brutalizing and ki... 0\n", "6 Donald Trump spent a good portion of his day a... 0\n", "7 In the wake of yet another court decision that... 0\n", "8 Many people have raised the alarm regarding th... 0\n", "9 Just when you might have thought we d get a br... 0" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
  "# Binary target column: rows of true_news get 1, rows of fake_news get 0.\n",
  "true_news['Value'] = 1\n",
  "fake_news['Value'] = 0\n",
  "\n",
  "# Preview both frames. DataFrame.info() prints its report itself and returns\n",
  "# None, so it is called directly; wrapping it in display() would additionally\n",
  "# render a stray \"None\" below the report.\n",
  "for heading, frame in ((r\"### True news\", true_news), (r\"### Fake news\", fake_news)):\n",
  "    display(Markdown(heading))\n",
  "    frame.info()\n",
  "    display(frame.head(10))"
 ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textValue
0WASHINGTON (Reuters) - The head of a conservat...1
1WASHINGTON (Reuters) - Transgender people will...1
2WASHINGTON (Reuters) - The special counsel inv...1
3WASHINGTON (Reuters) - Trump campaign adviser ...1
4SEATTLE/WASHINGTON (Reuters) - President Donal...1
.........
2347621st Century Wire says As 21WIRE reported earl...0
2347721st Century Wire says It s a familiar theme. ...0
23478Patrick Henningsen 21st Century WireRemember ...0
2347921st Century Wire says Al Jazeera America will...0
2348021st Century Wire says As 21WIRE predicted in ...0
\n", "

44898 rows × 2 columns

\n", "
" ], "text/plain": [ " text Value\n", "0 WASHINGTON (Reuters) - The head of a conservat... 1\n", "1 WASHINGTON (Reuters) - Transgender people will... 1\n", "2 WASHINGTON (Reuters) - The special counsel inv... 1\n", "3 WASHINGTON (Reuters) - Trump campaign adviser ... 1\n", "4 SEATTLE/WASHINGTON (Reuters) - President Donal... 1\n", "... ... ...\n", "23476 21st Century Wire says As 21WIRE reported earl... 0\n", "23477 21st Century Wire says It s a familiar theme. ... 0\n", "23478 Patrick Henningsen 21st Century WireRemember ... 0\n", "23479 21st Century Wire says Al Jazeera America will... 0\n", "23480 21st Century Wire says As 21WIRE predicted in ... 0\n", "\n", "[44898 rows x 2 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
  "# Stack the two labelled frames into a single dataset.\n",
  "# ignore_index=True renumbers the rows 0..n-1. Without it each frame keeps\n",
  "# its own 0-based index, so the merged frame carries duplicate labels and\n",
  "# label-based lookups (.loc) become ambiguous.\n",
  "dataset = pd.concat([true_news, fake_news], axis=0, ignore_index=True)\n",
  "display(dataset)"
 ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 44898 entries, 0 to 23480\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 text 44898 non-null object\n", " 1 Value 44898 non-null int64 \n", "dtypes: int64(1), object(1)\n", "memory usage: 1.0+ MB\n" ] }, { "data": { "text/plain": [ "None" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
  "# info() prints its own report and returns None, so it is not wrapped in display().\n",
  "dataset.info()"
 ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### STD" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "y_train std: 0.49939397301167954\n", "y_val std: 0.4997839588710888\n", "y_test std: 0.4998194469400359\n" ] }, { "data": { "text/markdown": [ "### MEAN" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "y_train mean: 0.475249178684782\n",
"y_val mean: 0.4835189309576837\n", "y_test mean: 0.4846325167037862\n" ] }, { "data": { "text/markdown": [ "### Count" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "y_train count: 35918\n", "y_val count: 4490\n", "y_test count: 4490\n" ] } ], "source": [
  "# Split the data into train / validation / test parts with an 8:1:1 ratio.\n",
  "# A fixed seed makes the split identical after every kernel restart, and\n",
  "# stratifying on the label keeps the class ratio the same in all three parts.\n",
  "RANDOM_SEED = 42\n",
  "\n",
  "X_train, X_val_test, y_train, y_valtest = train_test_split(\n",
  "    dataset[\"text\"],\n",
  "    dataset[\"Value\"],\n",
  "    test_size=0.2,\n",
  "    shuffle=True,\n",
  "    random_state=RANDOM_SEED,\n",
  "    stratify=dataset[\"Value\"],\n",
  ")\n",
  "# Halve the 20% hold-out into the test and validation parts.\n",
  "X_test, X_val, y_test, y_val = train_test_split(\n",
  "    X_val_test,\n",
  "    y_valtest,\n",
  "    test_size=0.5,\n",
  "    shuffle=True,\n",
  "    random_state=RANDOM_SEED,\n",
  "    stratify=y_valtest,\n",
  ")\n",
  "\n",
  "# No row may be lost or duplicated by the two-step split.\n",
  "assert len(y_train) + len(y_val) + len(y_test) == len(dataset)\n",
  "\n",
  "# Report the label statistics of every split as a quick balance check.\n",
  "label_splits = {\"y_train\": y_train, \"y_val\": y_val, \"y_test\": y_test}\n",
  "for heading, stat in ((\"STD\", \"std\"), (\"MEAN\", \"mean\"), (\"Count\", \"count\")):\n",
  "    display(Markdown(f\"### {heading}\"))\n",
  "    for split_name, labels in label_splits.items():\n",
  "        print(f\"{split_name} {stat}: {getattr(labels, stat)()}\")\n"
 ] } ], "metadata": { "kernelspec": { "display_name": "dl", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "6e9239598a6712340c2b580c5c929949b8a813e86738fb7cf0a67c11d0863b74" } } }, "nbformat": 4, "nbformat_minor": 2 }