ium_424714/dane.ipynb

602 lines
17 KiB
Plaintext
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from IPython.display import display,Markdown\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Locations of the two source CSV files (relative to the notebook).\n",
"TRUE_NEWS_PATH = \"data/True.csv\"\n",
"FAKE_NEWS_PATH = \"data/Fake.csv\"\n",
"\n",
"# Read each CSV into its own DataFrame.\n",
"true_news, fake_news = (\n",
"    pd.read_csv(csv_path) for csv_path in (TRUE_NEWS_PATH, FAKE_NEWS_PATH)\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Clean both frames the same way: remove the title, subject and date columns.\n",
"true_news, fake_news = (\n",
"    frame.drop(['title', 'subject', 'date'], axis='columns')\n",
"    for frame in (true_news, fake_news)\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setting binary classification values\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"### True news"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 21417 entries, 0 to 21416\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 text 21417 non-null object\n",
" 1 Value 21417 non-null int64 \n",
"dtypes: int64(1), object(1)\n",
"memory usage: 334.8+ KB\n"
]
},
{
"data": {
"text/plain": [
"None"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>Value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>WASHINGTON (Reuters) - The head of a conservat...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>WASHINGTON (Reuters) - Transgender people will...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>WASHINGTON (Reuters) - The special counsel inv...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>WASHINGTON (Reuters) - Trump campaign adviser ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>SEATTLE/WASHINGTON (Reuters) - President Donal...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>WEST PALM BEACH, Fla (Reuters) - President Don...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>The following statements were posted to the ve...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>The following statements were posted to the ve...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>WASHINGTON (Reuters) - Alabama Secretary of St...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text Value\n",
"0 WASHINGTON (Reuters) - The head of a conservat... 1\n",
"1 WASHINGTON (Reuters) - Transgender people will... 1\n",
"2 WASHINGTON (Reuters) - The special counsel inv... 1\n",
"3 WASHINGTON (Reuters) - Trump campaign adviser ... 1\n",
"4 SEATTLE/WASHINGTON (Reuters) - President Donal... 1\n",
"5 WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T... 1\n",
"6 WEST PALM BEACH, Fla (Reuters) - President Don... 1\n",
"7 The following statements were posted to the ve... 1\n",
"8 The following statements were posted to the ve... 1\n",
"9 WASHINGTON (Reuters) - Alabama Secretary of St... 1"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"### Fake news"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 23481 entries, 0 to 23480\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 text 23481 non-null object\n",
" 1 Value 23481 non-null int64 \n",
"dtypes: int64(1), object(1)\n",
"memory usage: 367.0+ KB\n"
]
},
{
"data": {
"text/plain": [
"None"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>Value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Donald Trump just couldn t wish all Americans ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>House Intelligence Committee Chairman Devin Nu...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>On Friday, it was revealed that former Milwauk...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>On Christmas day, Donald Trump announced that ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pope Francis used his annual Christmas Day mes...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>The number of cases of cops brutalizing and ki...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Donald Trump spent a good portion of his day a...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>In the wake of yet another court decision that...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Many people have raised the alarm regarding th...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Just when you might have thought we d get a br...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text Value\n",
"0 Donald Trump just couldn t wish all Americans ... 0\n",
"1 House Intelligence Committee Chairman Devin Nu... 0\n",
"2 On Friday, it was revealed that former Milwauk... 0\n",
"3 On Christmas day, Donald Trump announced that ... 0\n",
"4 Pope Francis used his annual Christmas Day mes... 0\n",
"5 The number of cases of cops brutalizing and ki... 0\n",
"6 Donald Trump spent a good portion of his day a... 0\n",
"7 In the wake of yet another court decision that... 0\n",
"8 Many people have raised the alarm regarding th... 0\n",
"9 Just when you might have thought we d get a br... 0"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Label the two classes: 1 for true news, 0 for fake news.\n",
"true_news['Value'] = 1\n",
"fake_news['Value'] = 0\n",
"\n",
"# DataFrame.info() prints its summary itself and returns None, so it is called\n",
"# directly; wrapping it in display() only adds a stray 'None' output.\n",
"display(Markdown(r\"### True news\"))\n",
"true_news.info()\n",
"display(true_news.head(10))\n",
"display(Markdown(r\"### Fake news\"))\n",
"fake_news.info()\n",
"display(fake_news.head(10))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>Value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>WASHINGTON (Reuters) - The head of a conservat...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>WASHINGTON (Reuters) - Transgender people will...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>WASHINGTON (Reuters) - The special counsel inv...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>WASHINGTON (Reuters) - Trump campaign adviser ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>SEATTLE/WASHINGTON (Reuters) - President Donal...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23476</th>\n",
" <td>21st Century Wire says As 21WIRE reported earl...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23477</th>\n",
" <td>21st Century Wire says It s a familiar theme. ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23478</th>\n",
" <td>Patrick Henningsen 21st Century WireRemember ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23479</th>\n",
" <td>21st Century Wire says Al Jazeera America will...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23480</th>\n",
" <td>21st Century Wire says As 21WIRE predicted in ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>44898 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" text Value\n",
"0 WASHINGTON (Reuters) - The head of a conservat... 1\n",
"1 WASHINGTON (Reuters) - Transgender people will... 1\n",
"2 WASHINGTON (Reuters) - The special counsel inv... 1\n",
"3 WASHINGTON (Reuters) - Trump campaign adviser ... 1\n",
"4 SEATTLE/WASHINGTON (Reuters) - President Donal... 1\n",
"... ... ...\n",
"23476 21st Century Wire says As 21WIRE reported earl... 0\n",
"23477 21st Century Wire says It s a familiar theme. ... 0\n",
"23478 Patrick Henningsen 21st Century WireRemember ... 0\n",
"23479 21st Century Wire says Al Jazeera America will... 0\n",
"23480 21st Century Wire says As 21WIRE predicted in ... 0\n",
"\n",
"[44898 rows x 2 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# merging dataset\n",
"# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it\n",
"# the row labels of both inputs are kept, so the same label appears twice.\n",
"dataset = pd.concat([true_news, fake_news], axis=0, ignore_index=True)\n",
"display(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 44898 entries, 0 to 23480\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 text 44898 non-null object\n",
" 1 Value 44898 non-null int64 \n",
"dtypes: int64(1), object(1)\n",
"memory usage: 1.0+ MB\n"
]
},
{
"data": {
"text/plain": [
"None"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# DataFrame.info() prints the summary itself and returns None, so call it\n",
"# directly instead of handing that None to display().\n",
"dataset.info()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"### STD"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"y_train std: 0.49939397301167954\n",
"y_val std: 0.4997839588710888\n",
"y_test std: 0.4998194469400359\n"
]
},
{
"data": {
"text/markdown": [
"### MEAN"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"y_train mean: 0.475249178684782\n",
"y_val mean: 0.4835189309576837\n",
"y_test mean: 0.4846325167037862\n"
]
},
{
"data": {
"text/markdown": [
"### Count"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"y_train count: 35918\n",
"y_val count: 4490\n",
"y_test count: 4490\n"
]
}
],
"source": [
"# Create train, validation and test sets in an 8:1:1 ratio.\n",
"# A fixed seed makes the split reproducible across kernel restarts, and\n",
"# stratifying on the label keeps the true/fake proportion equal in every subset.\n",
"SPLIT_SEED = 42\n",
"X_train, X_val_test, y_train, y_valtest = train_test_split(\n",
"    dataset[\"text\"], dataset[\"Value\"], test_size=0.2, shuffle=True,\n",
"    random_state=SPLIT_SEED, stratify=dataset[\"Value\"],\n",
")\n",
"# Split the held-out 20% in half: one half for testing, one for validation.\n",
"X_test, X_val, y_test, y_val = train_test_split(\n",
"    X_val_test, y_valtest, test_size=0.5, shuffle=True,\n",
"    random_state=SPLIT_SEED, stratify=y_valtest,\n",
")\n",
"\n",
"# Report the same three statistics for every label subset.\n",
"label_splits = {\"y_train\": y_train, \"y_val\": y_val, \"y_test\": y_test}\n",
"\n",
"display(Markdown(\"### STD\"))\n",
"for split_name, labels in label_splits.items():\n",
"    print(f\"{split_name} std: {labels.std()}\")\n",
"\n",
"display(Markdown(\"### MEAN\"))\n",
"for split_name, labels in label_splits.items():\n",
"    print(f\"{split_name} mean: {labels.mean()}\")\n",
"\n",
"display(Markdown(\"### Count\"))\n",
"for split_name, labels in label_splits.items():\n",
"    print(f\"{split_name} count: {labels.count()}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "dl",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "6e9239598a6712340c2b580c5c929949b8a813e86738fb7cf0a67c11d0863b74"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}