ium_424714/dane.ipynb

512 lines
15 KiB
Plaintext
Raw Normal View History

2023-03-21 17:37:51 +01:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from IPython.display import display,Markdown"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"TRUE_NEWS_PATH = \"data/True.csv\"\n",
"FAKE_NEWS_PATH = \"data/Fake.csv\"\n",
"\n",
"# load the true-news and fake-news CSV files into DataFrames\n",
"true_news = pd.read_csv(TRUE_NEWS_PATH)\n",
"fake_news = pd.read_csv(FAKE_NEWS_PATH)\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# cleaning the datasets: drop the title, subject and date columns\n",
"true_news = true_news.drop(columns=['title','subject','date'])\n",
"\n",
"fake_news = fake_news.drop(columns=['title','subject','date'])"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setting binary classification values\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"### True news"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 21417 entries, 0 to 21416\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 text 21417 non-null object\n",
" 1 Value 21417 non-null int64 \n",
"dtypes: int64(1), object(1)\n",
"memory usage: 334.8+ KB\n"
]
},
{
"data": {
"text/plain": [
"None"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>Value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>WASHINGTON (Reuters) - The head of a conservat...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>WASHINGTON (Reuters) - Transgender people will...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>WASHINGTON (Reuters) - The special counsel inv...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>WASHINGTON (Reuters) - Trump campaign adviser ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>SEATTLE/WASHINGTON (Reuters) - President Donal...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>WEST PALM BEACH, Fla (Reuters) - President Don...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>The following statements were posted to the ve...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>The following statements were posted to the ve...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>WASHINGTON (Reuters) - Alabama Secretary of St...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text Value\n",
"0 WASHINGTON (Reuters) - The head of a conservat... 1\n",
"1 WASHINGTON (Reuters) - Transgender people will... 1\n",
"2 WASHINGTON (Reuters) - The special counsel inv... 1\n",
"3 WASHINGTON (Reuters) - Trump campaign adviser ... 1\n",
"4 SEATTLE/WASHINGTON (Reuters) - President Donal... 1\n",
"5 WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T... 1\n",
"6 WEST PALM BEACH, Fla (Reuters) - President Don... 1\n",
"7 The following statements were posted to the ve... 1\n",
"8 The following statements were posted to the ve... 1\n",
"9 WASHINGTON (Reuters) - Alabama Secretary of St... 1"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"### Fake news"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 23481 entries, 0 to 23480\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 text 23481 non-null object\n",
" 1 Value 23481 non-null int64 \n",
"dtypes: int64(1), object(1)\n",
"memory usage: 367.0+ KB\n"
]
},
{
"data": {
"text/plain": [
"None"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>Value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Donald Trump just couldn t wish all Americans ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>House Intelligence Committee Chairman Devin Nu...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>On Friday, it was revealed that former Milwauk...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>On Christmas day, Donald Trump announced that ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Pope Francis used his annual Christmas Day mes...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>The number of cases of cops brutalizing and ki...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Donald Trump spent a good portion of his day a...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>In the wake of yet another court decision that...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Many people have raised the alarm regarding th...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Just when you might have thought we d get a br...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text Value\n",
"0 Donald Trump just couldn t wish all Americans ... 0\n",
"1 House Intelligence Committee Chairman Devin Nu... 0\n",
"2 On Friday, it was revealed that former Milwauk... 0\n",
"3 On Christmas day, Donald Trump announced that ... 0\n",
"4 Pope Francis used his annual Christmas Day mes... 0\n",
"5 The number of cases of cops brutalizing and ki... 0\n",
"6 Donald Trump spent a good portion of his day a... 0\n",
"7 In the wake of yet another court decision that... 0\n",
"8 Many people have raised the alarm regarding th... 0\n",
"9 Just when you might have thought we d get a br... 0"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"true_news['Value'] = 1\n",
"fake_news['Value'] = 0\n",
"display(Markdown(r\"### True news\"))\n",
"display(true_news.info())\n",
"display(true_news.head(10))\n",
"display(Markdown(r\"### Fake news\"))\n",
"display(fake_news.info())\n",
"display(fake_news.head(10))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>Value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>WASHINGTON (Reuters) - The head of a conservat...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>WASHINGTON (Reuters) - Transgender people will...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>WASHINGTON (Reuters) - The special counsel inv...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>WASHINGTON (Reuters) - Trump campaign adviser ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>SEATTLE/WASHINGTON (Reuters) - President Donal...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23476</th>\n",
" <td>21st Century Wire says As 21WIRE reported earl...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23477</th>\n",
" <td>21st Century Wire says It s a familiar theme. ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23478</th>\n",
" <td>Patrick Henningsen 21st Century WireRemember ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23479</th>\n",
" <td>21st Century Wire says Al Jazeera America will...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23480</th>\n",
" <td>21st Century Wire says As 21WIRE predicted in ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>44898 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" text Value\n",
"0 WASHINGTON (Reuters) - The head of a conservat... 1\n",
"1 WASHINGTON (Reuters) - Transgender people will... 1\n",
"2 WASHINGTON (Reuters) - The special counsel inv... 1\n",
"3 WASHINGTON (Reuters) - Trump campaign adviser ... 1\n",
"4 SEATTLE/WASHINGTON (Reuters) - President Donal... 1\n",
"... ... ...\n",
"23476 21st Century Wire says As 21WIRE reported earl... 0\n",
"23477 21st Century Wire says It s a familiar theme. ... 0\n",
"23478 Patrick Henningsen 21st Century WireRemember ... 0\n",
"23479 21st Century Wire says Al Jazeera America will... 0\n",
"23480 21st Century Wire says As 21WIRE predicted in ... 0\n",
"\n",
"[44898 rows x 2 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
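],
"source": [
"# merging both datasets into one frame (true news first, then fake news)\n",
"# NOTE: the original row labels are kept, so index values repeat; pass ignore_index=True to pd.concat to renumber\n",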
"dataset = pd.concat([true_news,fake_news],axis=0)\n",
"display(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 44898 entries, 0 to 23480\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 text 44898 non-null object\n",
" 1 Value 44898 non-null int64 \n",
"dtypes: int64(1), object(1)\n",
"memory usage: 1.0+ MB\n"
]
},
{
"data": {
"text/plain": [
"None"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(dataset.info())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "1e61067c2f2e27a88e433eed08bcab15943261b719f4667f6d0d352911f3557f"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}