ium_424714/dane.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from IPython.display import display,Markdown\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "TRUE_NEWS_PATH = \"data/True.csv\"\n",
    "FAKE_NEWS_PATH = \"data/Fake.csv\"\n",
    "\n",
    "#loading datasets\n",
    "true_news = pd.read_csv(TRUE_NEWS_PATH)\n",
    "fake_news = pd.read_csv(FAKE_NEWS_PATH)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# clearing dataset\n",
    "true_news = true_news.drop(columns=['title','subject','date'])\n",
    "\n",
    "fake_news = fake_news.drop(columns=['title','subject','date'])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Seting binary classifiaction values\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "### True news"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 21417 entries, 0 to 21416\n",
      "Data columns (total 2 columns):\n",
      " #   Column  Non-Null Count  Dtype \n",
      "---  ------  --------------  ----- \n",
      " 0   text    21417 non-null  object\n",
      " 1   Value   21417 non-null  int64 \n",
      "dtypes: int64(1), object(1)\n",
      "memory usage: 334.8+ KB\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "None"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>Value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>WASHINGTON (Reuters) - The head of a conservat...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>WASHINGTON (Reuters) - Transgender people will...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>WASHINGTON (Reuters) - The special counsel inv...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>WASHINGTON (Reuters) - Trump campaign adviser ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>SEATTLE/WASHINGTON (Reuters) - President Donal...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>WEST PALM BEACH, Fla (Reuters) - President Don...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>The following statements were posted to the ve...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>The following statements were posted to the ve...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>WASHINGTON (Reuters) - Alabama Secretary of St...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text  Value\n",
       "0  WASHINGTON (Reuters) - The head of a conservat...      1\n",
       "1  WASHINGTON (Reuters) - Transgender people will...      1\n",
       "2  WASHINGTON (Reuters) - The special counsel inv...      1\n",
       "3  WASHINGTON (Reuters) - Trump campaign adviser ...      1\n",
       "4  SEATTLE/WASHINGTON (Reuters) - President Donal...      1\n",
       "5  WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...      1\n",
       "6  WEST PALM BEACH, Fla (Reuters) - President Don...      1\n",
       "7  The following statements were posted to the ve...      1\n",
       "8  The following statements were posted to the ve...      1\n",
       "9  WASHINGTON (Reuters) - Alabama Secretary of St...      1"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "### Fake news"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 23481 entries, 0 to 23480\n",
      "Data columns (total 2 columns):\n",
      " #   Column  Non-Null Count  Dtype \n",
      "---  ------  --------------  ----- \n",
      " 0   text    23481 non-null  object\n",
      " 1   Value   23481 non-null  int64 \n",
      "dtypes: int64(1), object(1)\n",
      "memory usage: 367.0+ KB\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "None"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>Value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Donald Trump just couldn t wish all Americans ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>House Intelligence Committee Chairman Devin Nu...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>On Friday, it was revealed that former Milwauk...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>On Christmas day, Donald Trump announced that ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Pope Francis used his annual Christmas Day mes...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>The number of cases of cops brutalizing and ki...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Donald Trump spent a good portion of his day a...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>In the wake of yet another court decision that...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Many people have raised the alarm regarding th...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Just when you might have thought we d get a br...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text  Value\n",
       "0  Donald Trump just couldn t wish all Americans ...      0\n",
       "1  House Intelligence Committee Chairman Devin Nu...      0\n",
       "2  On Friday, it was revealed that former Milwauk...      0\n",
       "3  On Christmas day, Donald Trump announced that ...      0\n",
       "4  Pope Francis used his annual Christmas Day mes...      0\n",
       "5  The number of cases of cops brutalizing and ki...      0\n",
       "6  Donald Trump spent a good portion of his day a...      0\n",
       "7  In the wake of yet another court decision that...      0\n",
       "8  Many people have raised the alarm regarding th...      0\n",
       "9  Just when you might have thought we d get a br...      0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "true_news['Value'] = 1\n",
    "fake_news['Value'] = 0\n",
    "display(Markdown(r\"### True news\"))\n",
    "display(true_news.info())\n",
    "display(true_news.head(10))\n",
    "display(Markdown(r\"### Fake news\"))\n",
    "display(fake_news.info())\n",
    "display(fake_news.head(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>Value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>WASHINGTON (Reuters) - The head of a conservat...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>WASHINGTON (Reuters) - Transgender people will...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>WASHINGTON (Reuters) - The special counsel inv...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>WASHINGTON (Reuters) - Trump campaign adviser ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>SEATTLE/WASHINGTON (Reuters) - President Donal...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23476</th>\n",
       "      <td>21st Century Wire says As 21WIRE reported earl...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23477</th>\n",
       "      <td>21st Century Wire says It s a familiar theme. ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23478</th>\n",
       "      <td>Patrick Henningsen  21st Century WireRemember ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23479</th>\n",
       "      <td>21st Century Wire says Al Jazeera America will...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23480</th>\n",
       "      <td>21st Century Wire says As 21WIRE predicted in ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>44898 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    text  Value\n",
       "0      WASHINGTON (Reuters) - The head of a conservat...      1\n",
       "1      WASHINGTON (Reuters) - Transgender people will...      1\n",
       "2      WASHINGTON (Reuters) - The special counsel inv...      1\n",
       "3      WASHINGTON (Reuters) - Trump campaign adviser ...      1\n",
       "4      SEATTLE/WASHINGTON (Reuters) - President Donal...      1\n",
       "...                                                  ...    ...\n",
       "23476  21st Century Wire says As 21WIRE reported earl...      0\n",
       "23477  21st Century Wire says It s a familiar theme. ...      0\n",
       "23478  Patrick Henningsen  21st Century WireRemember ...      0\n",
       "23479  21st Century Wire says Al Jazeera America will...      0\n",
       "23480  21st Century Wire says As 21WIRE predicted in ...      0\n",
       "\n",
       "[44898 rows x 2 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# merging dataset\n",
    "dataset = pd.concat([true_news,fake_news],axis=0)\n",
    "display(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 44898 entries, 0 to 23480\n",
      "Data columns (total 2 columns):\n",
      " #   Column  Non-Null Count  Dtype \n",
      "---  ------  --------------  ----- \n",
      " 0   text    44898 non-null  object\n",
      " 1   Value   44898 non-null  int64 \n",
      "dtypes: int64(1), object(1)\n",
      "memory usage: 1.0+ MB\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "None"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(dataset.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "### STD"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "y_train std: 0.49939397301167954\n",
      "y_val std: 0.4997839588710888\n",
      "y_test std: 0.4998194469400359\n"
     ]
    },
    {
     "data": {
      "text/markdown": [
       "### MEAN"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "y_train mean: 0.475249178684782\n",
      "y_val mean: 0.4835189309576837\n",
      "y_test mean: 0.4846325167037862\n"
     ]
    },
    {
     "data": {
      "text/markdown": [
       "### Count"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "y_train count: 35918\n",
      "y_val count: 4490\n",
      "y_test count: 4490\n"
     ]
    }
   ],
   "source": [
    "# creating train, val , test datasets dataset 8:1:1\n",
    "X_train, X_val_test, y_train, y_valtest = train_test_split(dataset[\"text\"],dataset[\"Value\"],test_size=0.2, shuffle=True)\n",
    "X_test, X_val, y_test, y_val = train_test_split(X_val_test,y_valtest,test_size=0.5, shuffle=True)\n",
    "display(Markdown(\"### STD\"))\n",
    "print(f\"y_train std: {y_train.std()}\")\n",
    "print(f\"y_val std: {y_val.std()}\")\n",
    "print(f\"y_test std: {y_test.std()}\")\n",
    "\n",
    "display(Markdown(\"### MEAN\"))\n",
    "print(f\"y_train mean: {y_train.mean()}\")\n",
    "print(f\"y_val mean: {y_val.mean()}\")\n",
    "print(f\"y_test mean: {y_test.mean()}\")\n",
    "\n",
    "display(Markdown(\"### Count\"))\n",
    "print(f\"y_train count: {y_train.count()}\")\n",
    "print(f\"y_val count: {y_val.count()}\")\n",
    "print(f\"y_test count: {y_test.count()}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dl",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "6e9239598a6712340c2b580c5c929949b8a813e86738fb7cf0a67c11d0863b74"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}