ium_424714/dane.ipynb

17 KiB
Raw Blame History

import numpy as np
import pandas as pd
from IPython.display import display,Markdown
from sklearn.model_selection import train_test_split
# Locations of the two source CSV files (one per class).
TRUE_NEWS_PATH = "data/True.csv"
FAKE_NEWS_PATH = "data/Fake.csv"

# Load the datasets, keeping only the article body.
# The 'title', 'subject' and 'date' columns are not used, so they are skipped
# at parse time via `usecols` instead of being read and then dropped.
true_news = pd.read_csv(TRUE_NEWS_PATH, usecols=["text"])
fake_news = pd.read_csv(FAKE_NEWS_PATH, usecols=["text"])

Setting binary classification values

# Class labels for binary classification: 1 = true news, 0 = fake news.
true_news['Value'] = 1
fake_news['Value'] = 0

# Preview each class: heading, schema summary, first ten rows.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
display(Markdown(r"### True news"))
true_news.info()
display(true_news.head(10))
display(Markdown(r"### Fake news"))
fake_news.info()
display(fake_news.head(10))

True news

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    21417 non-null  object
 1   Value   21417 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 334.8+ KB
None
text Value
0 WASHINGTON (Reuters) - The head of a conservat... 1
1 WASHINGTON (Reuters) - Transgender people will... 1
2 WASHINGTON (Reuters) - The special counsel inv... 1
3 WASHINGTON (Reuters) - Trump campaign adviser ... 1
4 SEATTLE/WASHINGTON (Reuters) - President Donal... 1
5 WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T... 1
6 WEST PALM BEACH, Fla (Reuters) - President Don... 1
7 The following statements were posted to the ve... 1
8 The following statements were posted to the ve... 1
9 WASHINGTON (Reuters) - Alabama Secretary of St... 1

Fake news

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    23481 non-null  object
 1   Value   23481 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 367.0+ KB
None
text Value
0 Donald Trump just couldn t wish all Americans ... 0
1 House Intelligence Committee Chairman Devin Nu... 0
2 On Friday, it was revealed that former Milwauk... 0
3 On Christmas day, Donald Trump announced that ... 0
4 Pope Francis used his annual Christmas Day mes... 0
5 The number of cases of cops brutalizing and ki... 0
6 Donald Trump spent a good portion of his day a... 0
7 In the wake of yet another court decision that... 0
8 Many people have raised the alarm regarding th... 0
9 Just when you might have thought we d get a br... 0
# Merge both classes into a single dataset (true news first, then fake news).
# ignore_index=True renumbers the rows 0..n-1; without it each frame keeps its
# own index, so the same labels appear twice in the merged frame.
dataset = pd.concat([true_news, fake_news], axis=0, ignore_index=True)
display(dataset)
text Value
0 WASHINGTON (Reuters) - The head of a conservat... 1
1 WASHINGTON (Reuters) - Transgender people will... 1
2 WASHINGTON (Reuters) - The special counsel inv... 1
3 WASHINGTON (Reuters) - Trump campaign adviser ... 1
4 SEATTLE/WASHINGTON (Reuters) - President Donal... 1
... ... ...
23476 21st Century Wire says As 21WIRE reported earl... 0
23477 21st Century Wire says It s a familiar theme. ... 0
23478 Patrick Henningsen 21st Century WireRemember ... 0
23479 21st Century Wire says Al Jazeera America will... 0
23480 21st Century Wire says As 21WIRE predicted in ... 0

44898 rows × 2 columns

# info() prints the summary itself and returns None, so display() is not needed
# (it would only render an extra "None" line).
dataset.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 0 to 23480
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    44898 non-null  object
 1   Value   44898 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.0+ MB
None
# Create the train / validation / test subsets in an 8:1:1 ratio.
# A fixed seed makes the split reproducible between runs, and stratifying on
# the label keeps the true/fake proportion the same in every subset.
SPLIT_SEED = 42

# Step 1: keep 80% for training, hold out 20% for validation + test.
X_train, X_val_test, y_train, y_valtest = train_test_split(
    dataset["text"],
    dataset["Value"],
    test_size=0.2,
    shuffle=True,
    random_state=SPLIT_SEED,
    stratify=dataset["Value"],
)

# Step 2: halve the hold-out part into equally sized test and validation sets.
X_test, X_val, y_test, y_val = train_test_split(
    X_val_test,
    y_valtest,
    test_size=0.5,
    shuffle=True,
    random_state=SPLIT_SEED,
    stratify=y_valtest,
)
# Report the label statistics of every subset to check that the classes are
# comparably balanced and that the subset sizes match the intended ratio.
label_splits = (
    ("y_train", y_train),
    ("y_val", y_val),
    ("y_test", y_test),
)
# (markdown heading, name of the Series method used for that section)
report_sections = (
    ("### STD", "std"),
    ("### MEAN", "mean"),
    ("### Count", "count"),
)

for heading, stat_name in report_sections:
    display(Markdown(heading))
    for split_name, labels in label_splits:
        stat_value = getattr(labels, stat_name)()
        print(f"{split_name} {stat_name}: {stat_value}")

STD

y_train std: 0.49939397301167954
y_val std: 0.4997839588710888
y_test std: 0.4998194469400359

MEAN

y_train mean: 0.475249178684782
y_val mean: 0.4835189309576837
y_test mean: 0.4846325167037862

Count

y_train count: 35918
y_val count: 4490
y_test count: 4490