ium_424714/dane.ipynb at 97e9f584e7a0536094c5a50d7bff17cd876e0734

import numpy as np
import pandas as pd
from IPython.display import display,Markdown
from sklearn.model_selection import train_test_split

# Locations of the two source CSV files (genuine and fabricated articles).
TRUE_NEWS_PATH = "data/True.csv"
FAKE_NEWS_PATH = "data/Fake.csv"

# Load each CSV and immediately discard the 'title', 'subject' and 'date'
# columns, which are not used further on.
true_news, fake_news = (
    pd.read_csv(csv_path).drop(columns=['title', 'subject', 'date'])
    for csv_path in (TRUE_NEWS_PATH, FAKE_NEWS_PATH)
)

Setting binary classification values

# Attach the binary classification target: 1 for true news, 0 for fake news.
true_news['Value'] = 1
fake_news['Value'] = 0

# Preview both frames: a heading, the column summary, then the first 10 rows.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly -- wrapping it in display() would render a stray "None" afterwards.
display(Markdown(r"### True news"))
true_news.info()
display(true_news.head(10))
display(Markdown(r"### Fake news"))
fake_news.info()
display(fake_news.head(10))

True news

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    21417 non-null  object
 1   Value   21417 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 334.8+ KB

None

	text	Value
0	WASHINGTON (Reuters) - The head of a conservat...	1
1	WASHINGTON (Reuters) - Transgender people will...	1
2	WASHINGTON (Reuters) - The special counsel inv...	1
3	WASHINGTON (Reuters) - Trump campaign adviser ...	1
4	SEATTLE/WASHINGTON (Reuters) - President Donal...	1
5	WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...	1
6	WEST PALM BEACH, Fla (Reuters) - President Don...	1
7	The following statements were posted to the ve...	1
8	The following statements were posted to the ve...	1
9	WASHINGTON (Reuters) - Alabama Secretary of St...	1

Fake news

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    23481 non-null  object
 1   Value   23481 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 367.0+ KB

None

	text	Value
0	Donald Trump just couldn t wish all Americans ...	0
1	House Intelligence Committee Chairman Devin Nu...	0
2	On Friday, it was revealed that former Milwauk...	0
3	On Christmas day, Donald Trump announced that ...	0
4	Pope Francis used his annual Christmas Day mes...	0
5	The number of cases of cops brutalizing and ki...	0
6	Donald Trump spent a good portion of his day a...	0
7	In the wake of yet another court decision that...	0
8	Many people have raised the alarm regarding th...	0
9	Just when you might have thought we d get a br...	0

# Merge both labelled frames into a single dataset (true news rows first).
# ignore_index=True builds a fresh 0..n-1 index; without it the row labels of
# both source frames are kept and every label up to the shorter frame's length
# appears twice, which makes label-based lookups (dataset.loc[i]) ambiguous.
dataset = pd.concat([true_news, fake_news], axis=0, ignore_index=True)
display(dataset)

	text	Value
0	WASHINGTON (Reuters) - The head of a conservat...	1
1	WASHINGTON (Reuters) - Transgender people will...	1
2	WASHINGTON (Reuters) - The special counsel inv...	1
3	WASHINGTON (Reuters) - Trump campaign adviser ...	1
4	SEATTLE/WASHINGTON (Reuters) - President Donal...	1
...	...	...
23476	21st Century Wire says As 21WIRE reported earl...	0
23477	21st Century Wire says It s a familiar theme. ...	0
23478	Patrick Henningsen 21st Century WireRemember ...	0
23479	21st Century Wire says Al Jazeera America will...	0
23480	21st Century Wire says As 21WIRE predicted in ...	0

44898 rows × 2 columns

# DataFrame.info() prints the summary itself and returns None, so it is not
# wrapped in display() (that would only add a stray "None" to the output).
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 0 to 23480
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    44898 non-null  object
 1   Value   44898 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.0+ MB

None

# Create train / validation / test splits in an 8:1:1 ratio.
# A fixed seed makes the split reproducible between runs, and stratifying on
# the label keeps the true/fake proportion the same in every subset.
RANDOM_STATE = 42
X_train, X_val_test, y_train, y_valtest = train_test_split(
    dataset["text"],
    dataset["Value"],
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
    stratify=dataset["Value"],
)
# Halve the 20% hold-out into equally sized test and validation parts.
X_test, X_val, y_test, y_val = train_test_split(
    X_val_test,
    y_valtest,
    test_size=0.5,
    shuffle=True,
    random_state=RANDOM_STATE,
    stratify=y_valtest,
)

# Report label statistics per split so class balance and split sizes can be
# checked at a glance: one heading per statistic, one line per split.
label_splits = {"y_train": y_train, "y_val": y_val, "y_test": y_test}
for heading, stat in (("STD", "std"), ("MEAN", "mean"), ("Count", "count")):
    display(Markdown(f"### {heading}"))
    for split_name, labels in label_splits.items():
        print(f"{split_name} {stat}: {getattr(labels, stat)()}")

STD

y_train std: 0.49939397301167954
y_val std: 0.4997839588710888
y_test std: 0.4998194469400359

MEAN

y_train mean: 0.475249178684782
y_val mean: 0.4835189309576837
y_test mean: 0.4846325167037862

Count

y_train count: 35918
y_val count: 4490
y_test count: 4490

17 KiB Raw Blame History Unescape Escape

Setting binary classification values

True news

Fake news

STD

MEAN

Count

17 KiB

Raw Blame History