80 lines
2.5 KiB
Python
80 lines
2.5 KiB
Python
import argparse
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
# Command-line interface: the script builds the dataset (--dataset), computes
# label statistics (--stats), or does both when both flags are passed.
parser = argparse.ArgumentParser(
    prog="Dataset", description="creating dataset or loading stats"
)
for run_flag in ("--dataset", "--stats"):
    # Each run type is a plain on/off switch that is off unless given.
    parser.add_argument(run_flag, action="store_true", default=False)

args = parser.parse_args()

# At least one run type has to be requested, otherwise there is nothing to do.
if not args.dataset and not args.stats:
    raise ValueError("NO RUN TYPE SPECIFIED")
|
|
|
|
if args.dataset:
    # Build the labelled dataset from the two raw news CSV files.
    TRUE_NEWS_PATH = Path("data/True.csv")
    FAKE_NEWS_PATH = Path("data/Fake.csv")

    DATA_PATH = TRUE_NEWS_PATH.parent
    DATASET_PATH = DATA_PATH / "dataset"

    # loading datasets
    true_news = pd.read_csv(TRUE_NEWS_PATH)
    fake_news = pd.read_csv(FAKE_NEWS_PATH)

    # cleaning dataset: drop the title/subject/date columns
    true_news = true_news.drop(columns=["title", "subject", "date"])
    fake_news = fake_news.drop(columns=["title", "subject", "date"])

    # setting binary classification: rows from True.csv get 1, rows from Fake.csv get 0
    true_news["Value"] = 1
    fake_news["Value"] = 0

    # merging dataset
    # ignore_index gives the merged frame a unique 0..n-1 index instead of two
    # overlapping ranges, so the index-aligned column assignment below is unambiguous.
    dataset = pd.concat([true_news, fake_news], axis=0, ignore_index=True)

    # After stripping, whitespace-only articles are "" rather than NaN, and
    # dropna() does not treat "" as missing, so blank them out explicitly first.
    text = dataset["text"].str.strip()
    dataset["text"] = text.mask(text == "")

    # dropna() returns a new frame; the result has to be assigned back,
    # otherwise the rows without text are silently kept.
    dataset = dataset.dropna(axis=0, how="any", subset=["text"])

elif args.stats:
    # Stats-only run: reuse the dataset file written by an earlier --dataset run.
    DATA_PATH = Path("data")
    DATASET_FILE_PATH = DATA_PATH / "dataset/dataset.csv"
    dataset = pd.read_csv(DATASET_FILE_PATH)
|
|
# Splitting dataset for train, val, test.
# First cut: 80% train, 20% held out (fixed seed so reruns give the same split).
features, labels = dataset["text"], dataset["Value"]
X_train, X_holdout, y_train, y_holdout = train_test_split(
    features, labels, test_size=0.2, shuffle=True, random_state=20
)

# Second cut: the held-out part is halved into the test and validation sets.
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, shuffle=True, random_state=21
)

# Re-attach each label column to its text column, one frame per split.
train_data, val_data, test_data = (
    pd.concat(pair, axis=1)
    for pair in ([X_train, y_train], [X_val, y_val], [X_test, y_test])
)
|
|
|
|
if args.dataset:
    # Persist the merged dataset and its three splits side by side,
    # creating the output directory on first use.
    DATASET_PATH.mkdir(parents=True, exist_ok=True)
    outputs = {
        "dataset.csv": dataset,
        "train.csv": train_data,
        "val.csv": val_data,
        "test.csv": test_data,
    }
    for file_name, frame in outputs.items():
        # index=False: the row index is not written to the file.
        frame.to_csv(DATASET_PATH / file_name, index=False)
|
|
|
|
# Print the dataset frame to stdout.
# NOTE(review): indentation was lost in this copy of the file — confirm whether
# this call belongs inside the preceding `if args.dataset:` block or is meant
# to run in every mode.
print(dataset)
|
|
|
|
if args.stats:
    # Summarise the label column of each split: one row per statistic,
    # one column per split.
    label_splits = (y_train, y_val, y_test)
    rows = {
        "std": [split.std() for split in label_splits],
        "mean": [split.mean() for split in label_splits],
        "count": [split.count() for split in label_splits],
    }
    stats = pd.DataFrame(
        data=list(rows.values()),
        index=list(rows),
        columns=["train", "val", "test"],
    )

    # Save the table next to the data and echo it to stdout.
    stats.to_csv(DATA_PATH / "stats.csv")
    print(stats)
|