"""Build the true/fake news dataset splits and/or report label statistics.

Run with ``--dataset`` to merge ``data/True.csv`` and ``data/Fake.csv`` into a
labelled dataset and write ``dataset.csv``, ``train.csv``, ``val.csv`` and
``test.csv`` under ``data/dataset``.  Run with ``--stats`` to write the
std/mean/count of the label column for each split to ``data/stats.csv``.
Both flags may be combined; at least one is required.
"""
import argparse
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

parser = argparse.ArgumentParser(
    prog="Dataset",
    description="creating dataset or loading stats",
)
parser.add_argument("--dataset", action="store_true", default=False)
parser.add_argument("--stats", action="store_true", default=False)
args = parser.parse_args()

if not (args.dataset or args.stats):
    raise ValueError("NO RUN TYPE SPECIFIED")

if args.dataset:
    TRUE_NEWS_PATH = Path("data/True.csv")
    FAKE_NEWS_PATH = Path("data/Fake.csv")
    DATA_PATH = TRUE_NEWS_PATH.parent
    DATASET_PATH = DATA_PATH / "dataset"

    # Load the raw sources.
    true_news = pd.read_csv(TRUE_NEWS_PATH)
    fake_news = pd.read_csv(FAKE_NEWS_PATH)

    # Drop the columns that are not used downstream.
    true_news = true_news.drop(columns=["title", "subject", "date"])
    fake_news = fake_news.drop(columns=["title", "subject", "date"])

    # Binary label: 1 = true news, 0 = fake news.
    true_news["Value"] = 1
    fake_news["Value"] = 0

    # Merge both sources into a single frame.
    dataset = pd.concat([true_news, fake_news], axis=0)
    dataset["text"] = dataset["text"].str.strip()
    # dropna returns a new frame unless inplace=True, so the result must be
    # assigned back; otherwise rows with a missing text are silently kept.
    dataset = dataset.dropna(axis=0, how="any", subset=["text"])
    # NOTE(review): whitespace-only texts become "" after strip() and are not
    # NaN, so they survive this dropna -- confirm whether they should be
    # removed as well.
elif args.stats:
    # Stats-only run: reuse the dataset written by an earlier --dataset run.
    DATA_PATH = Path("data")
    DATASET_FILE_PATH = DATA_PATH / "dataset/dataset.csv"
    dataset = pd.read_csv(DATASET_FILE_PATH)

# Split into train (80%) and a held-out remainder (20%), then halve the
# remainder into test and val.  Fixed seeds keep the splits reproducible.
X_train, X_val_test, y_train, y_valtest = train_test_split(
    dataset["text"], dataset["Value"], test_size=0.2, shuffle=True, random_state=20
)
X_test, X_val, y_test, y_val = train_test_split(
    X_val_test, y_valtest, test_size=0.5, shuffle=True, random_state=21
)

train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

if args.dataset:
    DATASET_PATH.mkdir(parents=True, exist_ok=True)
    dataset.to_csv((DATASET_PATH / "dataset.csv"), index=False)
    train_data.to_csv((DATASET_PATH / "train.csv"), index=False)
    val_data.to_csv((DATASET_PATH / "val.csv"), index=False)
    test_data.to_csv((DATASET_PATH / "test.csv"), index=False)
    print(dataset)

if args.stats:
    # One row per statistic, one column per split, computed on the labels.
    std_stats = [y_train.std(), y_val.std(), y_test.std()]
    mean_stats = [y_train.mean(), y_val.mean(), y_test.mean()]
    count_stats = [y_train.count(), y_val.count(), y_test.count()]
    stats = pd.DataFrame(
        data=[std_stats, mean_stats, count_stats],
        index=["std", "mean", "count"],
        columns=["train", "val", "test"],
    )
    stats.to_csv((DATA_PATH / "stats.csv"))
    print(stats)