Module dalex.datasets
Expand source code Browse git
from ._load import load_titanic, load_fifa, load_apartments, load_apartments_test, \
load_dragons, load_dragons_test, load_hr, load_hr_test, load_german
# Public API of dalex.datasets: these are the loader functions imported from
# ._load above, and the only names exported by `from dalex.datasets import *`.
__all__ = [
"load_titanic",
"load_fifa",
"load_apartments",
"load_apartments_test",
"load_dragons",
"load_dragons_test",
"load_hr",
"load_hr_test",
"load_german"
]
Functions
def load_apartments()
-
Loads the artificial 'apartments' dataset
Datasets 'apartments' and 'apartments_test' are artificial, generated from the same model. Structure of the dataset is copied from the real dataset from the PBImisc R package, but they were generated in a way to mimic the effect of Anscombe quartet for complex black-box models.
Returns
pd.DataFrame
Expand source code Browse git
def load_apartments():
    """Return the artificial 'apartments' dataset

    'apartments' and 'apartments_test' are artificial datasets generated
    from the same model. Their structure is copied from a real dataset in
    the PBImisc R package, but the values were generated so as to mimic the
    effect of the Anscombe quartet for complex black-box models.

    Returns
    -----------
    pd.DataFrame
    """

    # The CSV ships in the 'data' directory next to this module.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(module_dir, 'data', 'apartments.csv')

    # The first CSV column becomes the row index (noted upstream as 1:1000).
    return pd.read_csv(csv_path, index_col=0)
def load_apartments_test()
-
Loads the artificial 'apartments_test' dataset
Datasets 'apartments' and 'apartments_test' are artificial, generated from the same model. Structure of the dataset is copied from the real dataset from the PBImisc R package, but they were generated in a way to mimic the effect of Anscombe quartet for complex black-box models.
Returns
pd.DataFrame
Expand source code Browse git
def load_apartments_test():
    """Return the artificial 'apartments_test' dataset

    'apartments' and 'apartments_test' are artificial datasets generated
    from the same model. Their structure is copied from a real dataset in
    the PBImisc R package, but the values were generated so as to mimic the
    effect of the Anscombe quartet for complex black-box models.

    Returns
    -----------
    pd.DataFrame
    """

    # The CSV ships in the 'data' directory next to this module.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(module_dir, 'data', 'apartments_test.csv')

    # The first CSV column becomes the row index (noted upstream as 1001:9000).
    return pd.read_csv(csv_path, index_col=0)
def load_dragons()
-
Load the artificial 'dragons' dataset
Datasets 'dragons' and 'dragons_test' are artificial, generated from the same ground truth model, but sometimes with a different data distribution.
Returns
pd.DataFrame
Expand source code Browse git
def load_dragons():
    """Return the artificial 'dragons' dataset

    'dragons' and 'dragons_test' are artificial datasets generated from the
    same ground truth model, but sometimes with a different data
    distribution.

    Returns
    -----------
    pd.DataFrame
    """

    # The CSV ships in the 'data' directory next to this module.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(module_dir, 'data', 'dragons.csv')

    # The first CSV column becomes the row index (noted upstream as 1:n).
    return pd.read_csv(csv_path, index_col=0)
def load_dragons_test()
-
Load the artificial 'dragons_test' dataset
Datasets 'dragons' and 'dragons_test' are artificial, generated from the same ground truth model, but sometimes with a different data distribution.
Returns
pd.DataFrame
Expand source code Browse git
def load_dragons_test():
    """Return the artificial 'dragons_test' dataset

    'dragons' and 'dragons_test' are artificial datasets generated from the
    same ground truth model, but sometimes with a different data
    distribution.

    Returns
    -----------
    pd.DataFrame
    """

    # The CSV ships in the 'data' directory next to this module.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(module_dir, 'data', 'dragons_test.csv')

    # The first CSV column becomes the row index (noted upstream as 1:n).
    return pd.read_csv(csv_path, index_col=0)
def load_fifa()
-
Load the preprocessed 'players_20' dataset
Load 'fifa', the preprocessed 'players_20.csv' dataset, which comes as a part of the 'FIFA 20 complete player dataset' on Kaggle. It contains the 5000 best players by 'overall' and 43 variables. These are:
- short_name (index)
- nationality of the player (not used in modeling)
- overall, potential, value_eur, wage_eur (4 potential target variables)
- age, height, weight, attacking skills, defending skills, goalkeeping skills (37 variables)
It is advised to leave only one target variable for modeling.
Format: pd.DataFrame with 5000 rows, 42 columns and index
Source: https://www.kaggle.com/stefanoleone992/fifa-20-complete-player-dataset#players_20.csv January 1, 2020
License: see file ./data/LICENSE-DATA.txt
Returns
pd.DataFrame
Expand source code Browse git
def load_fifa():
    """Return the preprocessed 'players_20' dataset

    Load 'fifa', the preprocessed 'players_20.csv' dataset which comes as
    a part of 'FIFA 20 complete player dataset' at 'Kaggle'.
    It contains 5000 'overall' best players and 43 variables. These are:
        - short_name (index)
        - nationality of the player (not used in modeling)
        - overall, potential, value_eur, wage_eur
          (4 potential target variables)
        - age, height, weight, attacking skills, defending skills,
          goalkeeping skills (37 variables)

    It is advised to leave only one target variable for modeling.

    Format: pd.DataFrame with 5000 rows, 42 columns and index

    Source: https://www.kaggle.com/stefanoleone992/fifa-20-complete-player-dataset#players_20.csv
    January 1, 2020

    License: see file ./data/LICENSE-DATA.txt

    Returns
    -----------
    pd.DataFrame
    """

    # The CSV ships in the 'data' directory next to this module.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(module_dir, 'data', 'fifa.csv')

    # The first CSV column (short_name, per the description above) is the index.
    return pd.read_csv(csv_path, index_col=0)
def load_german()
-
Load the preprocessed 'German Credit' dataset
Dataset 'german' contains information about people and their credit risk. Based on age, purpose, credit amount, job, sex, etc., the model should predict the target, risk. The risk variable tells whether the credit rate will be good (1) or bad (0). This data contains some bias, which can be detected using the dalex.fairness module.
Format: pd.DataFrame with 1000 rows and 10 columns
Source: https://archive.ics.uci.edu/ml/datasets/Statlog+(German+Credit+Data)
Kaggle: https://www.kaggle.com/kabure/german-credit-data-with-risk/
License: see file ./data/LICENSE-DATA.txt
Returns
pd.DataFrame
Expand source code Browse git
def load_german():
    """Return the preprocessed 'German Credit' dataset

    Dataset 'german' contains information about people and their credit
    risk. Based on age, purpose, credit amount, job, sex, etc. the model
    should predict the target - risk. The risk column tells whether the
    credit rate will be good (1) or bad (0). This data contains some bias,
    which can be detected using the dalex.fairness module.

    Format: pd.DataFrame with 1000 rows and 10 columns

    Source: https://archive.ics.uci.edu/ml/datasets/Statlog+(German+Credit+Data)
    Kaggle: https://www.kaggle.com/kabure/german-credit-data-with-risk/

    License: see file ./data/LICENSE-DATA.txt

    Returns
    -----------
    pd.DataFrame
    """

    # The CSV ships in the 'data' directory next to this module.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(module_dir, 'data', 'german.csv')

    # index_col=False: no CSV column is taken as the index here.
    return pd.read_csv(csv_path, index_col=False)
def load_hr()
-
Load the artificial 'HR' dataset
Datasets 'HR' and 'HR_test' are artificial, generated from the same model. Structure of the dataset is based on the real data from the Human Resources department containing information about which employees were promoted or fired.
Returns
pd.DataFrame
Expand source code Browse git
def load_hr():
    """Return the artificial 'HR' dataset

    'HR' and 'HR_test' are artificial datasets generated from the same
    model. Their structure is based on real data from a Human Resources
    department containing information about which employees were promoted
    or fired.

    Returns
    -----------
    pd.DataFrame
    """

    # The CSV ships in the 'data' directory next to this module.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(module_dir, 'data', 'hr.csv')

    # The first CSV column becomes the row index
    # (noted upstream as 7847 numbers from 1:n).
    return pd.read_csv(csv_path, index_col=0)
def load_hr_test()
-
Load the artificial 'HR_test' dataset
Datasets 'HR' and 'HR_test' are artificial, generated from the same model. Structure of the dataset is based on the real data from the Human Resources department containing information about which employees were promoted or fired.
Returns
pd.DataFrame
Expand source code Browse git
def load_hr_test():
    """Return the artificial 'HR_test' dataset

    'HR' and 'HR_test' are artificial datasets generated from the same
    model. Their structure is based on real data from a Human Resources
    department containing information about which employees were promoted
    or fired.

    Returns
    -----------
    pd.DataFrame
    """

    # The CSV ships in the 'data' directory next to this module.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(module_dir, 'data', 'hr_test.csv')

    # The first CSV column becomes the row index
    # (noted upstream as 7847 numbers from 1:n).
    return pd.read_csv(csv_path, index_col=0)
def load_titanic()
-
Load the preprocessed 'titanic' dataset
Details: https://modeloriented.github.io/DALEX/reference/titanic.html
Returns
pd.DataFrame
Expand source code Browse git
def load_titanic():
    """Return the preprocessed 'titanic' dataset

    Details: https://modeloriented.github.io/DALEX/reference/titanic.html

    Returns
    -----------
    pd.DataFrame
    """

    # The CSV ships in the 'data' directory next to this module.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(module_dir, 'data', 'titanic.csv')

    # No index column is requested, so pandas' default index handling applies.
    return pd.read_csv(csv_path)