---
# --------------------------------------------------------------------------
# Example Configuration for the Ensemble Genetic Algorithm Project
#
# To use, copy this file to `config.yml` and uncomment/edit the parameters
# you wish to change. Any parameter not specified in `config.yml` will use
# its hardcoded default value, which is shown here for reference.
# --------------------------------------------------------------------------

# --- Global Parameters ---
# These settings control the overall behavior of the experiment.
# Corresponds to attributes in `ml_grid.util.global_params.global_parameters`.
global_params:
  # Path to the input dataset CSV file.
  # Default: "synthetic_data_for_testing.csv"
  # input_csv_path: "path/to/your/data.csv"

  # The total number of grid search iterations to perform.
  # Default: 1
  # n_iter: 50 # For a full experiment, a higher number is recommended.

  # If True, enables testing mode (e.g., smaller data samples).
  # Default: True
  # testing: False

  # Number of samples to use for the test set.
  # Default: 500
  # test_sample_n: 500

  # Number of columns to sample from the data (for testing). 0 means all columns.
  # Default: 30
  # column_sample_n: 30

  # List of base learner models to include in the search space.
  # Default: A list of all available models.
  # model_list:
  #   - "logisticRegression"
  #   - "perceptron"
  #   - "extraTrees"
  #   - "randomForest"
  #   - "kNearestNeighbors"
  #   - "XGBoost"
  #   - "DecisionTreeClassifier"
  #   - "AdaBoostClassifier"
  #   - "elasticNeuralNetwork"
  #   - "GaussianNB"
  #   - "QuadraticDiscriminantAnalysis"
  #   - "SVC"
  #   - "DummyModel" # For testing purposes

  # Verbosity level for console output (0-9).
  # Default: 3
  # verbose: 3

  # Debug level for detailed logging.
  # Default: 0
  # debug_level: 0

  # Number of parallel jobs for KNN models. -1 uses all available processors.
  # Default: -1
  # knn_n_jobs: -1

  # If True, sanitizes column names for compatibility with libraries like XGBoost.
  # Default: True
  # rename_cols: True

  # If True, raises exceptions during grid search; otherwise, logs them.
  # Default: True
  # error_raise: True

  # If True, uses RandomizedSearchCV instead of GridSearchCV.
  # Default: True
  # random_grid_search: True

  # Percentage of the parameter space to sample in a random grid search.
  # Default: 0.001
  # sub_sample_param_space_pct: 0.001

  # Number of parallel jobs for grid search. -1 uses all available cores.
  # Default: 4
  # grid_n_jobs: 4

  # Time in seconds after which a warning is printed for long model training times.
  # Default: 60
  # model_train_time_warning_threshold: 60

  # If True, saves trained base learners to disk. Can use a lot of space.
  # Note: This is a global default. The `store_base_learners` setting
  # in `grid_params` will override this for each individual grid search run.
  # Default: False
  # store_base_learners: False

  # Number of generations without improvement before the GA stops early.
  # Default: 5
  # gen_eval_score_threshold_early_stopping: 5

  # The base name for the log file that stores experiment results.
  # Default: "log_store_dataframe"
  # log_store_dataframe_path: "log_store_dataframe"

  # The root directory for saving project outputs (logs, models, etc.).
  # Default: "HFE_GA_experiments"
  # base_project_dir: "HFE_GA_experiments"

# --- Genetic Algorithm Parameters ---
# These control the evolution process itself (population size, generations, etc.).
# Corresponds to attributes in `ml_grid.util.grid_param_space_ga.Grid`.
# Note: These values are overridden if `global_params.testing` is `True`,
# which is its default. Set `testing: False` under `global_params` for the
# values below to take effect.
ga_params:
  # List of possible values for the number of base learners in an ensemble.
  # Default: [4, 8, 16, 32, 64]
  # nb_params: [4, 8, 16, 32, 64]

  # List of possible values for the population size.
  # Default: [32, 64, 128]
  # pop_params: [32, 64, 128]

  # List of possible values for the number of generations.
  # Default: [128]
  # g_params: [128]

# --- Grid Search Hyperparameters ---
# This section defines the search space for each experiment iteration.
# Corresponds to the `grid` dictionary in `ml_grid.util.grid_param_space_ga.Grid`.
grid_params:
  # --- Ensemble & Preprocessing Parameters ---
  # Methods for weighting base learner predictions. Options: "ann", "de", "unweighted".
  # Default: ["ann", "de", "unweighted"]
  # weighted: ["ann", "de", "unweighted"]

  # Whether to reuse stored base learners from previous runs.
  # Default: [False]
  # use_stored_base_learners: [False]

  # Whether to store newly trained base learners to disk for this run.
  # Default: [False]
  # store_base_learners: [False]

  # Resampling methods for imbalanced data. Options: "undersample", "oversample", null.
  # Note: YAML spells Python's `None` as `null`; a bare `None` is read as the string "None".
  # Default: ["undersample", "oversample", null]
  # resample: ["undersample", "oversample", null]

  # Whether to scale features.
  # Default: [True]
  # scale: [True]

  # Number of features to use.
  # Default: ["all"]
  # n_features: ["all"]

  # Size of hyperparameter space for base learners.
  # Default: ["medium"]
  # param_space_size: ["medium"]

  # Number of unique outcomes.
  # Default: [10]
  # n_unique_out: [10]

  # Outcome variable identifier (e.g., "outcome_var_1").
  # Default: ["1"]
  # outcome_var_n: ["1"]

  # Diversity penalty factor for ensemble evaluation.
  # Default: [0]
  # div_p: [0]

  # --- Feature Selection Parameters ---
  # Correlation threshold for removing columns.
  # Default: [0.9, 0.99]
  # corr: [0.9, 0.99]

  # Percentage of missing data allowed in a column before it's removed.
  # Default: [99.9, 99.8, 99.7]
  # percent_missing: [99.9, 99.8, 99.7]

  # Method for feature selection.
  # Default: ["anova"]
  # feature_selection_method: ["anova"]

  # --- Genetic Algorithm Hyperparameters (per grid search run) ---
  # Crossover probability.
  # Default: [0.5, 0.75, 0.25]
  # cxpb: [0.5, 0.75, 0.25]

  # Mutation probability.
  # Default: [0.2, 0.4, 0.8]
  # mutpb: [0.2, 0.4, 0.8]

  # Probability of individual gene mutation.
  # Default: [0.025, 0.05, 0.075]
  # indpb: [0.025, 0.05, 0.075]

  # Tournament size for selection.
  # Default: [3, 6, 9]
  # t_size: [3, 6, 9]

  # --- Data Feature Selection ---
  # `data` is a list holding a single nested dictionary (keep the enclosing
  # brackets). That dictionary controls which groups of features are included
  # in the search space.
  # The default for each feature group is a list of booleans, e.g., [True, False],
  # meaning the grid search will test both including and excluding that group.
  # To force inclusion, use [True]. To force exclusion, use [False].
  # data: [
  #   {
  #     age: [True],
  #     sex: [True],
  #     bmi: [True],
  #     ethnicity: [True],
  #     bloods: [True, False],
  #     diagnostic_order: [True, False],
  #     drug_order: [True, False],
  #     annotation_n: [True, False],
  #     meta_sp_annotation_n: [True, False],
  #     annotation_mrc_n: [True, False],
  #     meta_sp_annotation_mrc_n: [True, False],
  #     core_02: [True],
  #     bed: [True],
  #     vte_status: [False],
  #     hosp_site: [False],
  #     core_resus: [False],
  #     news: [True],
  #     date_time_stamp: [False],
  #     appointments: [False]
  #   }
  # ]