Refactor code to comply with Black formatting standards

SamoraHunter · SamoraHunter · commit 1bc3bbb2a634 · 2026-01-16T18:47:28.000Z
This commit applies automated Black formatting to improve code readability and consistency; it makes no functional changes. Specifically, it fixes line-length violations, argument wrapping, missing trailing commas, and whitespace-only blank lines across several modules, including the H2O classifier wrapper, the grid search pipeline, and utility modules.

Changes include:

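- Wrapping long calls, collapsing a short call onto one line, adding trailing commas, and stripping trailing whitespace in H2OBaseClassifier.py.
- Breaking long lines in grid_search_cross_validate.py and hyperparameter_search.py.
- Wrapping a long chained dictionary lookup and stripping trailing whitespace in project_score_save.py.
- Reformatting a long assert_called_once_with call in test_h2o_base_classifier.py; the asserted arguments are unchanged.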
diff --git a/ml_grid/model_classes/H2OBaseClassifier.py b/ml_grid/model_classes/H2OBaseClassifier.py
@@ -354,8 +354,7 @@ def _prepare_fit(
         train_df = pd.concat([X, y_series], axis=1)
         # Optimization: Provide destination_frame to avoid expensive gc.get_referrers() name search
         train_h2o = h2o.H2OFrame(
-            train_df,
-            destination_frame=f"train_{uuid.uuid4().hex}"
+            train_df, destination_frame=f"train_{uuid.uuid4().hex}"
         )
 
         # Explicitly convert the outcome column to factor
@@ -412,7 +411,9 @@ def _get_model_params(self) -> Dict[str, Any]:
             self._estimator_signature_cache[self.estimator_class] = inspect.signature(
                 self.estimator_class
             ).parameters
-        valid_param_keys = set(self._estimator_signature_cache[self.estimator_class].keys())
+        valid_param_keys = set(
+            self._estimator_signature_cache[self.estimator_class].keys()
+        )
 
         model_params = {
             key: value for key, value in all_params.items() if key in valid_param_keys
@@ -633,7 +634,7 @@ def predict(self, X: pd.DataFrame) -> np.ndarray:
                 X,
                 column_names=self.feature_names_,
                 column_types=col_types,
-                destination_frame=f"pred_{uuid.uuid4().hex}"
+                destination_frame=f"pred_{uuid.uuid4().hex}",
             )
 
             # Optimization: Use the temporary frame directly.
@@ -761,7 +762,7 @@ def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
                 X,
                 column_names=self.feature_names_,
                 column_types=col_types,
-                destination_frame=f"prob_{uuid.uuid4().hex}"
+                destination_frame=f"prob_{uuid.uuid4().hex}",
             )
         except Exception as e:
             raise RuntimeError(f"Failed to create H2O frame for prediction: {e}")
@@ -926,7 +927,7 @@ def _get_param_names(self):
                 for p in init_signature.parameters.values()
                 if p.name not in ("self", "args", "kwargs")
             ]
-        
+
         init_params = self._init_param_names_cache[cls]
 
         # Optimization: Use sets for O(1) lookup
diff --git a/ml_grid/pipeline/grid_search_cross_validate.py b/ml_grid/pipeline/grid_search_cross_validate.py
@@ -231,9 +231,13 @@ def __init__(
             self.logger.debug("Dropping 'client_idcode' from training data.")
             self.X_train = self.X_train.drop(columns=["client_idcode"], errors="ignore")
             if isinstance(self.X_test, pd.DataFrame):
-                self.X_test = self.X_test.drop(columns=["client_idcode"], errors="ignore")
+                self.X_test = self.X_test.drop(
+                    columns=["client_idcode"], errors="ignore"
+                )
             if isinstance(self.X_test_orig, pd.DataFrame):
-                self.X_test_orig = self.X_test_orig.drop(columns=["client_idcode"], errors="ignore")
+                self.X_test_orig = self.X_test_orig.drop(
+                    columns=["client_idcode"], errors="ignore"
+                )
 
         max_param_space_iter_value = (
             self.global_params.max_param_space_iter_value
@@ -292,7 +296,9 @@ def __init__(
         if "catboost" in method_name.lower() and hasattr(
             current_algorithm, "set_params"
         ):
-            ml_grid_object.logger.info("Silencing CatBoost verbose output and file writing.")
+            ml_grid_object.logger.info(
+                "Silencing CatBoost verbose output and file writing."
+            )
             current_algorithm.set_params(verbose=0, allow_writing_files=False)
 
         # Check for GPU availability and set device for torch-based models
@@ -563,7 +569,9 @@ def __init__(
                     # --- OPTIMIZATION: Force threading backend for search ---
                     # Prevents 'loky' overhead (abort_everything ~273s) which occurs even with n_jobs=1
                     with joblib.parallel_backend("threading"):
-                        current_algorithm = search.run_search(X_train_reset, y_train_search)
+                        current_algorithm = search.run_search(
+                            X_train_reset, y_train_search
+                        )
 
             except TimeoutError:
                 self.logger.warning("Timeout occurred during hyperparameter search.")
diff --git a/ml_grid/pipeline/hyperparameter_search.py b/ml_grid/pipeline/hyperparameter_search.py
@@ -250,7 +250,9 @@ def run_search(self, X_train: pd.DataFrame, y_train: pd.Series) -> BaseEstimator
                 y_train_reset = y_train_reset.values
 
         # Force integer encoding if possible to speed up unique() calls
-        if hasattr(y_train_reset, "dtype") and not pd.api.types.is_integer_dtype(y_train_reset):
+        if hasattr(y_train_reset, "dtype") and not pd.api.types.is_integer_dtype(
+            y_train_reset
+        ):
             try:
                 y_train_reset = y_train_reset.astype(int)
             except (ValueError, TypeError):
diff --git a/ml_grid/util/project_score_save.py b/ml_grid/util/project_score_save.py
@@ -185,7 +185,7 @@ def update_score_log(
             logger = logging.getLogger("ml_grid")
             logger.info("Writing grid permutation to log")
             # write line to best grid scores---------------------
-            
+
             # --- OPTIMIZATION: Construct dictionary first to avoid slow DataFrame element-wise setting ---
             row_data = {}
             column_list = _get_score_log_columns(list(global_params.metric_list.keys()))
@@ -253,7 +253,9 @@ def update_score_log(
                     for key_1 in ml_grid_object.local_param_dict.get("data"):
                         # print(key_1)
                         if key_1 in column_list:
-                            row_data[key_1] = ml_grid_object.local_param_dict.get("data").get(key_1)
+                            row_data[key_1] = ml_grid_object.local_param_dict.get(
+                                "data"
+                            ).get(key_1)
 
             current_f = ml_grid_object.final_column_list
             # current_f = list(self.X_test.columns)
diff --git a/tests/test_h2o_base_classifier.py b/tests/test_h2o_base_classifier.py
@@ -213,7 +213,10 @@ def test_predict_successful(
 
     # 2. Check that the new frame creation logic was called
     mock_h2o_frame.assert_called_once_with(
-        X, column_names=list(X.columns), column_types=classifier_instance.feature_types_, destination_frame=ANY
+        X,
+        column_names=list(X.columns),
+        column_types=classifier_instance.feature_types_,
+        destination_frame=ANY,
     )
 
     # Optimization: h2o.assign and h2o.get_frame should NO LONGER be called