Skip to content

Commit cb46423

Browse files
henry-wallace-phys (Henry Wallace)
authored
Feature oops scaling (#25)
* Bump version, update requirements * adds initial attempt at flow * add linter * Updates to let tensorflow based fitters run effectively * Updates to all ML files to make MCMC running possible' * Remove poorly implemented normalising flow work * MEGA UPDATE: MCMC added, fixed scaling, working NN structure * SPEEED * Finally fixed file I/O * some small typos, file IO works again * renable pca... * Remove unused files * Update cfg reader * Update Linter.yml --------- Co-authored-by: Henry Wallace <henryi@beluga4.int.ets1.calculquebec.ca> Co-authored-by: Henry Wallace <henryi@bg12102.int.ets1.calculquebec.ca> Co-authored-by: Henry Wallace <henryi@beluga2.int.ets1.calculquebec.ca> Co-authored-by: Henry Wallace <henryi@beluga3.int.ets1.calculquebec.ca>
1 parent 6a49b24 commit cb46423

7 files changed

Lines changed: 133 additions & 22 deletions

File tree

.github/workflows/Linter.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ name: Super Linter
44

55
on:
66
pull_request:
7-
branches: [develop]
7+
branches: [main]
88

99
jobs:
1010
super-linter:

src/MaCh3PythonUtils/config_reader/config_reader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,7 @@
1515

1616
from deepmerge import always_merger
1717

18-
class ConfigReader:
19-
18+
class ConfigReader:
2019
# Strictly unecessary but nice conceptually
2120
_file_handler = None
2221
_interface = None
@@ -256,6 +255,7 @@ def __call__(self) -> None:
256255
self._interface.run_likelihood_scan(self.__chain_settings["LikelihoodScanSettings"]["NDivisions"])
257256

258257
if self.__chain_settings["FileSettings"]["RunMCMC"] and self._interface is not None:
258+
print("WARNING: MCMC HAS ONLY BEEN TESTED WITH TENSORFLOW INTERFACES!")
259259

260260
mcmc = MCMCMultGPU(self._interface,
261261
self.__chain_settings["MCMCSettings"]["NChains"],

src/MaCh3PythonUtils/file_handling/chain_handler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,10 +160,10 @@ def convert_ttree_to_array(self, close_file=True)->None:
160160
with ThreadPoolExecutor() as executor:
161161
# Make sure we have loads of memory available!
162162
# Ensures we don't run into funny behaviour when uncompressing
163-
total_memory_needed = 6*self._posterior_ttree.uncompressed_bytes*(executor._max_workers) #in bytes
163+
total_memory_needed = self._posterior_ttree.uncompressed_bytes #in bytes
164164

165165
if self._verbose:
166-
print(f"Using {executor._max_workers} threads and requiring {6*np.round(self._posterior_ttree.uncompressed_bytes*1e-9,3)} Gb memory")
166+
print(f"Using {executor._max_workers} threads and requiring {np.round(self._posterior_ttree.uncompressed_bytes*1e-9,3)} Gb memory")
167167
print("Using the following branches: ")
168168
for i in self._plotting_branches:
169169
print(f" -> {i}")

src/MaCh3PythonUtils/fitters/adaption_handler_gpu.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ def update(self, new_data):
3131
"""
3232
self.count += 1
3333

34+
# Arbitary stopping point!
35+
if self.count>100000:
36+
return
37+
3438
# Update mean and covariance using the class method
3539
self.mean, self.covariance = self.update_covariance(new_data)
3640

src/MaCh3PythonUtils/fitters/multi_mcmc_gpu.py

Lines changed: 102 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
import emcee
1+
import numpy as np
2+
import tensorflow as tf
23
import numpy as np
34
import tensorflow as tf
45
from tensorflow import linalg as tfla
@@ -21,6 +22,17 @@ def __init__(self, interface: TfInterface, n_chains: int = 1024, circular_params
2122
self._n_chains = n_chains
2223

2324
# Initial states for all chains
25+
initial_state = tf.convert_to_tensor(np.zeros(self._n_dim), dtype=tf.float32)
26+
self._chain_states = tf.Variable(tf.tile(tf.expand_dims(initial_state, axis=0), [n_chains, 1]), dtype=tf.float32)
27+
28+
# boundaries
29+
self._upper_bounds = tf.convert_to_tensor(self._interface.scale_data(self._interface.chain.upper_bounds[:-1].reshape(1,-1)), dtype=tf.float32)
30+
self._lower_bounds = tf.convert_to_tensor(self._interface.scale_data(self._interface.chain.lower_bounds[:-1].reshape(1,-1)), dtype=tf.float32)
31+
32+
33+
self._circular_indices = self._get_circular_indices(circular_params)
34+
print(self._circular_indices)
35+
2436
initial_state = tf.convert_to_tensor(np.ones(self._n_dim), dtype=tf.float32)
2537
self._chain_states = tf.Variable(tf.tile(tf.expand_dims(initial_state, axis=0), [n_chains, 1]), dtype=tf.float32)
2638

@@ -47,6 +59,11 @@ def __init__(self, interface: TfInterface, n_chains: int = 1024, circular_params
4759
shapes=[(self._n_chains, self._n_dim)]
4860
)
4961

62+
def _get_circular_indices(self, circular_params: List[str]):
63+
"""Map circular params to indices in self._interface.chain.plot_branches."""
64+
return [self._interface.chain.plot_branches.index(param) for param in circular_params]
65+
66+
5067
def _estimate_batch_size(self):
5168
"""Estimate batch size based on memory available to this process."""
5269
step_size_in_bytes = self._n_chains * self._n_dim * tf.float32.size
@@ -74,6 +91,26 @@ def _calc_likelihood(self, states: tf.Tensor):
7491
def propose_step_gpu(self):
7592
# Propose new states for all chains
7693
proposed_states = self._matrix_handler.sample(self._n_chains) + self._chain_states
94+
95+
def apply_circular_bounds(idx):
96+
# Extract specific bounds for the circular parameter
97+
lower_bound = self._lower_bounds[0, idx]
98+
upper_bound = self._upper_bounds[0, idx]
99+
adjusted_values = lower_bound + tf.math.mod(proposed_states[:, idx] - upper_bound, upper_bound - lower_bound)
100+
return tf.tensor_scatter_nd_update(
101+
proposed_states,
102+
indices=[[chain_idx, idx] for chain_idx in range(self._n_chains)],
103+
updates=adjusted_values
104+
)
105+
106+
# Apply circular bounds to indices marked as circular
107+
for idx in self._circular_indices:
108+
proposed_states = apply_circular_bounds(idx)
109+
110+
111+
# Apply boundary conditions
112+
proposed_states = tf.where(proposed_states < self._lower_bounds, self._chain_states, proposed_states)
113+
proposed_states = tf.where(proposed_states > self._upper_bounds, self._chain_states, proposed_states)
77114

78115
# Calculate log-likelihoods for proposed states
79116
proposed_loglikelihoods = self._calc_likelihood(proposed_states)
@@ -133,24 +170,33 @@ def _flush_async(self, final_flush=False):
133170
steps_to_write = self._queue.dequeue_many(self._batch_size_steps)
134171
end_idx = self._current_step
135172

136-
self._dataset[:end_idx, :] = steps_to_write
173+
self._dataset[end_idx-len(steps_to_write):end_idx, :] = steps_to_write
174+
137175

138176
def save_mcmc_chain_to_pdf(self, filename: str, output_pdf: str):
139177
# Open the HDF5 file and read the chain
140178
with h5py.File(filename, 'r') as f:
141179
chain = f['chain'][:]
142180

181+
# Need it to reflect the actual parameters in our fit so let's combine everything!
182+
rescaled_chain = [self._interface.invert_scaling(chain[1000:,i]) for i in range(self._n_chains)]
183+
combined_rescaled_chain = np.concatenate(rescaled_chain, axis=0)
184+
143185
_, n_params = chain.shape[1:]
144186

145187
# Create a PdfPages object to save plots
188+
print("Plotting traces")
146189
with PdfPages(output_pdf) as pdf:
190+
191+
# Rescale the chain
192+
147193
for i in tqdm(range(n_params)):
148194
fig, ax = plt.subplots(figsize=(10, 6))
149195

150196
# Plot the chain for the i-th parameter
151-
unscaled_data = self._interface.invert_scaling(chain[:, 0, i])
152-
153-
ax.plot(unscaled_data, lw=0.5, label=f'Chain {i}')
197+
# unscaled_data = self._interface.invert_scaling(chain[:, 0, i])
198+
# for n, r in enumerate(rescaled_chain):
199+
ax.plot(rescaled_chain[0][:, i], lw=0.5, label=f'Chain 0')
154200
ax.set_ylabel(self._interface.chain.plot_branches[i])
155201
ax.set_title(f"Parameter {self._interface.chain.plot_branches[i]} MCMC Chain")
156202
ax.set_xlabel('Step')
@@ -159,27 +205,67 @@ def save_mcmc_chain_to_pdf(self, filename: str, output_pdf: str):
159205
pdf.savefig(fig)
160206
plt.close(fig) # Close the figure to save memory
161207

208+
209+
# Create a PdfPages object to save plots
210+
print("Plotting posteriors")
211+
with PdfPages(f"posterior_{output_pdf}") as pdf:
212+
213+
# Rescale the chain
214+
215+
for i in tqdm(range(n_params)):
216+
fig, ax = plt.subplots(figsize=(10, 6))
217+
218+
# Plot the chain for the i-th parameter
219+
# unscaled_data = self._interface.invert_scaling(chain[:, 0, i])
220+
l = self._interface.chain.lower_bounds[i]
221+
u = self._interface.chain.upper_bounds[i]
222+
bins = np.linspace(l, u, 100)
223+
224+
ax.hist(rescaled_chain[0][:, i], color='b', label="ML Pred", alpha=0.3, bins=bins, density=True)
225+
ax.hist(self._interface.test_data.iloc[10000:,i].to_numpy(), color='r', label="Real Result", alpha=0.3, bins=bins, density=True)
226+
227+
ax.set_xlabel(self._interface.chain.plot_branches[i])
228+
ax.set_title(f"Parameter {self._interface.chain.plot_branches[i]} MCMC Chain")
229+
230+
ax.legend()
231+
# Save the current figure to the PDF
232+
pdf.savefig(fig)
233+
plt.close(fig) # Close the figure to save memory
234+
235+
print("Plotting AC")
236+
with PdfPages(f"ac_{output_pdf}") as pdf:
237+
for i in tqdm(range(n_params)):
238+
fig, ax = plt.subplots(figsize=(10, 6))
239+
240+
# Plot the chain for the i-th parameter
241+
# unscaled_data = self._interface.invert_scaling(chain[:, 0, i])
242+
# for n, r in enumerate(rescaled_chain):
243+
ac = sm.tsa.acf(rescaled_chain[0][:, i], nlags=len(rescaled_chain[0][:, 1]))
244+
ax.plot(ac, lw=0.5, label=f'Chain 0')
245+
ax.set_ylabel(self._interface.chain.plot_branches[i])
246+
ax.set_title(f"Parameter {self._interface.chain.plot_branches[i]} MCMC Chain")
247+
ax.set_xlabel('Autocorrelation')
248+
249+
# Save the current figure to the PDF
250+
pdf.savefig(fig)
251+
plt.close(fig) # Close the figure to save memory
252+
162253
print(f"MCMC chain plots saved to {output_pdf}")
163254

164255
def __call__(self, n_steps, output_file_name: str):
165256
print(f"Running MCMC for {n_steps} steps with {self._n_chains} chains")
166257

167258
# Open the HDF5 file in append mode
168-
with h5py.File(output_file_name, 'a') as f:
259+
with h5py.File(output_file_name, 'w') as f:
169260
# Create or resize the dataset
170-
if 'chain' not in f:
171-
# If dataset doesn't exist, create it
172-
self._dataset = f.create_dataset('chain', (n_steps, self._n_chains, self._n_dim), chunks=True)
173-
else:
174-
# If dataset exists, resize it
175-
self._dataset = f['chain']
176-
self._dataset.resize((n_steps, self._n_chains, self._n_dim))
261+
if 'chain' in f:
262+
del f['chain'] # Delete if it already exists to avoid appending duplicate data
263+
264+
self._dataset = f.create_dataset('chain', (n_steps, self._n_chains, self._n_dim), chunks=True)
177265

178266
for _ in tqdm(range(n_steps)):
179267
self.propose_step()
180268

181269
# Ensure remaining steps are flushed to disk
182-
# self._flush_async(final_flush=True)
183-
184-
# Save the MCMC chain to PDF
270+
self._flush_async(final_flush=True)
185271
self.save_mcmc_chain_to_pdf(output_file_name, "traces.pdf")

src/MaCh3PythonUtils/machine_learning/file_ml_interface.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,16 @@ def training_data(self)->pd.DataFrame:
119119
:rtype: pd.DataFrame
120120
"""
121121
return self._training_data
122+
123+
@property
124+
def test_data(self)->pd.DataFrame:
125+
"""Gets training data
126+
127+
:return: Training data set
128+
:rtype: pd.DataFrame
129+
"""
130+
return self._test_data
131+
122132

123133
def add_model(self, ml_model: Any)->None:
124134
"""Add model to data set

src/MaCh3PythonUtils/machine_learning/tf_interface.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,14 @@ class TfInterface(FileMLInterface):
1111
"dropout": tf.keras.layers.Dropout,
1212
}
1313

14+
__TF_REGULARIZERS = {
15+
"l2" : tf.keras.regularizers.L2
16+
}
17+
1418
_layers = []
1519
_training_settings = {}
16-
20+
21+
1722

1823
def add_layer(self, layer_id: str, layer_args: dict):
1924
"""Add new layer to TF model
@@ -27,6 +32,12 @@ def add_layer(self, layer_id: str, layer_args: dict):
2732
if layer_id not in self.__TF_LAYER_IMPLEMENTATIONS.keys():
2833
raise ValueError(f"{layer_id} not implemented yet!")
2934

35+
if "kernel_regularizer" in layer_args.keys():
36+
# Hacky, swaps string value of regularliser for proper one
37+
reg = layer_args["kernel_regularizer"]
38+
reg_name = list(reg.keys())[0]
39+
layer_args["kernel_regularizer"] = self.__TF_REGULARIZERS[reg_name.lower()](reg[reg_name])
40+
3041
self._layers.append(self.__TF_LAYER_IMPLEMENTATIONS[layer_id.lower()](**layer_args))
3142

3243
def build_model(self, model_args: dict):

0 commit comments

Comments (0)