From 3497ff31661f7005d183dc7a1809f80d875c9646 Mon Sep 17 00:00:00 2001 From: dcajal Date: Mon, 2 Mar 2026 17:47:15 +0100 Subject: [PATCH 01/38] Add .gitignore --- .gitignore | 213 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f298c64 --- /dev/null +++ b/.gitignore @@ -0,0 +1,213 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ +data/ +graphs/ + +graphs +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock +#poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +#pdm.lock +#pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +#pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Cursor +# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to +# exclude from AI features like autocomplete and code analysis. 
Recommended for sensitive data +# refer to https://docs.cursor.com/context/ignore-files +.cursorignore +.cursorindexingignore + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ +results_summaryEEG_I0002.csv +results_summaryEEG_I0004.csv +results_summaryEEG_I0006.csv +results_summaryEEG_I0007.csv From 3fd3769cb0f06ea4efe66598f4d0c739204aad03 Mon Sep 17 00:00:00 2001 From: dcajal Date: Mon, 2 Mar 2026 17:47:20 +0100 Subject: [PATCH 02/38] Add AUTHORS.txt with contributor names and affiliations --- AUTHORS.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 AUTHORS.txt diff --git a/AUTHORS.txt b/AUTHORS.txt new file mode 100644 index 0000000..143148b --- /dev/null +++ b/AUTHORS.txt @@ -0,0 +1,4 @@ +Sofia Romagnoli - Universidad de Zaragoza +Diego Cajal - CIBER-BBN +Josseline Madrid - Universidad de Zaragoza +Rodrigo Lozano - Universidad de Zaragoza From 2952aa1042398832157a584946425e74aacf1467 Mon Sep 17 00:00:00 2001 From: dcajal Date: Mon, 2 Mar 2026 17:47:43 +0100 Subject: [PATCH 03/38] Add models from previous challenges --- models/lolai_models.py | 2214 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2214 insertions(+) create mode 100644 models/lolai_models.py diff --git a/models/lolai_models.py b/models/lolai_models.py new file mode 100644 index 0000000..c93eb98 --- /dev/null +++ b/models/lolai_models.py @@ -0,0 +1,2214 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +import torch.nn as nn +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +import matplotlib.pyplot as plt +from captum.attr import ( + IntegratedGradients, + LayerGradCam, + LayerAttribution, + Occlusion, + GradientShap +) + +from typing import Dict, List, Tuple, Union, Optional + + +class CNN_LSTM_Classifier(nn.Module): + def __init__(self, input_channels=3, hidden_dim=64, num_classes=3, dropout=0.3): + super(CNN_LSTM_Classifier, self).__init__() + + self.cnn = nn.Sequential( + 
nn.Conv1d(input_channels, 6, kernel_size=5, padding=2), + nn.BatchNorm1d(6), + nn.ReLU(), + nn.MaxPool1d(kernel_size=2), + nn.Dropout(dropout), + + nn.Conv1d(6, 9, kernel_size=3, padding=1), + nn.BatchNorm1d(9), + nn.ReLU(), + nn.MaxPool1d(kernel_size=2), + nn.Dropout(dropout), + + + nn.Conv1d(9, 18, kernel_size=3, padding=1), + nn.BatchNorm1d(18), + nn.ReLU(), + nn.MaxPool1d(kernel_size=2), + nn.Dropout(dropout) + ) + + self.lstm = nn.LSTM( + input_size=18, + hidden_size=hidden_dim, + batch_first=True, + bidirectional=True # Use bidirectional LSTM for better context + ) + + self.classifier = nn.Sequential( + nn.Linear(2 * hidden_dim, 64), # Adjust for bidirectional LSTM + nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(64, num_classes) + ) + + def forward(self, x): + # x: (batch, channels, time) + x = self.cnn(x) # (batch, features, time) + x = x.permute(0, 2, 1) # (batch, time, features) + _, (h_n, _) = self.lstm(x) # h_n: (num_layers * num_directions, batch, hidden_dim) + h_n = torch.cat((h_n[-2], h_n[-1]), dim=1) # Concatenate forward and backward states + out = self.classifier(h_n) # (batch, num_classes) + return out + +import torch +import torch.nn as nn +import torch.nn.functional as F +import matplotlib.pyplot as plt +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class CNN_LSTM_Classifier_XAI(nn.Module): + def __init__(self, input_channels=3, hidden_dim=32, num_classes=3, dropout=0.4): + super(CNN_LSTM_Classifier_XAI, self).__init__() + + self.cnn_activations = [] + self.lstm_activations = None + self.attention_weights = None + self.gradients = None + self.last_cnn_output = None + self.input = None + + # CNN + self.conv1 = nn.Conv1d(input_channels, 16, kernel_size=5, padding=2) + self.bn1 = nn.BatchNorm1d(16) + self.conv2 = nn.Conv1d(16, 32, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm1d(32) + self.conv3 = nn.Conv1d(32, 64, kernel_size=3, padding=1) + self.bn3 = nn.BatchNorm1d(64) + self.pool = 
nn.MaxPool1d(kernel_size=2) + self.dropout = nn.Dropout(dropout) + + # LSTM + self.lstm = nn.LSTM(input_size=64, hidden_size=hidden_dim, + batch_first=True, bidirectional=True) + + # Atención + self.attention = nn.Linear(2 * hidden_dim, 1) + + # Clasificador + self.classifier = nn.Sequential( + nn.Linear(2 * hidden_dim, 32), + nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(32, num_classes) + ) + + def activations_hook(self, grad): + self.gradients = grad + + def forward(self, x, return_attention=False, track_gradients=False): + self.input = x + + x = self.conv1(x) + x = self.bn1(x) + x = F.relu(x) + self.cnn_activations.append(x.detach()) + x = self.pool(x) + x = self.dropout(x) + + x = self.conv2(x) + x = self.bn2(x) + x = F.relu(x) + self.cnn_activations.append(x.detach()) + x = self.pool(x) + x = self.dropout(x) + + x = self.conv3(x) + x = self.bn3(x) + x = F.relu(x) + cnn_output = x + + if track_gradients and cnn_output.requires_grad: + cnn_output.register_hook(self.activations_hook) + + self.last_cnn_output = cnn_output # necesario para Grad-CAM + self.cnn_activations.append(cnn_output.detach()) + + x = self.pool(cnn_output) + x = self.dropout(x) + + x = x.permute(0, 2, 1) # (batch, time, features) + lstm_out, _ = self.lstm(x) + self.lstm_activations = lstm_out.detach() + + attention_scores = self.attention(lstm_out).squeeze(-1) + attention_weights = F.softmax(attention_scores, dim=1) + self.attention_weights = attention_weights.detach() + + context_vector = torch.bmm(attention_weights.unsqueeze(1), lstm_out).squeeze(1) + out = self.classifier(context_vector) + + if return_attention: + return out, attention_weights + return out + + def reset_activation_storage(self): + self.cnn_activations = [] + self.lstm_activations = None + self.attention_weights = None + self.gradients = None + self.last_cnn_output = None + self.input = None + + def interpret(self, x, class_idx=None): + self.reset_activation_storage() + + was_training = self.training + lstm_was_training = 
self.lstm.training + + self.eval() + self.lstm.train() # necesario para CuDNN backward + + x.requires_grad_() + logits, attention = self.forward(x, return_attention=True, track_gradients=True) + pred = torch.softmax(logits, dim=1) + + if class_idx is None: + class_idx = pred.argmax(dim=1) + + for i in range(x.shape[0]): + pred[i, class_idx[i]].backward(retain_graph=True) + + self.train(was_training) + self.lstm.train(lstm_was_training) + + feature_importance = self.get_feature_importance() + temporal_channel_importance = self.get_temporal_channel_importance() + channel_imp=self.get_channel_importance() + + self.input = None # limpieza para evitar problemas de memoria + torch.cuda.empty_cache() + + return { + 'prediction': pred.detach(), + 'class_idx': class_idx, + 'attention_weights': self.attention_weights, + 'feature_importance': feature_importance, + 'cnn_activations': self.cnn_activations, + 'temporal_channel_importance': temporal_channel_importance, + 'channel_importance': channel_imp + } + + def get_feature_importance(self): + """ + Grad-CAM temporal sobre la salida del último bloque CNN. + Devuelve tensor (batch, time) + """ + if self.gradients is None or self.last_cnn_output is None: + return None + + pooled_gradients = torch.mean(self.gradients, dim=[0, 2]) # (channels,) + cam = self.last_cnn_output.clone() + + for i in range(cam.shape[1]): + cam[:, i, :] *= pooled_gradients[i] + + heatmap = torch.mean(cam, dim=1).detach() # (batch, time) + return heatmap + + def get_channel_importance(self): + """ + Importancia por canal: (batch, channels) + """ + if self.input.grad is None: + raise ValueError("Gradientes de la entrada no están disponibles. Llama primero a interpret().") + return self.input.grad.abs().mean(dim=2) + + def get_temporal_channel_importance(self): + """ + Importancia canal-temporal: (batch, channels, time) + """ + if self.input.grad is None: + raise ValueError("Gradients of the input are not available. 
Call interpret() first.") + return self.input.grad.abs().detach() + + + + + + +class ContrastiveVAE(nn.Module): + def __init__(self, in_channels=4, latent_dim=32, lstm_hidden=64, n_classes=3, use_classifier=False): + super().__init__() + self.use_classifier = use_classifier + + # Encoder + self.encoder = nn.Sequential( + nn.Conv1d(in_channels, 32, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv1d(32, 64, kernel_size=3, padding=1), + nn.ReLU() + ) + + self.global_pool = nn.AdaptiveAvgPool1d(1) # for VAE path + self.fc_mu = nn.Linear(64, latent_dim) + self.fc_logvar = nn.Linear(64, latent_dim) + + # Decoder (for reconstruction) + self.decoder_input = nn.Linear(latent_dim, 64) + self.decoder = nn.Sequential( + nn.ConvTranspose1d(64, 32, kernel_size=3, padding=1), + nn.ReLU(), + nn.ConvTranspose1d(32, in_channels, kernel_size=3, padding=1) + ) + + # LSTM + Classifier always initialized (but optionally used) + self.lstm = nn.LSTM(input_size=64, hidden_size=lstm_hidden, batch_first=True, bidirectional=True) + self.classifier = nn.Sequential( + nn.Linear(lstm_hidden * 2, 64), + nn.ReLU(), + nn.Linear(64, n_classes) + ) + + def encode(self, x): + h = self.encoder(x) # (B, 64, T) + pooled = self.global_pool(h).squeeze(-1) # (B, 64) + mu = self.fc_mu(pooled) + logvar = self.fc_logvar(pooled) + return mu, logvar, h # h is (B, 64, T) + + def reparameterize(self, mu, logvar): + std = torch.exp(0.5 * logvar) + eps = torch.randn_like(std) + return mu + eps * std + + def decode(self, z, length): + # Mejora la reconstrucción con una capa de proyección inicial + h = self.decoder_input(z).unsqueeze(-1) # (B, 64, 1) + # Usar interpolación para un escalado más suave en lugar de expand + h = F.interpolate(h, size=length, mode='linear', align_corners=False) + x_recon = self.decoder(h) + return x_recon + + def forward(self, x): + B, C, T = x.shape + mu, logvar, features = self.encode(x) # features: (B, 64, T) + z = self.reparameterize(mu, logvar) + x_recon = self.decode(z, T) + + logits 
= None + if self.use_classifier: + features_t = features.permute(0, 2, 1) # (B, T, 64) + lstm_out, _ = self.lstm(features_t) # (B, T, 2*hidden) + lstm_feat = lstm_out.mean(dim=1) # (B, 2*hidden) + logits = self.classifier(lstm_feat) # (B, n_classes) + + return x_recon, mu, logvar, z, logits + + def get_latents(self, x, use_mean=True): + mu, logvar, _ = self.encode(x) + return mu if use_mean else self.reparameterize(mu, logvar) + + def classify(self, x): + """Forward through the classifier only (requires use_classifier = True).""" + assert self.use_classifier, "Classifier is not enabled. Set model.use_classifier = True before calling classify." + _, _, features = self.encode(x) + features_t = features.permute(0, 2, 1) + lstm_out, _ = self.lstm(features_t) + lstm_feat = lstm_out.mean(dim=1) + return self.classifier(lstm_feat) + + +def vae_loss(recon_x, x, mu, logvar): + recon_loss = F.mse_loss(recon_x, x, reduction='mean') + kl_div = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp()) + return recon_loss + kl_div, recon_loss, kl_div + + +def contrastive_loss(z, ids, temperature=0.1): + z = F.normalize(z, dim=1) + sim = torch.mm(z, z.T) / temperature + labels = ids.view(-1, 1) + mask = torch.eq(labels, labels.T).float().to(z.device) + mask = mask - torch.eye(len(z), device=z.device) + exp_sim = torch.exp(sim) * (1 - torch.eye(len(z), device=z.device)) + log_prob = sim - torch.log(exp_sim.sum(1, keepdim=True) + 1e-8) + mean_log_prob_pos = (mask * log_prob).sum(1) / (mask.sum(1) + 1e-8) + return -mean_log_prob_pos.mean() + + +def intra_patient_loss(latents, patient_ids): + """Versión optimizada que evita bucles explícitos""" + # Convertir IDs a tensor si no lo son ya + if not isinstance(patient_ids, torch.Tensor): + patient_ids = torch.tensor(patient_ids, device=latents.device) + + # Crear matriz de similaridad de pacientes (1 donde son iguales) + patient_sim = (patient_ids.unsqueeze(1) == patient_ids.unsqueeze(0)).float() + # Quitar diagonal (mismo ejemplo) + mask 
= patient_sim - torch.eye(len(latents), device=latents.device) + # Calcular distancias entre latentes + latent_dists = torch.cdist(latents, latents, p=2) + # Aplicar máscara y promediar + valid_pairs = mask.sum() + if valid_pairs > 0: + return (mask * latent_dists).sum() / valid_pairs + return torch.tensor(0.0, device=latents.device) + + +def training_step(model, batch1, batch2, patient_ids, optimizer, alpha=0.1, beta=1.0): + model.train() + x1, x2 = batch1, batch2 + x1_recon, mu1, logvar1, z1, _ = model(x1) + x2_recon, mu2, logvar2, z2, _ = model(x2) + + recon1, r1, kl1 = vae_loss(x1_recon, x1, mu1, logvar1) + recon2, r2, kl2 = vae_loss(x2_recon, x2, mu2, logvar2) + vae_total = (recon1 + recon2) / 2 + + z_all = torch.cat([z1, z2], dim=0) + # Asegurar que IDs sean tensores + ids = torch.arange(len(z1), device=z1.device).repeat(2) + contrastive = contrastive_loss(z_all, ids) + + # Extender patient_ids correctamente + if isinstance(patient_ids, torch.Tensor): + p_ids = torch.cat([patient_ids, patient_ids], dim=0) + else: + p_ids = patient_ids + patient_ids # Si es una lista + + patient_reg = intra_patient_loss(z_all, p_ids) + + total = vae_total + alpha * contrastive + beta * patient_reg + + optimizer.zero_grad() + total.backward() + optimizer.step() + + return { + 'total_loss': total.item(), + 'recon_loss': vae_total.item(), + 'contrastive': contrastive.item(), + 'patient_reg': patient_reg.item(), + 'kl_loss': (kl1 + kl2).item() / 2 + } + + +def fine_tune_step(model, x, y, optimizer, criterion=nn.CrossEntropyLoss()): + model.train() + model.use_classifier = True + logits = model.classify(x) + loss = criterion(logits, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + preds = torch.argmax(logits, dim=1) + acc = (preds == y).float().mean().item() + + return { + 'classification_loss': loss.item(), + 'accuracy': acc + } + +class ImprovedPainClassifier(nn.Module): + def __init__(self, input_channels=3, hidden_dim=128, num_classes=3, dropout=0.4): + 
super(ImprovedPainClassifier, self).__init__() + + # Increased regularization and feature extraction for small datasets + self.cnn = nn.Sequential( + # Layer 1: More filters to capture diverse patterns + nn.Conv1d(input_channels, 64, kernel_size=3, padding=1), + nn.BatchNorm1d(64), + nn.LeakyReLU(0.1), # LeakyReLU helps with gradient flow + nn.MaxPool1d(kernel_size=2), + + # Layer 2: Increased complexity + nn.Conv1d(64, 128, kernel_size=3, padding=1), + nn.BatchNorm1d(128), + nn.LeakyReLU(0.1), + nn.MaxPool1d(kernel_size=2), + nn.Dropout(dropout), + + # Layer 3: Additional layer for better feature extraction + nn.Conv1d(128, 128, kernel_size=3, padding=1), + nn.BatchNorm1d(128), + nn.LeakyReLU(0.1), + nn.Dropout(dropout) + ) + + # Attention mechanism to focus on important temporal patterns + self.attention = nn.Sequential( + nn.Linear(256, 64), + nn.Tanh(), + nn.Linear(64, 1) + ) + + # Bidirectional LSTM with residual connections + self.lstm = nn.LSTM( + input_size=128, + hidden_size=hidden_dim, + num_layers=2, # Multiple layers for complex temporal patterns + batch_first=True, + bidirectional=True, + dropout=dropout # Apply dropout between LSTM layers + ) + + # Classifier with additional regularization + self.classifier = nn.Sequential( + nn.Linear(2 * hidden_dim, hidden_dim), + nn.BatchNorm1d(hidden_dim), # Normalize activations + nn.LeakyReLU(0.1), + nn.Dropout(dropout), + nn.Linear(hidden_dim, hidden_dim // 2), + nn.LeakyReLU(0.1), + nn.Dropout(dropout), + nn.Linear(hidden_dim // 2, num_classes) + ) + + def forward(self, x): + # x: (batch, channels, time) - BVP, EDA, and respiratory signals + + # Extract features with CNN + cnn_out = self.cnn(x) # (batch, 128, time') + + # Reshape for LSTM + cnn_out = cnn_out.permute(0, 2, 1) # (batch, time', 128) + + # Process with LSTM + lstm_out, (h_n, _) = self.lstm(cnn_out) # lstm_out: (batch, time', 2*hidden_dim) + + # Apply attention to focus on relevant parts of the signal + attn_weights = 
self.attention(lstm_out).softmax(dim=1) # (batch, time', 1) + context = torch.sum(attn_weights * lstm_out, dim=1) # (batch, 2*hidden_dim) + + # Alternative: Use concatenated hidden states from both directions + # h_n = torch.cat((h_n[-2], h_n[-1]), dim=1) # (batch, 2*hidden_dim) + + # Classify + out = self.classifier(context) # (batch, num_classes) + return out + + +import matplotlib.pyplot as plt +import numpy as np +import torch + +class ExplainabilityVisualizer: + def __init__(self, channel_names=None): + """ + channel_names: lista opcional con nombres de los canales de entrada + """ + self.channel_names = channel_names + + def plot_attention_weights(self, attention_weights, title="Atención temporal"): + attention = attention_weights.squeeze().cpu().numpy() + plt.figure(figsize=(10, 2)) + plt.plot(attention) + plt.title(title) + plt.xlabel("Timestep") + plt.ylabel("Weight") + plt.grid(True) + plt.tight_layout() + plt.show() + + def plot_gradcam_heatmap(self, heatmap, title="Grad-CAM temporal"): + heat = heatmap.squeeze().cpu().numpy() + plt.figure(figsize=(10, 2)) + plt.plot(heat) + plt.title(title) + plt.xlabel("Timestep") + plt.ylabel("Importance") + plt.grid(True) + plt.tight_layout() + plt.show() + + def plot_channel_importance(self, channel_importance, title="Importancia por canal"): + values = channel_importance.squeeze().cpu().numpy() + channels = self.channel_names if self.channel_names else [f"Channel {i}" for i in range(len(values))] + plt.figure(figsize=(6, 3)) + plt.bar(channels, values) + plt.title(title) + plt.ylabel("Importancia media") + plt.xticks(rotation=45) + plt.grid(axis='y') + plt.tight_layout() + plt.show() + + def plot_signals_with_attention_highlight(self, x, importance, threshold=0.85, title="Señal con zonas de atención"): + """ + Dibuja las señales multicanal y sombreado rojo donde la importancia temporal supera el umbral. 
+ + Args: + x: Tensor (channels, time) + importance: Tensor (time,) + threshold: percentil (0-1) o valor absoluto + title: título del gráfico + """ + x = x.detach().cpu().numpy() + importance = importance.detach().cpu().numpy() + time = np.arange(x.shape[1]) + n_channels = x.shape[0] + + if threshold <= 1.0: + threshold_value = np.quantile(importance, threshold) + else: + threshold_value = threshold + + high_attention_mask = importance >= threshold_value + + fig, axs = plt.subplots(n_channels, 1, figsize=(12, 2.5 * n_channels), sharex=True) + if n_channels == 1: + axs = [axs] + + for i in range(n_channels): + axs[i].plot(time, x[i], label=self.channel_names[i] if self.channel_names else f"Canal {i}", color="black") + axs[i].set_ylabel("Valor") + axs[i].grid(True) + + in_high = False + start = 0 + for t in range(len(high_attention_mask)): + if high_attention_mask[t] and not in_high: + start = t + in_high = True + elif not high_attention_mask[t] and in_high: + axs[i].axvspan(start, t, color='red', alpha=0.25) + in_high = False + if in_high: + axs[i].axvspan(start, len(high_attention_mask), color='red', alpha=0.25) + + axs[i].legend(loc="upper right") + + axs[-1].set_xlabel("Tiempo (muestras)") + plt.suptitle(title) + plt.tight_layout() + plt.show() + + + +class FocalLoss(nn.Module): + """ + Focal Loss para clasificación binaria y multiclase. + + Parámetros: + - alpha: Factor de ponderación para manejar desequilibrio de clases. + Puede ser un escalar (mismo valor para todas las clases) o + un tensor (valores específicos por clase). + - gamma: Factor de modulación para enfocar en ejemplos difíciles (>= 0). + - reduction: 'none' | 'mean' | 'sum' + - eps: Pequeño valor para estabilidad numérica + + Referencias: + - Paper original: "Focal Loss for Dense Object Detection" por Lin et al. 
+ """ + def __init__(self, alpha=0.25, gamma=2.0, reduction='mean', eps=1e-6): + super(FocalLoss, self).__init__() + self.alpha = alpha + self.gamma = gamma + self.reduction = reduction + self.eps = eps + + def forward(self, inputs, targets): + """ + Args: + inputs: Logits de forma [B, C] donde B es el tamaño del batch y C es el número de clases. + Para clasificación binaria, C puede ser 1. + targets: Etiquetas de objetivos de forma [B] para multiclase o [B, 1] para binaria. + Valores enteros para multiclase (clases indexadas desde 0 a C-1). + Valores continuos entre 0 y 1 para binaria. + """ + # Determinar si es clasificación binaria o multiclase + if inputs.shape[1] == 1 or inputs.shape[1] == 2: # Binaria + # Aplicar sigmoide para obtener probabilidades + probs = torch.sigmoid(inputs.view(-1)) + targets = targets.view(-1) + + # Calcular pt (probabilidad del objetivo correcto) + pt = probs * targets + (1 - probs) * (1 - targets) + + # Aplicar factores de ponderación + if isinstance(self.alpha, (float, int)): + alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets) + else: + # Si alpha es un tensor, usar indexación + alpha_t = self.alpha if self.alpha is not None else torch.ones_like(pt) + + # Calcular la focal loss + focal_weight = (1 - pt).pow(self.gamma) + loss = -alpha_t * focal_weight * torch.log(pt.clamp(min=self.eps)) + + else: # Multiclase + # Convertir logits a distribución de probabilidad + log_softmax = F.log_softmax(inputs, dim=1) + + # Obtener log probabilidad para las clases objetivo + targets = targets.view(-1, 1) + log_pt = log_softmax.gather(1, targets).view(-1) + pt = log_pt.exp() # Obtener probabilidades + + # Aplicar factores de ponderación + if isinstance(self.alpha, (list, tuple, torch.Tensor)): + # Si alpha es específico por clase + alpha = torch.tensor(self.alpha, device=inputs.device) + alpha_t = alpha.gather(0, targets.view(-1)) + else: + alpha_t = self.alpha if self.alpha is not None else 1.0 + + # Calcular focal loss + 
focal_weight = (1 - pt).pow(self.gamma) + loss = -alpha_t * focal_weight * log_pt + + # Aplicar reduction + if self.reduction == 'mean': + return loss.mean() + elif self.reduction == 'sum': + return loss.sum() + else: # 'none' + return loss + + + + +class CNN_LSTM_Classifier_XAI_2(nn.Module): + def __init__(self, input_channels=3, hidden_dim=32, num_classes=3, dropout=0.1): + super(CNN_LSTM_Classifier_XAI_2, self).__init__() + + self.cnn_activations = [] + self.lstm_activations = None + self.attention_weights = None + self.gradients = None + self.last_cnn_output = None + self.input = None + self.input_channels = input_channels + + # CNN + self.conv1 = nn.Conv1d(input_channels, 16, kernel_size=5, padding=2) + self.bn1 = nn.BatchNorm1d(16) + self.conv2 = nn.Conv1d(16, 32, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm1d(32) + self.conv3 = nn.Conv1d(32, 64, kernel_size=3, padding=1) + self.bn3 = nn.BatchNorm1d(64) + self.pool = nn.MaxPool1d(kernel_size=2) + self.dropout = nn.Dropout(dropout) + + # LSTM + self.lstm = nn.LSTM(input_size=64, hidden_size=hidden_dim, + batch_first=True, bidirectional=True) + + # Attention + self.attention = nn.Linear(2 * hidden_dim, 1) + + # Classifier + self.classifier = nn.Sequential( + nn.Linear(2 * hidden_dim, 32), + nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(32, num_classes) + ) + + def activations_hook(self, grad): + self.gradients = grad + + + + def forward(self, x, return_attention=False, track_gradients=False): + # Reset activation storage at the beginning of each forward pass + self.reset_activation_storage() + + self.input = x + + x = self.conv1(x) + x = self.bn1(x) + x = F.relu(x) + self.cnn_activations.append(x.detach()) + x = self.pool(x) + x = self.dropout(x) + + x = self.conv2(x) + x = self.bn2(x) + x = F.relu(x) + self.cnn_activations.append(x.detach()) + x = self.pool(x) + x = self.dropout(x) + + x = self.conv3(x) + x = self.bn3(x) + x = F.relu(x) + cnn_output = x + + if track_gradients and 
cnn_output.requires_grad: + cnn_output.register_hook(self.activations_hook) + + self.last_cnn_output = cnn_output # needed for Grad-CAM + self.cnn_activations.append(cnn_output.detach()) + + x = self.pool(cnn_output) + x = self.dropout(x) + + x = x.permute(0, 2, 1) # (batch, time, features) + lstm_out, (h_n, c_n) = self.lstm(x) + self.lstm_activations = lstm_out.detach() + + attention_scores = self.attention(lstm_out).squeeze(-1) + attention_weights = F.softmax(attention_scores, dim=1) + self.attention_weights = attention_weights.detach() + + context_vector = torch.bmm(attention_weights.unsqueeze(1), lstm_out).squeeze(1) + out = self.classifier(context_vector) + + if return_attention: + return out, attention_weights + return out + + def reset_activation_storage(self): + self.cnn_activations = [] + self.lstm_activations = None + self.attention_weights = None + self.gradients = None + self.last_cnn_output = None + + def interpret(self, x, class_idx=None, methods=None): + """ + Enhanced interpretation method with multiple explainability techniques + + Args: + x: Input data tensor + class_idx: Target class indices to explain (defaults to predicted class) + methods: List of methods to use, options: ['gradcam', 'integrated_gradients', + 'occlusion', 'shap', 'feature_ablation', 'all'] + + Returns: + Dictionary with various interpretability outputs + """ + if methods is None: + methods = ['gradcam', 'attention'] # Default methods + if 'all' in methods: + methods = ['gradcam', 'integrated_gradients', 'occlusion', 'shap', + 'feature_ablation', 'attention', 'layer_importance'] + + # Store original training state + was_training = self.training + lstm_was_training = self.lstm.training + + # Set model to evaluation mode for interpretability + self.eval() + self.lstm.train() # needed for CuDNN backward compatibility + + # Base prediction + x.requires_grad_() + self.input = x # Store input for interpretability methods + + logits, attention = self.forward(x, return_attention=True, 
track_gradients=True) + pred = torch.softmax(logits, dim=1) + + if class_idx is None: + class_idx = pred.argmax(dim=1) + + # Initialize results dictionary + results = { + 'prediction': pred.detach(), + 'class_idx': class_idx, + 'attention_weights': self.attention_weights, + } + + # Apply selected interpretability methods + if 'gradcam' in methods: + for i in range(x.shape[0]): + pred[i, class_idx[i]].backward(retain_graph=True if i < x.shape[0]-1 else False) + + results['feature_importance'] = self.get_feature_importance() + results['temporal_channel_importance'] = self.get_temporal_channel_importance() + results['channel_importance'] = self.get_channel_importance() + results['cnn_activations'] = self.cnn_activations + + # Integrated Gradients + if 'integrated_gradients' in methods: + ig = IntegratedGradients(self.forward_wrapper) + results['integrated_gradients'] = self._compute_integrated_gradients( + ig, x, class_idx) + + # Occlusion analysis + if 'occlusion' in methods: + occlusion = Occlusion(self.forward_wrapper) + results['occlusion'] = self._compute_occlusion(occlusion, x, class_idx) + + # SHAP (GradientSHAP implementation) + if 'shap' in methods: + gradient_shap = GradientShap(self.forward_wrapper) + results['gradient_shap'] = self._compute_gradient_shap(gradient_shap, x, class_idx) + + # Feature ablation (sensitivity analysis) + if 'feature_ablation' in methods: + results['feature_ablation'] = self._feature_ablation_analysis(x, class_idx) + + # Layer importance analysis + if 'layer_importance' in methods: + results['layer_importance'] = self._compute_layer_importance(x, class_idx) + + # Restore original training states + self.train(was_training) + self.lstm.train(lstm_was_training) + + # Clean up to avoid memory issues + self.input = None + torch.cuda.empty_cache() + + return results + + def forward_wrapper(self, x): + """Wrapper for Captum compatibility""" + return self.forward(x) + + def get_feature_importance(self): + """ + Grad-CAM temporal over the 
output of the last CNN block. + Returns tensor (batch, time) + """ + if self.gradients is None or self.last_cnn_output is None: + return None + + pooled_gradients = torch.mean(self.gradients, dim=[0, 2]) # (channels,) + cam = self.last_cnn_output.clone() + + for i in range(cam.shape[1]): + cam[:, i, :] *= pooled_gradients[i] + + heatmap = torch.mean(cam, dim=1).detach() # (batch, time) + + # Apply ReLU to highlight only positive influences + heatmap = F.relu(heatmap) + + # Normalize heatmap for better visualization + if heatmap.max() > 0: + heatmap = heatmap / heatmap.max() + + return heatmap + + def get_channel_importance(self): + """ + Channel importance: (batch, channels) + """ + if self.input is None or self.input.grad is None: + raise ValueError("Input gradients not available. Call interpret() first.") + return self.input.grad.abs().mean(dim=2).detach() + + def get_temporal_channel_importance(self): + """ + Temporal-channel importance: (batch, channels, time) + """ + if self.input is None or self.input.grad is None: + raise ValueError("Input gradients not available. 
Call interpret() first.") + return self.input.grad.abs().detach() + + def _compute_integrated_gradients(self, ig, x, class_idx): + """Compute integrated gradients attribution""" + batch_size = x.shape[0] + attributions = [] + + for i in range(batch_size): + baseline = torch.zeros_like(x[i:i+1]) + attr = ig.attribute( + x[i:i+1], baseline, target=class_idx[i].item(), n_steps=50 + ) + attributions.append(attr) + + return torch.cat(attributions).detach() + + def _compute_occlusion(self, occlusion_algo, x, class_idx): + """Compute occlusion-based feature attribution""" + batch_size = x.shape[0] + attributions = [] + + # Define sliding window parameters for temporal data + window_size = min(5, x.shape[2] // 4) # Adapt window size to input length + + for i in range(batch_size): + attr = occlusion_algo.attribute( + x[i:i+1], + sliding_window_shapes=(1, window_size), + target=class_idx[i].item(), + strides=(1, max(1, window_size // 2)) + ) + attributions.append(attr) + + return torch.cat(attributions).detach() + + def _compute_gradient_shap(self, shap_algo, x, class_idx): + """Compute GradientSHAP attributions""" + batch_size = x.shape[0] + attributions = [] + + for i in range(batch_size): + # Create random baselines (typically 10-50 for good estimates) + baselines = torch.randn(10, *x[i:i+1].shape[1:]) * 0.001 + + # Ensure baselines device matches input + baselines = baselines.to(x.device) + + attr = shap_algo.attribute( + x[i:i+1], baselines=baselines, target=class_idx[i].item() + ) + attributions.append(attr) + + return torch.cat(attributions).detach() + + def _feature_ablation_analysis(self, x, class_idx): + """Analyze model by systematically ablating input features""" + batch_size = x.shape[0] + results = [] + + for i in range(batch_size): + # Store original prediction + with torch.no_grad(): + orig_output = self.forward(x[i:i+1]) + orig_prob = torch.softmax(orig_output, dim=1)[0, class_idx[i]].item() + + # Test ablation of each channel + channel_importance = [] + for 
c in range(self.input_channels): + # Create ablated input (zero out one channel) + ablated_input = x[i:i+1].clone() + ablated_input[:, c, :] = 0 + + # Get prediction on ablated input + with torch.no_grad(): + ablated_output = self.forward(ablated_input) + ablated_prob = torch.softmax(ablated_output, dim=1)[0, class_idx[i]].item() + + # Impact is reduction in probability + channel_impact = orig_prob - ablated_prob + channel_importance.append(channel_impact) + + results.append(torch.tensor(channel_importance)) + + return torch.stack(results) + + def _compute_layer_importance(self, x, class_idx): + """Compute importance of each layer using Layer GradCAM""" + batch_size = x.shape[0] + layer_importance = {} + + # Define layers to analyze + layers = { + 'conv1': self.conv1, + 'conv2': self.conv2, + 'conv3': self.conv3 + } + + for layer_name, layer in layers.items(): + layer_gradcam = LayerGradCam(self.forward_wrapper, layer) + layer_attrs = [] + + for i in range(batch_size): + attr = layer_gradcam.attribute( + x[i:i+1], target=class_idx[i].item() + ) + # Process attribution to create a single importance score per sample + pooled_attr = torch.mean(attr, dim=1) + + layer_attrs.append(pooled_attr) + + layer_importance[layer_name] = torch.cat(layer_attrs).detach() + + return layer_importance + + def visualize_attributions(self, sample_idx, interpretations, time_axis=None, + channel_names=None, class_names=None): + """ + Visualize the various interpretation results + + Args: + sample_idx: Index of the sample to visualize + interpretations: Dictionary returned by interpret() method + time_axis: Optional array/list with time points for x-axis + channel_names: Optional list of channel names + class_names: Optional list of class names + """ + if not channel_names: + channel_names = [f'Channel {i}' for i in range(self.input_channels)] + + if not class_names: + class_idx = interpretations['class_idx'][sample_idx].item() + class_name = f'Class {class_idx}' + else: + class_idx = 
interpretations['class_idx'][sample_idx].item() + class_name = class_names[class_idx] + + # Set up figure + plt.figure(figsize=(15, 12)) + + # Original input visualization (top row, first column) + plt.subplot(3, 3, 1) + if self.input is not None: + input_data = self.input[sample_idx].cpu().detach().numpy() + if time_axis is not None: + for i in range(input_data.shape[0]): + plt.plot(time_axis, input_data[i], label=channel_names[i]) + else: + for i in range(input_data.shape[0]): + plt.plot(input_data[i], label=channel_names[i]) + plt.legend(loc='best') + plt.title('Input Signal') + plt.xlabel('Time') + plt.ylabel('Value') + + # GradCAM feature importance (top row, second column) + if 'feature_importance' in interpretations and interpretations['feature_importance'] is not None: + plt.subplot(3, 3, 2) + heatmap = interpretations['feature_importance'][sample_idx].cpu().numpy() + if time_axis is not None: + plt.plot(time_axis, heatmap) + else: + plt.plot(heatmap) + plt.title('GradCAM Feature Importance') + plt.xlabel('Time') + plt.ylabel('Importance') + + # Attention weights (top row, third column) + if 'attention_weights' in interpretations and interpretations['attention_weights'] is not None: + plt.subplot(3, 3, 3) + attention = interpretations['attention_weights'][sample_idx].cpu().numpy() + + if time_axis is not None: + # Need to match attention time axis to input time axis + # (account for pooling in the network) + x_points = np.linspace(time_axis[0], time_axis[-1], len(attention)) + plt.plot(x_points, attention) + else: + plt.plot(attention) + plt.title('Attention Weights') + plt.xlabel('Time') + plt.ylabel('Attention') + + # Channel importance (middle row, first column) + if 'channel_importance' in interpretations and interpretations['channel_importance'] is not None: + plt.subplot(3, 3, 4) + ch_importance = interpretations['channel_importance'][sample_idx].cpu().numpy() + plt.bar(channel_names, ch_importance) + plt.title('Channel Importance') + 
plt.ylabel('Importance') + plt.xticks(rotation=45) + + # Integrated Gradients (middle row, second column) + if 'integrated_gradients' in interpretations: + plt.subplot(3, 3, 5) + ig_attr = interpretations['integrated_gradients'][sample_idx].cpu().numpy() + ig_attr_mean = np.mean(ig_attr, axis=0) # Average across channels for visualization + + if time_axis is not None: + plt.plot(time_axis, ig_attr_mean) + else: + plt.plot(ig_attr_mean) + plt.title('Integrated Gradients') + plt.xlabel('Time') + plt.ylabel('Attribution') + + # Feature Ablation (middle row, third column) + if 'feature_ablation' in interpretations: + plt.subplot(3, 3, 6) + ablation_scores = interpretations['feature_ablation'][sample_idx].cpu().numpy() + plt.bar(channel_names, ablation_scores) + plt.title('Feature Ablation Impact') + plt.ylabel('Probability Change') + plt.xticks(rotation=45) + + # SHAP values (bottom row, first column) + if 'gradient_shap' in interpretations: + plt.subplot(3, 3, 7) + shap_attr = interpretations['gradient_shap'][sample_idx].cpu().numpy() + # Visualize average SHAP value over time + shap_avg = np.mean(shap_attr, axis=0) + + if time_axis is not None: + plt.plot(time_axis, shap_avg) + else: + plt.plot(shap_avg) + plt.title('GradientSHAP Values') + plt.xlabel('Time') + plt.ylabel('SHAP Value') + + # Occlusion analysis (bottom row, second column) + if 'occlusion' in interpretations: + plt.subplot(3, 3, 8) + occlusion_attr = interpretations['occlusion'][sample_idx].cpu().numpy() + occlusion_avg = np.mean(occlusion_attr, axis=0) + + if time_axis is not None: + plt.plot(time_axis, occlusion_avg) + else: + plt.plot(occlusion_avg) + plt.title('Occlusion Analysis') + plt.xlabel('Time') + plt.ylabel('Attribution') + + # Prediction summary (bottom row, third column) + plt.subplot(3, 3, 9) + pred_probs = interpretations['prediction'][sample_idx].cpu().numpy() + classes = list(range(len(pred_probs))) + if class_names: + classes = class_names + plt.bar(classes, pred_probs) + 
plt.title(f'Prediction: {class_name}') + plt.ylabel('Probability') + plt.ylim([0, 1]) + + plt.tight_layout() + return plt.gcf() + + def generate_interpretation_report(self, input_data, class_idx=None, + channel_names=None, class_names=None, + time_axis=None, methods='all'): + """ + Generate a comprehensive interpretation report for the given input + + Args: + input_data: Input tensor to analyze + class_idx: Target class indices (optional) + channel_names: Names of input channels (optional) + class_names: Names of output classes (optional) + time_axis: Time points for x-axis (optional) + methods: Explainability methods to use + + Returns: + Dictionary containing interpretations and visualization figure + """ + # Run all interpretation methods + interpretations = self.interpret(input_data, class_idx, methods=methods) + + # Generate visualizations for each sample + figures = [] + for i in range(input_data.shape[0]): + fig = self.visualize_attributions( + i, interpretations, + time_axis=time_axis, + channel_names=channel_names, + class_names=class_names + ) + figures.append(fig) + plt.close(fig) # Close to avoid display in notebooks + + return { + 'interpretations': interpretations, + 'figures': figures + } + + + +class CNN_LSTM_Classifier_XAI_2(nn.Module): + def __init__( + self, + input_channels=3, + num_classes=3, + cnn_channels=(16, 32, 64), + kernel_sizes=(5, 3, 3), + pool_type="max", # or 'avg' + dropout=0.1, + lstm_hidden_dim=32, + lstm_num_layers=1, + bidirectional=True, + classifier_hidden_dim=32, + attention_dim=None, # None = default: 2 * lstm_hidden_dim + ): + super(CNN_LSTM_Classifier_XAI_2, self).__init__() + + self.input_channels = input_channels + self.pool_type = pool_type + self.dropout_rate = dropout + self.bidirectional = bidirectional + self.num_directions = 2 if bidirectional else 1 + + self.cnn_activations = [] + self.lstm_activations = None + self.attention_weights = None + self.gradients = None + self.last_cnn_output = None + self.input = None 
+ + # CNN Blocks + self.conv1 = nn.Conv1d(input_channels, cnn_channels[0], kernel_size=kernel_sizes[0], padding=kernel_sizes[0] // 2) + self.bn1 = nn.BatchNorm1d(cnn_channels[0]) + + self.conv2 = nn.Conv1d(cnn_channels[0], cnn_channels[1], kernel_size=kernel_sizes[1], padding=kernel_sizes[1] // 2) + self.bn2 = nn.BatchNorm1d(cnn_channels[1]) + + self.conv3 = nn.Conv1d(cnn_channels[1], cnn_channels[2], kernel_size=kernel_sizes[2], padding=kernel_sizes[2] // 2) + self.bn3 = nn.BatchNorm1d(cnn_channels[2]) + + self.pool = nn.MaxPool1d(kernel_size=2) if pool_type == "max" else nn.AvgPool1d(kernel_size=2) + self.dropout = nn.Dropout(dropout) + + # LSTM + self.lstm = nn.LSTM( + input_size=cnn_channels[2], + hidden_size=lstm_hidden_dim, + num_layers=lstm_num_layers, + batch_first=True, + bidirectional=bidirectional + ) + + # Attention + attention_dim = attention_dim or self.num_directions * lstm_hidden_dim + self.attention = nn.Linear(self.num_directions * lstm_hidden_dim, 1) + + # Classifier + self.classifier = nn.Sequential( + nn.Linear(self.num_directions * lstm_hidden_dim, classifier_hidden_dim), + nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(classifier_hidden_dim, num_classes) + ) + + def activations_hook(self, grad): + self.gradients = grad + + def reset_activation_storage(self): + self.cnn_activations = [] + self.lstm_activations = None + self.attention_weights = None + self.gradients = None + self.last_cnn_output = None + + def forward(self, x, return_attention=False, track_gradients=False): + self.reset_activation_storage() + self.input = x + + x = self.pool(F.relu(self.bn1(self.conv1(x)))) + self.cnn_activations.append(x.detach()) + x = self.dropout(x) + + x = self.pool(F.relu(self.bn2(self.conv2(x)))) + self.cnn_activations.append(x.detach()) + x = self.dropout(x) + + x = F.relu(self.bn3(self.conv3(x))) + cnn_output = x + + if track_gradients and cnn_output.requires_grad: + cnn_output.register_hook(self.activations_hook) + + self.last_cnn_output = cnn_output + 
class CNN_LSTM_Classifier_Tunable(nn.Module):
    """CNN -> LSTM -> attention -> MLP classifier with tunable hyperparameters."""

    def __init__(
        self,
        config: Optional[Dict] = None,
        input_channels: int = 3,
        seq_length: Optional[int] = None,
        num_classes: int = 3,
        cnn_channels: Tuple[int, ...] = (16, 32, 64),
        kernel_sizes: Tuple[int, ...] = (5, 3, 3),
        pool_type: str = "max",  # or 'avg'
        pool_sizes: Tuple[int, ...] = (2, 2, 2),
        use_batch_norm: bool = True,
        activation: str = "relu",  # "relu", "leaky_relu", "elu", "gelu"
        dropout: float = 0.1,
        cnn_dropout: Optional[float] = None,  # Separate dropout for CNN
        lstm_hidden_dim: int = 32,
        lstm_num_layers: int = 1,
        bidirectional: bool = True,
        lstm_dropout: Optional[float] = None,  # Separate dropout for LSTM
        classifier_hidden_dims: Optional[List[int]] = None,  # None = [32]
        attention_dim: Optional[int] = None,  # None = num_directions * lstm_hidden_dim
        attention_type: str = "basic",  # "basic", "scaled_dot", "multi_head"
        multi_head_num: int = 4,  # For multi-head attention
        residual_connections: bool = False,
        layer_normalization: bool = False,
        weight_init: str = "default",  # "default", "xavier", "kaiming"
    ):
        """
        Enhanced CNN-LSTM model with attention mechanism designed for tuning flexibility.

        Args:
            config: Optional dictionary of hyperparameters; every key that
                matches a keyword argument below overrides that argument
                (the dictionary returned by get_config() round-trips).
                Unknown keys are ignored with a warning.
            input_channels: Number of input channels
            seq_length: Length of input sequence; when given, the expected
                sequence length after the CNN is precomputed and validated
            num_classes: Number of output classes
            cnn_channels: CNN output channels, one entry per block
            kernel_sizes: Kernel size per CNN block (extra entries unused)
            pool_type: Pooling type ("max"; anything else selects average)
            pool_sizes: Pooling size per CNN block (extra entries unused)
            use_batch_norm: Whether to use batch normalization
            activation: Activation function type (unknown names use ReLU)
            dropout: Default dropout rate
            cnn_dropout: CNN-specific dropout (if None, uses dropout)
            lstm_hidden_dim: LSTM hidden dimension
            lstm_num_layers: Number of LSTM layers
            bidirectional: Whether LSTM is bidirectional
            lstm_dropout: LSTM-specific dropout (if None, uses dropout)
            classifier_hidden_dims: Hidden dimensions of the classifier MLP
            attention_dim: Projection size for "scaled_dot" attention
            attention_type: Type of attention mechanism; unknown values fall
                back to "basic"
            multi_head_num: Number of heads for multi-head attention
            residual_connections: Add a skip connection around every block
                whose input and output shapes match; the last block is then
                not pooled
            layer_normalization: Whether to use layer normalization
            weight_init: Weight initialization strategy

        Raises:
            ValueError: if kernel_sizes/pool_sizes have fewer entries than
                cnn_channels, or seq_length is too short for the pooling.
        """
        super().__init__()
        import warnings  # local: only needed to report ignored config keys

        hparams = {
            "input_channels": input_channels,
            "seq_length": seq_length,
            "num_classes": num_classes,
            "cnn_channels": cnn_channels,
            "kernel_sizes": kernel_sizes,
            "pool_type": pool_type,
            "pool_sizes": pool_sizes,
            "use_batch_norm": use_batch_norm,
            "activation": activation,
            "dropout": dropout,
            "cnn_dropout": cnn_dropout,
            "lstm_hidden_dim": lstm_hidden_dim,
            "lstm_num_layers": lstm_num_layers,
            "bidirectional": bidirectional,
            "lstm_dropout": lstm_dropout,
            "classifier_hidden_dims": classifier_hidden_dims,
            "attention_dim": attention_dim,
            "attention_type": attention_type,
            "multi_head_num": multi_head_num,
            "residual_connections": residual_connections,
            "layer_normalization": layer_normalization,
            "weight_init": weight_init,
        }
        if config is not None:
            # Assigning through locals() (as done previously) has no effect in
            # Python, so the overrides are merged into an explicit dict.
            ignored = [key for key in config if key not in hparams]
            if ignored:
                warnings.warn(f"Ignoring unknown config keys: {ignored}", stacklevel=2)
            hparams.update((key, value) for key, value in config.items() if key in hparams)

        # Store parameters
        self.input_channels = hparams["input_channels"]
        self.seq_length = hparams["seq_length"]
        self.num_classes = hparams["num_classes"]
        self.cnn_channels = hparams["cnn_channels"]
        self.kernel_sizes = hparams["kernel_sizes"]
        self.pool_type = hparams["pool_type"]
        self.pool_sizes = hparams["pool_sizes"]
        self.use_batch_norm = hparams["use_batch_norm"]
        self.activation_type = hparams["activation"]
        self.dropout_rate = hparams["dropout"]
        self.cnn_dropout_rate = (
            hparams["cnn_dropout"] if hparams["cnn_dropout"] is not None else self.dropout_rate
        )
        self.lstm_hidden_dim = hparams["lstm_hidden_dim"]
        self.lstm_num_layers = hparams["lstm_num_layers"]
        self.bidirectional = hparams["bidirectional"]
        self.lstm_dropout_rate = (
            hparams["lstm_dropout"] if hparams["lstm_dropout"] is not None else self.dropout_rate
        )
        hidden_dims = hparams["classifier_hidden_dims"]
        # Fresh list per instance (no shared mutable default).
        self.classifier_hidden_dims = [32] if hidden_dims is None else list(hidden_dims)
        self.residual_connections = hparams["residual_connections"]
        self.layer_normalization = hparams["layer_normalization"]
        self.weight_init = hparams["weight_init"]
        self.attention_type = hparams["attention_type"]
        self.multi_head_num = hparams["multi_head_num"]

        if self.attention_type not in ("basic", "scaled_dot", "multi_head"):
            print(f"ADVERTENCIA: Tipo de atención '{self.attention_type}' no reconocido. Usando 'basic'.")
            # Make forward() really use the announced fallback.
            self.attention_type = "basic"

        # Calculate directions
        self.num_directions = 2 if self.bidirectional else 1

        # Default attention dimension if not provided
        self.attention_dim = hparams["attention_dim"] or self.num_directions * self.lstm_hidden_dim

        num_blocks = len(self.cnn_channels)
        if len(self.kernel_sizes) < num_blocks or len(self.pool_sizes) < num_blocks:
            raise ValueError(
                "kernel_sizes and pool_sizes need at least one entry per cnn_channels entry "
                f"({num_blocks}); got {len(self.kernel_sizes)} and {len(self.pool_sizes)}"
            )

        # Precompute the sequence length after the CNN blocks
        self.input_seq_length = self.seq_length
        self.output_seq_length = None

        if self.seq_length is not None:
            seq_reduction = 1
            current_length = self.seq_length

            for idx in range(num_blocks):
                kernel_size = self.kernel_sizes[idx]
                # Stride-1 conv with padding k // 2 (adds one step for even k).
                current_length = current_length + 2 * (kernel_size // 2) - kernel_size + 1
                # forward() skips pooling on the last block with residuals on.
                if not (idx == num_blocks - 1 and self.residual_connections):
                    # Pooling layers use the default ceil_mode=False -> floor.
                    current_length //= self.pool_sizes[idx]
                    seq_reduction *= self.pool_sizes[idx]

            self.output_seq_length = current_length

            if self.output_seq_length <= 0:
                raise ValueError(f"La secuencia resultante después del pooling es demasiado corta. "
                                 f"Secuencia entrada: {self.seq_length}, reducción: {seq_reduction}")

        # For visualization and explanation
        self.cnn_activations = []
        self.lstm_activations = None
        self.attention_weights = None
        self.gradients = None
        self.last_cnn_output = None
        self.input = None

        # Create activation function
        self.activation = self._get_activation()

        # Create CNN layers. Residual flags live in a plain list because
        # nn.ModuleDict only accepts nn.Module values.
        self.cnn_blocks = nn.ModuleList()
        self._block_has_residual = []
        in_channels = self.input_channels

        for out_channels, kernel_size, pool_size in zip(
                self.cnn_channels, self.kernel_sizes, self.pool_sizes):
            block = nn.ModuleDict()
            block["conv"] = nn.Conv1d(in_channels, out_channels,
                                      kernel_size=kernel_size, padding=kernel_size // 2)

            if self.use_batch_norm:
                block["bn"] = nn.BatchNorm1d(out_channels)

            if self.layer_normalization:
                # LayerNorm over the feature (channel) dimension
                block["ln"] = nn.LayerNorm([out_channels])

            if self.pool_type == "max":
                block["pool"] = nn.MaxPool1d(kernel_size=pool_size)
            else:
                block["pool"] = nn.AvgPool1d(kernel_size=pool_size)

            block["dropout"] = nn.Dropout(self.cnn_dropout_rate)

            # A skip connection needs identical input/output shapes: same
            # channel count and an odd kernel (even kernels lengthen the output).
            self._block_has_residual.append(
                bool(self.residual_connections)
                and in_channels == out_channels
                and kernel_size % 2 == 1
            )

            self.cnn_blocks.append(block)
            in_channels = out_channels

        # LSTM layer
        self.lstm = nn.LSTM(
            input_size=self.cnn_channels[-1],
            hidden_size=self.lstm_hidden_dim,
            num_layers=self.lstm_num_layers,
            batch_first=True,
            bidirectional=self.bidirectional,
            dropout=self.lstm_dropout_rate if self.lstm_num_layers > 1 else 0
        )

        # Attention mechanism
        lstm_output_dim = self.num_directions * self.lstm_hidden_dim
        if self.attention_type == "scaled_dot":
            self.query = nn.Linear(lstm_output_dim, self.attention_dim)
            self.key = nn.Linear(lstm_output_dim, self.attention_dim)
            self.value = nn.Linear(lstm_output_dim, lstm_output_dim)
        elif self.attention_type == "multi_head":
            self.mha = nn.MultiheadAttention(
                embed_dim=lstm_output_dim,
                num_heads=self.multi_head_num,
                batch_first=True
            )
            self.attention_ln = nn.LayerNorm(lstm_output_dim)
        else:  # "basic"
            self.attention = nn.Linear(lstm_output_dim, 1)

        # Classifier
        classifier_layers = []
        in_dim = lstm_output_dim

        for hidden_dim in self.classifier_hidden_dims:
            classifier_layers.append(nn.Linear(in_dim, hidden_dim))
            classifier_layers.append(self.activation)
            classifier_layers.append(nn.Dropout(self.dropout_rate))
            in_dim = hidden_dim

        classifier_layers.append(nn.Linear(in_dim, self.num_classes))
        self.classifier = nn.Sequential(*classifier_layers)

        # Initialize weights
        self._initialize_weights()

    def _get_activation(self):
        """Return the activation module for ``activation_type`` (ReLU if unknown)."""
        if self.activation_type == "leaky_relu":
            return nn.LeakyReLU(0.1)
        if self.activation_type == "elu":
            return nn.ELU()
        if self.activation_type == "gelu":
            return nn.GELU()
        return nn.ReLU()

    def _initialize_weights(self):
        """Re-initialise Conv1d/Linear weights for "xavier"/"kaiming"; no-op otherwise."""
        if self.weight_init == "xavier":
            def init_weight(weight):
                nn.init.xavier_uniform_(weight)
        elif self.weight_init == "kaiming":
            def init_weight(weight):
                nn.init.kaiming_normal_(weight, mode='fan_out', nonlinearity='relu')
        else:
            return

        for m in self.modules():
            if isinstance(m, (nn.Conv1d, nn.Linear)):
                init_weight(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def activations_hook(self, grad):
        """Backward hook storing the gradient w.r.t. the last CNN output."""
        self.gradients = grad

    def reset_activation_storage(self):
        """Clear everything cached by the previous forward pass."""
        self.cnn_activations = []
        self.lstm_activations = None
        self.attention_weights = None
        self.gradients = None
        self.last_cnn_output = None

    def forward(self, x, return_attention=False, track_gradients=False):
        """
        Forward pass of the model.

        Args:
            x: Input tensor of shape (batch, channels, seq_length)
            return_attention: If True, also return the attention weights
            track_gradients: If True, register a hook capturing the gradient
                of the last CNN output (used for explainability)

        Returns:
            Classification logits and, optionally, the attention weights.
            The weights are (batch, time) for "basic" attention and whatever
            the attention computation yields for the other types
            ((batch, time, time) for "scaled_dot").
        """
        self.reset_activation_storage()
        self.input = x

        # Unpacking also enforces a 3-D input.
        _, channels, _ = x.shape

        if channels != self.input_channels:
            print(f"ADVERTENCIA: Número de canales de entrada ({channels}) "
                  f"difiere del configurado ({self.input_channels})")

        last_block = len(self.cnn_blocks) - 1
        for i, block in enumerate(self.cnn_blocks):
            # Keep the block input for a possible skip connection
            residual = x if self._block_has_residual[i] else None

            x = block["conv"](x)

            if "bn" in block:
                x = block["bn"](x)

            if "ln" in block:
                # LayerNorm acts on the last dim: go to (batch, seq, channels) and back
                x = block["ln"](x.transpose(1, 2)).transpose(1, 2)

            x = self.activation(x)

            if residual is not None:
                x = x + residual

            # The last block is left unpooled when residual connections are on
            if not (i == last_block and self.residual_connections):
                x = block["pool"](x)

            x = block["dropout"](x)
            self.cnn_activations.append(x.detach())

        cnn_output = x

        final_seq_len = x.shape[2]
        if self.output_seq_length is not None and final_seq_len != self.output_seq_length:
            print(f"ADVERTENCIA: Longitud secuencia después de CNN ({final_seq_len}) "
                  f"difiere de la esperada ({self.output_seq_length})")

        if track_gradients and cnn_output.requires_grad:
            cnn_output.register_hook(self.activations_hook)

        self.last_cnn_output = cnn_output

        # Reshape for LSTM: (batch, channels, seq) -> (batch, seq, channels)
        lstm_out, _ = self.lstm(cnn_output.permute(0, 2, 1))
        self.lstm_activations = lstm_out.detach()

        if self.attention_type == "basic":
            attention_scores = self.attention(lstm_out).squeeze(-1)
            attention_weights = F.softmax(attention_scores, dim=1)
            context_vector = torch.bmm(attention_weights.unsqueeze(1), lstm_out).squeeze(1)

        elif self.attention_type == "scaled_dot":
            Q = self.query(lstm_out)
            K = self.key(lstm_out)
            V = self.value(lstm_out)

            scores = torch.bmm(Q, K.transpose(1, 2)) / (self.attention_dim ** 0.5)
            attention_weights = F.softmax(scores, dim=-1)
            context_vector = torch.bmm(attention_weights, V).mean(dim=1)

        elif self.attention_type == "multi_head":
            # MultiheadAttention returns (attn_output, attn_output_weights)
            attn_output, attention_weights = self.mha(lstm_out, lstm_out, lstm_out)

            if self.layer_normalization:
                attn_output = self.attention_ln(attn_output + lstm_out)

            context_vector = attn_output.mean(dim=1)

        else:
            # Only reachable if attention_type is changed after construction:
            # uniform attention over time.
            steps = lstm_out.shape[1]
            attention_weights = torch.ones(
                lstm_out.shape[0], steps, device=lstm_out.device) / steps
            context_vector = lstm_out.mean(dim=1)

        # Keep the weights for visualization/interpretation
        self.attention_weights = attention_weights.detach()

        # Classification
        out = self.classifier(context_vector)

        if return_attention:
            return out, attention_weights
        return out

    def get_config(self):
        """Returns the current configuration as a dictionary"""
        return {
            "input_channels": self.input_channels,
            "seq_length": self.seq_length,
            "num_classes": self.num_classes,
            "cnn_channels": self.cnn_channels,
            "kernel_sizes": self.kernel_sizes,
            "pool_type": self.pool_type,
            "pool_sizes": self.pool_sizes,
            "use_batch_norm": self.use_batch_norm,
            "activation": self.activation_type,
            "dropout": self.dropout_rate,
            "cnn_dropout": self.cnn_dropout_rate,
            "lstm_hidden_dim": self.lstm_hidden_dim,
            "lstm_num_layers": self.lstm_num_layers,
            "bidirectional": self.bidirectional,
            "lstm_dropout": self.lstm_dropout_rate,
            "classifier_hidden_dims": self.classifier_hidden_dims,
            "attention_dim": self.attention_dim,
            "attention_type": self.attention_type,
            "multi_head_num": self.multi_head_num,
            "residual_connections": self.residual_connections,
            "layer_normalization": self.layer_normalization,
            "weight_init": self.weight_init
        }
count_parameters(self): + """Count and return the number of trainable parameters""" + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + def get_intermediate_outputs(self, x): + """Get all intermediate activations for a given input""" + _ = self.forward(x, track_gradients=True) + return { + "cnn_activations": self.cnn_activations, + "lstm_activations": self.lstm_activations, + "attention_weights": self.attention_weights + } + + def visualize_attention(self, x, return_fig=False): + """Visualize attention weights for a given input""" + try: + import matplotlib.pyplot as plt + + _, attention_weights = self.forward(x, return_attention=True) + + if attention_weights is None: + print("No attention weights available") + return None + + batch_size = attention_weights.size(0) + seq_len = attention_weights.size(1) + + fig, axes = plt.subplots(batch_size, 1, figsize=(10, 2*batch_size)) + if batch_size == 1: + axes = [axes] + + for i, ax in enumerate(axes): + weights = attention_weights[i].cpu().detach().numpy() + ax.bar(range(seq_len), weights) + ax.set_title(f"Sample {i+1}") + ax.set_xlabel("Sequence position") + ax.set_ylabel("Attention weight") + + plt.tight_layout() + + if return_fig: + return fig + plt.show() + return None + except ImportError: + print("matplotlib is required for visualization") + return None + + def interpret(self, x, class_idx=None, methods=None): + """ + Enhanced interpretation method with multiple explainability techniques + + Args: + x: Input data tensor + class_idx: Target class indices to explain (defaults to predicted class) + methods: List of methods to use, options: ['gradcam', 'integrated_gradients', + 'occlusion', 'shap', 'feature_ablation', 'all'] + + Returns: + Dictionary with various interpretability outputs + """ + try: + # Try to import Captum components + from captum.attr import IntegratedGradients, Occlusion, GradientShap, LayerGradCam + except ImportError: + raise ImportError("This method requires the 'captum' 
package. Install with: pip install captum") + + if methods is None: + methods = ['gradcam', 'attention'] # Default methods + if 'all' in methods: + methods = ['gradcam', 'integrated_gradients', 'occlusion', 'shap', + 'feature_ablation', 'attention', 'layer_importance'] + + # Store original training state + was_training = self.training + lstm_was_training = self.lstm.training + + # Set model to evaluation mode for interpretability + self.eval() + self.lstm.train() # needed for CuDNN backward compatibility + + # Base prediction + x.requires_grad_() + self.input = x # Store input for interpretability methods + + logits, attention = self.forward(x, return_attention=True, track_gradients=True) + pred = torch.softmax(logits, dim=1) + + if class_idx is None: + class_idx = pred.argmax(dim=1) + + # Initialize results dictionary + results = { + 'prediction': pred.detach(), + 'class_idx': class_idx, + 'attention_weights': self.attention_weights, + } + + # Apply selected interpretability methods + if 'gradcam' in methods: + for i in range(x.shape[0]): + pred[i, class_idx[i]].backward(retain_graph=True if i < x.shape[0]-1 else False) + + results['feature_importance'] = self.get_feature_importance() + results['temporal_channel_importance'] = self.get_temporal_channel_importance() + results['channel_importance'] = self.get_channel_importance() + results['cnn_activations'] = self.cnn_activations + + # Integrated Gradients + if 'integrated_gradients' in methods: + ig = IntegratedGradients(self.forward_wrapper) + results['integrated_gradients'] = self._compute_integrated_gradients( + ig, x, class_idx) + + # Occlusion analysis + if 'occlusion' in methods: + occlusion = Occlusion(self.forward_wrapper) + results['occlusion'] = self._compute_occlusion(occlusion, x, class_idx) + + # SHAP (GradientSHAP implementation) + if 'shap' in methods: + gradient_shap = GradientShap(self.forward_wrapper) + results['gradient_shap'] = self._compute_gradient_shap(gradient_shap, x, class_idx) + + # 
def get_feature_importance(self):
    """Temporal Grad-CAM over ``self.last_cnn_output``.

    Each channel of the stored CNN output is weighted by the gradient
    averaged over dimensions 0 and 2, the channels are averaged, negative
    values are clipped and the map is scaled by its global maximum.

    Returns:
        A detached tensor with the channel dimension reduced, or None when
        either ``self.gradients`` or ``self.last_cnn_output`` is missing.
    """
    grads = self.gradients
    features = self.last_cnn_output
    if grads is None or features is None:
        return None

    # One weight per channel: mean gradient over dims 0 and 2.
    channel_weights = torch.mean(grads, dim=[0, 2])

    # Work on a copy so the stored activations are left untouched.
    weighted = features.clone()
    weighted.mul_(channel_weights[None, :, None])

    # Keep only positive influences.
    heatmap = F.relu(torch.mean(weighted, dim=1).detach())

    # Scale to [0, 1] unless the map is entirely zero.
    peak = heatmap.max()
    if peak > 0:
        heatmap = heatmap / peak
    return heatmap
Call interpret() first.") + return self.input.grad.abs().detach() + + def _compute_integrated_gradients(self, ig, x, class_idx): + """Compute integrated gradients attribution""" + batch_size = x.shape[0] + attributions = [] + + for i in range(batch_size): + baseline = torch.zeros_like(x[i:i+1]) + attr = ig.attribute( + x[i:i+1], baseline, target=class_idx[i].item(), n_steps=50 + ) + attributions.append(attr) + + return torch.cat(attributions).detach() + + def _compute_occlusion(self, occlusion_algo, x, class_idx): + """Compute occlusion-based feature attribution""" + batch_size = x.shape[0] + attributions = [] + + # Define sliding window parameters for temporal data + window_size = min(5, x.shape[2] // 4) # Adapt window size to input length + + for i in range(batch_size): + attr = occlusion_algo.attribute( + x[i:i+1], + sliding_window_shapes=(1, window_size), + target=class_idx[i].item(), + strides=(1, max(1, window_size // 2)) + ) + attributions.append(attr) + + return torch.cat(attributions).detach() + + def _compute_gradient_shap(self, shap_algo, x, class_idx): + """Compute GradientSHAP attributions""" + batch_size = x.shape[0] + attributions = [] + + for i in range(batch_size): + # Create random baselines (typically 10-50 for good estimates) + baselines = torch.randn(10, *x[i:i+1].shape[1:]) * 0.001 + + # Ensure baselines device matches input + baselines = baselines.to(x.device) + + attr = shap_algo.attribute( + x[i:i+1], baselines=baselines, target=class_idx[i].item() + ) + attributions.append(attr) + + return torch.cat(attributions).detach() + + def _feature_ablation_analysis(self, x, class_idx): + """Analyze model by systematically ablating input features""" + batch_size = x.shape[0] + results = [] + + for i in range(batch_size): + # Store original prediction + with torch.no_grad(): + orig_output = self.forward(x[i:i+1]) + orig_prob = torch.softmax(orig_output, dim=1)[0, class_idx[i]].item() + + # Test ablation of each channel + channel_importance = [] + for 
c in range(self.input_channels): + # Create ablated input (zero out one channel) + ablated_input = x[i:i+1].clone() + ablated_input[:, c, :] = 0 + + # Get prediction on ablated input + with torch.no_grad(): + ablated_output = self.forward(ablated_input) + ablated_prob = torch.softmax(ablated_output, dim=1)[0, class_idx[i]].item() + + # Impact is reduction in probability + channel_impact = orig_prob - ablated_prob + channel_importance.append(channel_impact) + + results.append(torch.tensor(channel_importance)) + + return torch.stack(results) + + def _compute_layer_importance(self, x, class_idx): + """Compute importance of each layer using Layer GradCAM""" + try: + from captum.attr import LayerGradCam + except ImportError: + raise ImportError("This method requires the 'captum' package.") + + batch_size = x.shape[0] + layer_importance = {} + + # Define layers to analyze - adapted for our new ModuleList structure + layers = {} + for i, block in enumerate(self.cnn_blocks): + layers[f'conv{i+1}'] = block['conv'] + + for layer_name, layer in layers.items(): + layer_gradcam = LayerGradCam(self.forward_wrapper, layer) + layer_attrs = [] + + for i in range(batch_size): + attr = layer_gradcam.attribute( + x[i:i+1], target=class_idx[i].item() + ) + # Process attribution to create a single importance score per sample + pooled_attr = torch.mean(attr, dim=1) + + layer_attrs.append(pooled_attr) + + layer_importance[layer_name] = torch.cat(layer_attrs).detach() + + return layer_importance + + def visualize_attributions(self, sample_idx, interpretations, time_axis=None, + channel_names=None, class_names=None): + """ + Visualize the various interpretation results + + Args: + sample_idx: Index of the sample to visualize + interpretations: Dictionary returned by interpret() method + time_axis: Optional array/list with time points for x-axis + channel_names: Optional list of channel names + class_names: Optional list of class names + """ + try: + import matplotlib.pyplot as plt + import 
numpy as np + except ImportError: + raise ImportError("This method requires matplotlib and numpy for visualization") + + if not channel_names: + channel_names = [f'Channel {i}' for i in range(self.input_channels)] + + if not class_names: + class_idx = interpretations['class_idx'][sample_idx].item() + class_name = f'Class {class_idx}' + else: + class_idx = interpretations['class_idx'][sample_idx].item() + class_name = class_names[class_idx] + + # Set up figure + plt.figure(figsize=(15, 12)) + + # Original input visualization (top row, first column) + plt.subplot(3, 3, 1) + if self.input is not None: + input_data = self.input[sample_idx].cpu().detach().numpy() + if time_axis is not None: + for i in range(input_data.shape[0]): + plt.plot(time_axis, input_data[i], label=channel_names[i]) + else: + for i in range(input_data.shape[0]): + plt.plot(input_data[i], label=channel_names[i]) + plt.legend(loc='best') + plt.title('Input Signal') + plt.xlabel('Time') + plt.ylabel('Value') + + # GradCAM feature importance (top row, second column) + if 'feature_importance' in interpretations and interpretations['feature_importance'] is not None: + plt.subplot(3, 3, 2) + heatmap = interpretations['feature_importance'][sample_idx].cpu().numpy() + if time_axis is not None: + plt.plot(time_axis, heatmap) + else: + plt.plot(heatmap) + plt.title('GradCAM Feature Importance') + plt.xlabel('Time') + plt.ylabel('Importance') + + # Attention weights (top row, third column) + if 'attention_weights' in interpretations and interpretations['attention_weights'] is not None: + plt.subplot(3, 3, 3) + attention = interpretations['attention_weights'][sample_idx].cpu().numpy() + + if time_axis is not None: + # Need to match attention time axis to input time axis + # (account for pooling in the network) + x_points = np.linspace(time_axis[0], time_axis[-1], len(attention)) + plt.plot(x_points, attention) + else: + plt.plot(attention) + plt.title('Attention Weights') + plt.xlabel('Time') + 
plt.ylabel('Attention') + + # Channel importance (middle row, first column) + if 'channel_importance' in interpretations and interpretations['channel_importance'] is not None: + plt.subplot(3, 3, 4) + ch_importance = interpretations['channel_importance'][sample_idx].cpu().numpy() + plt.bar(channel_names, ch_importance) + plt.title('Channel Importance') + plt.ylabel('Importance') + plt.xticks(rotation=45) + + # Integrated Gradients (middle row, second column) + if 'integrated_gradients' in interpretations: + plt.subplot(3, 3, 5) + ig_attr = interpretations['integrated_gradients'][sample_idx].cpu().numpy() + ig_attr_mean = np.mean(ig_attr, axis=0) # Average across channels for visualization + + if time_axis is not None: + plt.plot(time_axis, ig_attr_mean) + else: + plt.plot(ig_attr_mean) + plt.title('Integrated Gradients') + plt.xlabel('Time') + plt.ylabel('Attribution') + + # Feature Ablation (middle row, third column) + if 'feature_ablation' in interpretations: + plt.subplot(3, 3, 6) + ablation_scores = interpretations['feature_ablation'][sample_idx].cpu().numpy() + plt.bar(channel_names, ablation_scores) + plt.title('Feature Ablation Impact') + plt.ylabel('Probability Change') + plt.xticks(rotation=45) + + # SHAP values (bottom row, first column) + if 'gradient_shap' in interpretations: + plt.subplot(3, 3, 7) + shap_attr = interpretations['gradient_shap'][sample_idx].cpu().numpy() + # Visualize average SHAP value over time + shap_avg = np.mean(shap_attr, axis=0) + + if time_axis is not None: + plt.plot(time_axis, shap_avg) + else: + plt.plot(shap_avg) + plt.title('GradientSHAP Values') + plt.xlabel('Time') + plt.ylabel('SHAP Value') + + # Occlusion analysis (bottom row, second column) + if 'occlusion' in interpretations: + plt.subplot(3, 3, 8) + occlusion_attr = interpretations['occlusion'][sample_idx].cpu().numpy() + occlusion_avg = np.mean(occlusion_attr, axis=0) + + if time_axis is not None: + plt.plot(time_axis, occlusion_avg) + else: + plt.plot(occlusion_avg) 
def create_model_with_config(**kwargs):
    """Build a ``CNN_LSTM_Classifier_Tunable`` from default hyperparameters.

    Any keyword argument overrides the default of the same name; unknown
    names are appended to the configuration unchanged.
    """
    defaults = {
        "input_channels": 3,
        "num_classes": 3,
        "cnn_channels": (16, 32, 64),
        "kernel_sizes": (5, 3, 3),
        "pool_type": "max",
        "dropout": 0.1,
        "lstm_hidden_dim": 32,
        "lstm_num_layers": 1,
        "bidirectional": True,
        "classifier_hidden_dims": [32],
        "attention_type": "basic",
        "residual_connections": False,
        "layer_normalization": False
    }

    # Caller-supplied values win over the defaults.
    merged = {**defaults, **kwargs}
    return CNN_LSTM_Classifier_Tunable(config=merged)
def butter_bandpass_filter(data, lowcut, highcut, fs, order=4):
    """Apply a Butterworth band-pass filter forwards and backwards.

    Args:
        data: Signal passed to ``filtfilt``.
        lowcut: Lower cut-off, in the same unit as ``fs``.
        highcut: Upper cut-off, in the same unit as ``fs``.
        fs: Sampling frequency.
        order: Order passed to ``butter``.

    Returns:
        The filtered signal returned by ``filtfilt``.
    """
    nyquist = 0.5 * fs
    # butter() expects the band edges normalised by the Nyquist frequency.
    normalized_band = [lowcut / nyquist, highcut / nyquist]
    numerator, denominator = butter(order, normalized_band, btype='bandpass')
    # filtfilt runs the filter in both directions so no phase lag is introduced.
    return filtfilt(numerator, denominator, data)
def plot_EEG_sel(sel, name = "EEG_plot_raw.html"):
    """Write an interactive stacked plot of the selected channels to ``graphs/``.

    Args:
        sel: Sequence of ``(index, signal)`` pairs. Each signal must expose
            ``label``, ``sampling_frequency`` and ``data``.
        name: Output file name, with or without the ``.html`` extension;
            the file is written to ``graphs/<name>.html``.
    """
    import os

    fig = make_subplots(rows=len(sel), cols=1,
                        shared_xaxes=True,
                        vertical_spacing=0.02,
                        subplot_titles=[ch[1].label for ch in sel])

    for row, (_, sig) in enumerate(sel, start=1):
        fs = sig.sampling_frequency
        # Sample k sits at k / fs seconds (linspace with the end point
        # included would stretch the axis slightly).
        time = np.arange(len(sig.data)) / fs

        # Only the first 3000 s are drawn to keep the browser responsive.
        limit = int(3000 * fs)
        fig.add_trace(
            go.Scattergl(x=time[:limit], y=sig.data[:limit], name=sig.label, mode='lines'),
            row=row, col=1
        )

    fig.update_layout(
        height=900,
        title_text="Polisomnografía - Canales EEG",
        showlegend=False,
        template="plotly_white"
    )

    fig.update_xaxes(title_text="Tiempo (segundos)", row=len(sel), col=1)

    # Accept names with or without ".html" so the default does not end up
    # as "EEG_plot_raw.html.html".
    stem = name[:-len(".html")] if name.endswith(".html") else name
    os.makedirs("graphs", exist_ok=True)
    fig.write_html(f"graphs/{stem}.html")  # saved as HTML for interactive viewing
def adaptive_variance_cleaner(data, fs, win_size_ms=500, alpha=0.1, threshold=3.5):
    """Zero out windows whose variance far exceeds an adaptive reference.

    The signal is scanned in consecutive, non-overlapping windows. A window
    is marked as an artifact when its variance is greater than ``threshold``
    times the running reference variance; otherwise the reference is updated
    with an exponential moving average.

    Args:
        data: 1-D signal array.
        fs: Sampling frequency.
        win_size_ms: Window length in milliseconds.
        alpha: Adaptation factor (0 to 1); higher values forget the past faster.
        threshold: Multiplier of the reference variance that flags an artifact.

    Returns:
        Tuple ``(cleaned_data, artifact_mask)``: a copy of ``data`` with the
        flagged windows set to 0, and a boolean mask of the flagged samples.
        Trailing samples that do not fill a whole window are left untouched.

    Raises:
        ValueError: If the window is shorter than one sample.
    """
    win_samples = int((win_size_ms / 1000) * fs)
    if win_samples < 1:
        raise ValueError("win_size_ms and fs must give a window of at least one sample")

    n_samples = len(data)
    cleaned_data = np.copy(data)

    # The reference variance starts from the first window.
    adaptive_var = np.var(data[:win_samples])

    artifact_mask = np.zeros(n_samples, dtype=bool)

    # "+ 1" so the last full window is also examined when the signal length
    # is an exact multiple of the window length.
    for start in range(0, n_samples - win_samples + 1, win_samples):
        window = slice(start, start + win_samples)
        current_var = np.var(data[window])

        if current_var > threshold * adaptive_var:
            artifact_mask[window] = True
            cleaned_data[window] = 0  # could be interpolated instead
            # The reference is not updated with an artifact so it is not contaminated.
        else:
            # Exponential moving average update.
            adaptive_var = alpha * current_var + (1 - alpha) * adaptive_var

    return cleaned_data, artifact_mask
def get_patient_profile(df_features):
    """Summarise per-epoch band powers into one feature row.

    Args:
        df_features: DataFrame with one row per epoch and one column per
            band. The columns 'Delta', 'Theta', 'Alpha', 'Sigma' and 'Beta'
            are required; their order does not matter.

    Returns:
        A Series with the mean relative power of every column ('Rel_<col>'),
        followed by band ratios, per-band entropy, kurtosis and coefficient
        of variation.

    Raises:
        KeyError: If one of the required band columns is missing.
    """
    bands = ['Delta', 'Theta', 'Alpha', 'Sigma', 'Beta']

    # Mean absolute power per band over all epochs, and total power per epoch.
    avg_p = df_features.mean()
    total_power = df_features.sum(axis=1)

    # Coefficient of variation (std / mean) per band.
    variability = df_features.std() / avg_p

    # Kurtosis of each band's power across epochs.
    kurt = df_features.apply(kurtosis)

    # Entropy of each column, indexed by column name so the labels below
    # stay correct whatever the column order is.
    spec_entropy = pd.Series(
        entropy(df_features.to_numpy(dtype=float), axis=0),
        index=df_features.columns,
    )

    # Relative power per epoch, averaged over all epochs.
    rel_powers = df_features.div(total_power, axis=0).mean()
    rel_powers.index = ['Rel_' + col for col in rel_powers.index]

    theta_alpha = avg_p['Theta'] / avg_p['Alpha']
    ratios = {
        'Ratio_Theta_Alpha': theta_alpha,
        'Ratio_Slow_Fast': (avg_p['Delta'] + avg_p['Theta']) / (avg_p['Alpha'] + avg_p['Beta']),
        'Sigma_Stability': variability['Sigma'],
    }
    ratios.update({f'Spectral_Entropy_{band.lower()}': spec_entropy[band] for band in bands})
    ratios['Theta_Alpha_Ratio'] = theta_alpha
    ratios['Theta_Beta_Ratio'] = avg_p['Theta'] / avg_p['Beta']
    ratios['Relative_Delta_Power'] = avg_p['Delta'] / avg_p.sum()
    ratios.update({f'kurtosis_{band}': kurt[band] for band in bands})
    ratios.update({f'variability_{band}': variability[band] for band in bands})

    # Combine everything into a single row.
    profile = pd.concat([rel_powers, pd.Series(ratios)])
    return profile
def plot_resp(Data, subjet = 1, DownPrinting = 2):
    """Plot one subject's respiration columns against ``Time`` using Plotly.

    Args:
        Data: Either a dict keyed by subject id (as a string) whose values
            can be turned into a DataFrame, or a DataFrame with a 'Subjet'
            column holding subject ids as strings.
        subjet: Subject id to plot; it is converted with ``str()``.
        DownPrinting: Keep one row out of every ``DownPrinting`` rows.

    Raises:
        TypeError: If ``Data`` is neither a dict nor a DataFrame.
    """
    if isinstance(Data, dict):
        Data = pd.DataFrame(Data[str(subjet)])
        end = -1  # the last column is not plotted
    elif isinstance(Data, pd.DataFrame):
        Data = Data[Data['Subjet'] == str(subjet)]
        end = -2  # the last two columns are not plotted
    else:
        raise TypeError(
            f"Data must be a dict or a pandas DataFrame, got {type(Data).__name__}"
        )

    Data = Data.iloc[::DownPrinting, :]

    print(len(Data.columns))
    fig = go.Figure()
    for c in Data.columns[:end]:
        # go.Scatter replaces the deprecated go.Line alias.
        fig.add_trace(go.Scatter(x=Data.Time, y=Data[c], name=c))
    # NOTE(review): the title says 'EDA Data' although this plots respiration -- confirm
    fig.update_layout(title_text='EDA Data', title_x=0.5)

    fig.show()
def lowpass_filter(signal, fs, cutoff=2.0, order=4):
    """Apply a digital Butterworth low-pass filter forwards and backwards.

    Args:
        signal: Signal passed to ``filtfilt``.
        fs: Sampling frequency.
        cutoff: Cut-off frequency, in the same unit as ``fs``.
        order: Order passed to ``butter``.

    Returns:
        The filtered signal returned by ``filtfilt``.
    """
    # butter() takes the cut-off as a fraction of the Nyquist frequency.
    nyquist = 0.5 * fs
    coefficients = butter(order, cutoff / nyquist, btype='low', analog=False)
    return filtfilt(*coefficients, signal)
complexity + mobility = np.sqrt(dx_var / x_var) + complexity = np.sqrt(ddx_var / dx_var) / mobility + + + filtered_signal = lowpass_filter(section, 100, cutoff=2.0, order=4) + segment4Hz = resample(filtered_signal, int(filtered_signal.size/100*4)) # Resample to 4Hz + + fft_signal = fft.fft(detrend(segment4Hz), n=2**12) + power = np.abs(fft_signal)**2 + freqs = fft.fftfreq(2**12, d = 1/4) + max_freq_index = np.argmax(power) + max_freq = freqs[max_freq_index] + power_at_max_freq = power[max_freq_index-51:max_freq_index+51].sum() + power_ratio = power_at_max_freq / np.sum(power[:len(power)//2]) + + Sol = [subjet, secc[:secc.find('_')], np.mean(hat_Br), pendiente, interseccion, real_peak, bocanada, Contraction, TidalVolume, complexity, mobility, x_var, max_freq, power_ratio] + + Sol_subject.append(Sol) + + + Sol_subject = pd.DataFrame(Sol_subject, columns=['Subject', 'Stage', 'Peakedness', 'Slope', + 'Intercept', 'Relative Peak', 'Bocanada', + "Contraction","TidalVolume", "Complexity", + "Mobility", "Activity", "Max_freq", "Power_ratio"]) + + peakmean = Sol_subject['Peakedness'].mean() + peakmin = Sol_subject['Peakedness'].min() + peakmax = Sol_subject['Peakedness'].max() + + slopemean = Sol_subject['Slope'].mean() + slopemin = Sol_subject['Slope'].min() + slopemax = Sol_subject['Slope'].max() + + Rel_peak_mean = Sol_subject['Relative Peak'].mean() + + Bocanada_max = Sol_subject['Bocanada'].max() + Contraction_max = Sol_subject['Contraction'].max() + TidalVolume_max = Sol_subject['TidalVolume'].max() + + Rel_metrics = ['Subject', 'Stage',"Peakmean", "Peakmin", "Peakmax", "Slopemean", "Slopemin", "Slopemax", "Rel_peak_mean", "Bocanada_max", "Contraction_max", "TidalVolume_max"] + # Rel_metrics = ["Peakmean", "Peakmin", "Peakmax", "Slopemean", "Slopemin", "Slopemax", "Rel_peak_mean", "Bocanada_max", "Contraction_max", "TidalVolume_max"] + + Sol_interSubject_DF = pd.DataFrame(Sol_interSubject, columns=Rel_metrics) + Sol_interSubject_DF = pd.DataFrame(Sol_interSubject, 
def Significance_tests(RespData):
    """Run a Kruskal-Wallis test per metric across the four stages.

    Every column of ``RespData`` from the third one onwards is compared
    between the rows whose ``Stage`` is "Baseline", "LOW", "HIGH" and
    "REST". The p-values are printed, saved to
    ``./Graphs/kruskal_results.xlsx``, plotted against a 0.05 reference
    line and saved to ``./Graphs/kruskal_results.png``.

    Args:
        RespData: DataFrame with a ``Stage`` column; the first two columns
            are skipped and the remaining ones are treated as metrics.
    """
    import os

    stages = ("Baseline", "LOW", "HIGH", "REST")
    results = {}
    for metrica in RespData.columns[2:]:
        groups = [
            RespData.loc[RespData.Stage == stage, metrica].to_numpy()
            for stage in stages
        ]
        _, p_valor = kruskal(*groups)

        # The whole message is one f-string so the p-value is actually
        # interpolated instead of printing "{p_valor}" literally.
        print(f"Metrica: {metrica} tiene un valor p: {p_valor}")
        results[metrica] = p_valor

    results = pd.DataFrame.from_dict(results, orient='index', columns=['p_value'])
    results = results.reset_index()

    # Make sure the output folder exists before writing into it.
    os.makedirs('./Graphs', exist_ok=True)
    results.to_excel('./Graphs/kruskal_results.xlsx', index=False)

    plt.plot(results['index'], results['p_value'])
    plt.axhline(y=0.05, color='r', linestyle='--')  # significance threshold
    plt.xlabel('Métrica')
    plt.ylabel('Valor p')
    plt.title('Resultados de la prueba de Kruskal-Wallis')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig('./Graphs/kruskal_results.png')
    plt.show()
Setup["DT"] + + if 'Ts' not in Setup.keys(): + Setup["Ts"] = 42 + Ts = 42 + else: + Ts = Setup["Ts"] + + if 'Tm' not in Setup.keys(): + Setup["Tm"] = 12 + Tm = 12 + else: + Tm = Setup["Tm"] + + if 'Nfft' not in Setup.keys(): + Setup["Nfft"] = np.power(2,12) + Nfft = np.power(2,12) + else: + Nfft = Setup["Nfft"] + + if 'K' not in Setup.keys(): + Setup["K"] = 5 + K = 5 + else: + K = Setup["K"] + + if 'Omega_r' not in Setup.keys(): + Setup["Omega_r"] = np.array([0.04, 1]) + Omega_r = np.array([0.04, 1]) + else: + Omega_r = Setup["Omega_r"] + + if 'ksi_p' not in Setup.keys(): + Setup["ksi_p"] = 45 + ksi_p = 45 + else: + ksi_p = Setup["ksi_p"] + + if 'N_k' not in Setup.keys(): + Setup["N_k"] = 4 + N_k = 4 + else: + N_k = Setup["N_k"] + + if 'ksi_a' not in Setup.keys(): + Setup["ksi_a"] = 85 + ksi_a = 85 + else: + ksi_a = Setup["ksi_a"] + + if 'd' not in Setup.keys(): + Setup["d"] = 0.125 + d = 0.125 + else: + d = Setup["d"] + + if 'b' not in Setup.keys(): + Setup["b"] = 0.8 + b = 0.8 + else: + b = Setup["b"] + + if 'a' not in Setup.keys(): + Setup["a"] = 0.5 + a = 0.5 + else: + a = Setup["a"] + + if 'plotflag' not in Setup.keys(): + Setup["plotflag"] = False + plotflag =False + else: + plotflag = Setup["plotflag"] + + + return [ DT, Ts, Tm, Nfft, K, Omega_r, ksi_p, ksi_a, d, b, a, N_k, plotflag, Setup] + +def extract_interval( x, t, int_ini, int_end ): + # EXTRACT_INTERVAL Very simple function to extract an interval from a signal + # + # Created by Jesús Lázaro in 2011 + # -------- + # Sintax: [ x_int, t_int, indexes ] = extract_interval( x, t, int_ini, int_end ) + # In: x = signal + # t = time vector + # int_ini = interval begin time (same units as 't') + # int_end = interval end time (same units as 't') + # + # Out: x_int = interval [int_ini, int_end] of 'x' + # t_int = interval [int_ini, int_end] of 't' + # indexes = indexes corresponding to returned time interval + + x_int = x[(t>=int_ini) & (t <=int_end)] + t_int = t[(t>=int_ini) & (t <=int_end)] + + return [ 
x_int, t_int ] + +def normalizar_PSD( PSD, f = 'default', rango = 'default'): + # NORMALIZAR_PSD Normaliza una densidad espectral de potencia en el rango + # de frecuencias requerido. + # + # Created by Jesús Lázaro in 2011 + # ------- + # Sintax: [ PSD_norm, f_PSD_norm, factor_norm ] = normalizar_PSD( PSD, f, rango ) + # In: PSD = Densidad espectral de potencia + # f = Vector de frecuencias para PSD [Por defecto: frecuencias digitales] + # rango = Rango [f1, f2] en el que se aplicar� la normalizaci�n [Por defecto: Todo f] + # + # Out: PSD_norm = Densidad espectral de potencia notmalizada + # f_PSD_norm = Vector de frecuencias para PSD_norm + # factor_norm = Factor de normalizaci�n utilizado + + if f == 'default': + f = np.arange(0,PSD.shape[0]) / PSD.shape[0] - 1/2 + + if rango == 'default': + rango = [f[0], f[-1]] + + + # Seleccionar rango de inter�s: + f_PSD_norm = f[(f>=rango[0]) & (f<=rango[1])] + PSD = PSD[(f>=rango[0]) & (f<=rango[1])] + if ~f_PSD_norm.any(): # El vector de frecuencias no estaba ordenado + print('El vector de frecuencias debe estar ordenado de forma ascendente'); + + + # Calcular factor de normalizaci�n y normalizar: + ##ADD NAN removal from PSD + factor_norm = sum(PSD) + # print("factor_norm "+ str(factor_norm)) + if factor_norm == 0: + # print("stop") // IMPORTANT MODIFICATION TODO + PSD_norm = PSD + else: + PSD_norm = PSD/factor_norm + + return [ PSD_norm, f_PSD_norm, factor_norm ] + +def init_module(kk,vars,param, plotflag): + # function vars = init_module(kk,vars,param, plotflag) + # This function is used for initialization and reinitialization of bar_fr + Skl = vars["Skl"] + t_orig = vars["t_orig"] + t_aver = vars["t_aver"] + f = vars["f"] + L = vars["L"] + + DT = param["DT"] + K = param["K"] + ksi_p = param["ksi_p"] + d = param["d"] + + # Increment of number of spectra for averaging + if kk == 0: # INITIALIZATION + N = 4*np.floor(K/2) + else: # RE-INITIALIZATION + N = 2*np.floor(K/2) + + ###### Peakedness Analysis : + # Indexes of 
original spectra that take part in the average + O = np.bitwise_and(t_orig>=t_aver[kk]-N*DT, t_orig<=t_aver[kk]+N*DT) + W = np.arange(O.shape[0]) + O = W[O] + # W = np.ones([O.shape[0]]) + # O1 = W[O] + # Pre-allocate + Xkl = np.empty((O.shape[0], L)) + Xkl[:] = np.nan + for k in range(O.shape[0]): + for l in range(L): + S = Skl[:, O[k], l] + # print(S.shape) + # Use as reference for Pkl calculation the absolute maximum + i_m = S.argmax() + fr_max = f[i_m] + + # Define the Omega, Omega_p bands + Omega = np.bitwise_and(f>=fr_max-d, f<=fr_max+d) + + # Modified limits for initialization (reduces the risk for 0.1 Hz) + Omega_p = np.bitwise_and(f>=max(fr_max-0.4*d,0.15), f<=min(fr_max+0.4*d,0.8)) + + # Peakedness + # print(S[Omega]) + Pkl = 100*sum(S[Omega_p])/sum(S[Omega]) + + if Pkl >= ksi_p: + Xkl[k,l] = 1 + else: + Xkl[k,l] = 0 + + # Initialization for averaged spectrum (if cannot be defined) + if L>1: + averS = np.mean(np.squeeze(np.mean(Skl[:, O, :],1)),1) + else: + averS = np.mean(np.mean(Skl[:, O, :],1),1) + + + if kk == 0: #INITIALIZATION + if np.sum(Xkl[:]) > 0: # One or more spectra were peaked enough + + # Sum all peaky spectra + averS = np.zeros((f.shape[0], 1)) + for k in range(O.shape[0]): + for l in range(L): + if Xkl[k, l] == 1: + averS = averS + Skl[:, O[k], l] + + # Select the maximum in the spectrum + i_m = averS.argmax() + + # Save in vars + vars["bar_fr"][0] = f[i_m] + + else: # RE-INITIALIZATION + # One or more spectra were peaked enough + if np.sum(Xkl[:]) > 0: + # Sum all peaky spectra + averS = np.zeros(f.shape[0]) + for k in range(O.shape[0]): + for l in range(L): + if Xkl[k,l] == 1: + averS = averS + Skl[:, O[k], l] + + # Local maxima in the averaged spectrum + j_pk = find_peaks(averS) + j_pk = j_pk[0] + pk = averS[j_pk] + # Extra restriction : consider peaks with important power + # j_del = pk<0.5*np.max(averS) # IMPORTANTE TODO + j_del = pk<0.2*np.max(averS) + pk = pk[~j_del] + j_pk = j_pk[~j_del] + + # Cost function for deviation from 
previous fr and maximum power + C_a = 1-np.transpose(pk)/np.max(S) + fr_prev = vars["bar_fr"][np.max(kk,0)] + C_f = abs(f[j_pk[:]]-fr_prev)/(2*d) + # C_f = abs(f(i_pk(:))-fr_prev)/(Omega_r(2)-Omega(1)); + + C = C_a +C_f + try: + j_min = C.argmin() + fj = j_pk[j_min] + vars["bar_fr"][kk] = f[fj] + except: + vars["bar_fr"][kk] = 0 + # Save in vars + # vars["bar_fr"][kk] = f[fj] + + if plotflag: + plt.plot(f, averS) + plt.plot(f[fj], averS[fj], '-') + plt.title('Initialization - Averaged Spectrum') + plt.show() + + return vars + # # No spectra fulfill the initialization + # if plotflag: + # keyboard + +def compute_Xkl( Skl, f, bar_fr, O, ksi_p, ksi_a, d): + # function [ Xkl ] = compute_Xkl( Skl, f, bar_fr, O, ksi_p, ksi_a, d) + # Created by Spyros Kontaxis in 2019 + # Computation of peakedness for a power spectrum + # Sintax: [ Xkl ] = compute_Xkl( Skl, f, bar_fr, O, ksi_p, ksi_a, d) + # Inputs: + # Skl : Welch TF maps in a 3D matrix (f x t x DR signals) + # f : frequency vector (Hz) + # bar_fr : smoothed estimate of the respiratory rate (Hz) + # O : Indexes of original spectra that take part in the average + # ksi_p : peakedness threshold based on power concentration (%) + # ksi_a : peakedness threshold based on absolute maximum (%) + # d : half bandwith of Omega centered around bar_fr (Hz) + # Outputs: + # Xkl : 1-> the o:th spectrum will be used in the average + # 0-> the o:th spectrum will not be used in the average + # + + # % Define two search window arround the estimated respiratory rate + Omega = np.bitwise_and(f>=bar_fr-d, f<=bar_fr+d) + Omega_p = np.bitwise_and(f>=bar_fr-0.4*d, f<=bar_fr+0.4*d) + + # % Get the ammount of signals + L = Skl.shape[2] + + # % Pre-allocate + Xkl = np.zeros((O.shape[0],L)) + + # % Loop over all segments + for k in range(O.shape[0]): + + # % Loop over all signals + for l in range(L): + # % Select the power spectrum of one segment + S = Skl[:, O[k], l] + + # % Define peakedness based on the power concentration + Pkl = 
100*sum(S[Omega_p])/sum(S[Omega]) + + # % Define peakedness based on the absolute maximum + # print(max(S)) + Akl = 100*max(S[Omega])/max(S) + # % If the spectrum is concidered peaky by both conditions, mark as + # % peaky + if np.bitwise_and(Pkl >= ksi_p, Akl >= ksi_a): + Xkl[k,l] = 1 + else: + Xkl[k,l] = 0 + + return Xkl + +def compute_fJmin( S, f, bar_fr, d): + # function [ fJmin ] = compute_fJmin( S, f, bar_fr, d) + # Created by Spyros Kontaxis in 2019 + # Spectral peak selection based on cost function + # Sintax: [ fJmin ] = compute_fJmin( S, f, bar_fr, d) + # Inputs: + # S : Averaged Spectrum + # f : frequency vector (Hz) + # bar_fr : smoothed estimate of the respiratory rate (Hz) + # d : half bandwith of Omega centered around bar_fr (Hz) + # Outputs: + # fJmin : respiratory rate estimate + # + + # Define the search window + Omega = np.bitwise_and(f >= bar_fr-d, f <= bar_fr+d) + + # Pre-allocate + fJmin = np.nan + + # Locate peaks in the search window + [peaks, properties] = find_peaks(S[Omega]) #,'SortStr','descend' + + # Put the location in the correct perspective + lm = peaks + (Omega[:] ==1).argmax() + + # Select the frequency that corresponds to the location + fJ = f[lm] + + # print(len(lm)) + if len(lm) > 0: + # Compute the cost function for deviation from previous fr and maximum power + C_f = abs(fJ-bar_fr)/(2*d) + C_a = 1-S[lm]/max(S[Omega]) + + # Select the minimum cost + C = C_f+C_a + Jmin = C.argmin() + + # Store the frequency with the minimum cost + fJmin = fJ[Jmin] + + return fJmin + +def peakednessCost(signals, ts, fs, Setup = {}, title = "", storeGraph = False, subjet =1): + + vars = {} + # Set parameters / Arrange inputs + [ DT, Ts, Tm, Nfft, K, Omega_r, ksi_p, ksi_a, d, b, a,N_k, plotflag , Setup] = setParamFr(Setup) + + # Start the time stamps at zero + ts1 = ts[0] + ts = ts-ts1 + if type(signals) == type(pd.DataFrame()): + signals = signals.to_numpy() + + # Get the number of signals + if len(signals.shape) == 1: + signals = 
np.reshape(signals, (signals.shape[0],1)) + if signals.shape[0]= Omega_r[0], f < Omega_r[1]) + vars["f"] = f[f_ind] + + # Time vector for original Welch periodograms + # vars["t_orig"] = np.arange(Ts/2, ts[-1]+DT,DT) - Ts/2+DT #Es posible que sea esto lo que quieren pero no es lo que sale de MATLAB + vars["t_orig"] = np.arange(Ts/2, ts[-1]- Ts/2+DT,DT) #Esto es lo que sale de MATLAB + + # Pre-allocate + vars["Skl"] = np.empty((vars["f"].shape[0], vars["t_orig"].shape[0], vars["L"])) + vars["Skl"][:] = np.nan + + t_for1 = time() + for ii in range(vars["L"]): + t_for_L = time() + + # Select signal + signal = signals[:, ii] + # signal = np.reshape(signal, (signal.shape[0],1)) + # Compute the Welch Periodgrams + for k, ki in zip(vars["t_orig"], range(vars["t_orig"].shape[0])): + # Begin of Ts seconds interval + # Ws_begin = vars["t_orig"][k] - Ts/2 + Ws_begin = k - Ts/2 + + # End of Ts seconds interval + Ws_end = Ws_begin + Ts + [int_Ts_sig, int_Ts_t] = extract_interval(signal, ts, Ws_begin, Ws_end); # Ts seconds interval + S = np.zeros((vars["f"].shape[0])) + if int_Ts_sig.shape[0] < (Tm*100)/2: + vars["Skl"][:, ki, ii] = np.zeros((vars["f"].shape[0])) + continue + # Number of Tm length subintervals + NWm = int(np.floor(2*Ts/Tm)) + I=0 + + for i_Tm in range(NWm): + S_i = [] + + # Begin of Tm seconds interval + Wm_begin = Ws_begin + (i_Tm)*Tm/2 + + # End of Tm seconds interval + Wm_end = min(Wm_begin + Tm, Ws_end) + + # Tm seconds interval + [int_Tm_sig, int_Tm_t] = extract_interval(int_Ts_sig, int_Ts_t, Wm_begin, Wm_end) + + # Estimate the spectrum only for intervals without NaNs + if ~np.isnan((int_Ts_sig.astype(float))).any(): + S_i = abs(fftshift(fft(detrend(int_Tm_sig[:-1]), Nfft)))**2 + # S_i = abs(fftshift(fft(int_Tm_sig[:-1], Nfft)))**2 + S_i = S_i[f_ind] + [ S_i, f_PSD_norm, factor_norm ] = normalizar_PSD(S_i) + if ~np.isnan(S_i).any(): + S = S + (1/NWm)*S_i #TODO hacer una median real, que si uno falla la media se coja con los otros 3 dividido entre 3 + I=I+1 
+ + if I < 0.5*NWm : + vars["Skl"][:, ki, ii] = np.zeros((vars["f"].shape[0])) + else: + # Define the spectrum when enough subintervals were used + vars["Skl"][:, ki, ii] = S + + + + + ##### Peak-conditioned spectral average: ###### + # Pre-allocate + N = int(np.floor(K/2)) + vars["t_aver"] = vars["t_orig"][N:-N] + if vars["t_aver"].shape[0] == 0: + print("No hay tiempo para promediar") + return np.nan, np.nan, np.nan + vars["Sk"] = np.empty((vars["f"].shape[0], vars["t_aver"].shape[0])) + vars["Sk"][:] = np.nan + vars["bar_fr"] = np.empty(( vars["t_aver"].shape[0])) + vars["bar_fr"][:] = np.nan + vars["hat_fr"] = np.empty((vars["t_aver"].shape[0])) + vars["hat_fr"][:] = np.nan + vars["Naveraged"] = np.zeros((vars["t_aver"].shape[0])) + vars["used"] = np.zeros((vars["t_aver"].shape[0],vars["L"])) + vars["times_used"] = np.zeros((vars["t_orig"].shape[0],vars["L"])) + + # Call the initialization module + k_ini = 0 + plotFlag = False + # print(vars["t_aver"]) + vars = init_module(k_ini,vars,Setup,plotFlag); #bar_fr has been initialized + + for k in np.arange(k_ini, vars["t_aver"].shape[0]): + if k >= 1: + k_prev = k-1 + else: + k_prev = 0 + + # Re-initialization when hat_fr has not been defined for N_k time instants + N_k = 2#3+1#vars["N_k"] + N_prev = np.arange(k,max(k-N_k,-1),-1) + if np.isnan(vars["hat_fr"][N_prev]).all() and k > 2: + vars = init_module(k_prev,vars,Setup,plotFlag) # bar_fr has been re-initialized + + # Peakedness Analysis: + # Indexes of original spectra that take part in the average + O = np.bitwise_and(vars["t_orig"]>=vars["t_aver"][k]-N*DT, vars["t_orig"]<=vars["t_aver"][k]+N*DT) + W = np.arange(O.shape[0]) + O = W[O] + + # Compute the peakedness of the power spectrum (1 or 0) + Xkl = compute_Xkl(vars["Skl"], vars["f"], vars["bar_fr"][k_prev], O, ksi_p, ksi_a, d) + + + if np.sum(Xkl) == 0: # No spectrum was peaked + # Store the previous respiratory frequency + vars["bar_fr"][k] = vars["bar_fr"][k_prev] + + # Compute averaged spectrum just for 
visualization + if vars["L"]>1: + vars["Sk"][:, k] = np.mean(np.squeeze(np.mean(vars["Skl"][:, O, :],1)),1) + else: + try: + vars["Sk"][:,k] = np.mean(vars["Skl"][:, O, :],1)[:,0] + except: + print("Cogido en el except") + print("Cogido en el except") + print("Cogido en el except") + print("Cogido en el except") + vars["Sk"][:,k] = np.nan + + else: #One or more spectra were peaked enough + # Pre-allocate + averS = np.zeros((vars["f"].shape[0])) + + for i_Tm in range(O.shape[0]): + for ii in range(vars["L"]): + if Xkl[i_Tm,ii] == 1: # If this spectrum is considered peaky + # Sum all peaky spectra + averS = averS[:] + vars["Skl"][:, O[i_Tm], ii] + + # Store the nr of peaky spectra + vars["Naveraged"][k] = vars["Naveraged"][k] + 1 + vars["used"][k,ii] = 1 + + # Compute and store the averaged spectrum + vars["Sk"][:, k] = averS/vars["Naveraged"][k] + vars["times_used"][O,:] = vars["times_used"][O,:] + Xkl + + #Spectral peak selection + fJmin = compute_fJmin( vars["Sk"][:, k], vars["f"], vars["bar_fr"][k_prev], d) + + if ~np.isnan(fJmin).any(): # Local maxima inside Omega has been found + # Update bar_fr + + vars["bar_fr"][k] = b*vars["bar_fr"][k_prev] + (1-b)*fJmin + + # Update hat_fr + if ~np.isnan(vars["hat_fr"][k_prev]).any(): + vars["hat_fr"][k]= a*vars["hat_fr"][k_prev] + (1-a)*fJmin + else: + # Use bar_fr(k-1) that always is defined, instead of hat_fr(k-1) + vars["hat_fr"][k]= a*vars["bar_fr"][k_prev] + (1-a)*fJmin + + else: # No local maxima inside Omega + # Update bar_fr + vars["bar_fr"][k] = vars["bar_fr"][k_prev] + + # Don't Update hat_fr + + t_taver = time() + + # Extra : use bar_fr to update hat_fr when was not defined for small gaps (N_k) + # Beginning of the intervals + N_k = 0 + + int_b = np.argwhere(np.isnan(vars["hat_fr"])) + int_b1 = np.append(0,int_b) + int_b = int_b[np.diff(int_b1)>1] + + # End of the intervals + int_e = np.argwhere(np.isnan(vars["hat_fr"])) + int_e1 = np.append(int_e,np.inf) + int_e = int_e[np.diff(int_e1)>1] + + if 
np.isnan(vars["hat_fr"][0]) and int_e.shape[0]>1: + + int_e = int_e[1:] + try: + if (int_e[0]-int_b[0])[0] < 0: + int_e = int_e[1:] + except: + print("int vacio") + + int_small = (int_e-int_b)<=(N_k-1) + + int_b = int_b[int_small] + int_e = int_e[int_small] + for i in range(int_small.sum()): + vars["hat_fr"][int_b[i]:int_e[i]+1] = vars["hat_fr"][min(int_e[i]+1,vars["hat_fr"].shape[0])] + vars["bar_fr"][int_b[i]:int_e[i]+1] = vars["bar_fr"][min(int_e[i]+1,vars["hat_fr"].shape[0])] + + + + # # Total times a signal can be used + Ntotal = K*(vars["t_orig"].shape[0] - 2) + np.sum(np.arange(1,K)) + + # Times each signal is used + Nused = np.sum(vars["times_used"], 1) + vars["percentage_used"] = 100*Nused/Ntotal + + + vars["t_aver"] = vars["t_aver"] + ts1 + vars["t_orig"] = vars["t_orig"] + ts1 + t_fin = time() + + if plotflag: + + fig = subplots.make_subplots(rows=2,shared_xaxes=True, subplot_titles=('Peak-condition averaged EDR Spectra in '+title,"EDR/RESP signals"), row_heights=[0.7, 0.3]) + + fig.add_heatmap(x=vars["t_aver"], y=vars["f"], z=vars["Sk"]/np.max(vars["Sk"]),colorscale='jet',colorbar=dict(orientation='h')) + fig.update_layout(coloraxis_showscale=False) + fig.add_trace(go.Line(x=vars["t_aver"], y=vars["hat_fr"],name = 'f\u0302_r(k)'), row = 1, col=1) + fig.add_trace(go.Line(x=vars["t_aver"],y=vars["bar_fr"],name= 'f\u0304_r(k)'), row = 1, col=1) + + fig.add_trace(go.Line(x=vars["t_aver"],y=vars["used"]), row = 1, col=1) + # fig.axis([vars.t_aver(1), vars.t_aver(end), vars.f(1), vars.f(end)]) + for i in range(signals.shape[1]): + fig.add_trace(go.Line(x=ts+ts1,y=signals[:,i],name = 'Signal '+str(i)), row = 2, col=1) + + fig.update_layout(coloraxis_showscale=False) + fig.update_yaxes(title_text="f (Hz)", row=1, col=1) + fig.update_yaxes(title_text="(n.u.)", row=2, col=1) + fig.update_xaxes(title_text="time (s)", row=2, col=1) + if storeGraph: + os.makedirs("Graphs/Peakedness/"+str(subjet), exist_ok=True) + # fig.write_image(os.path.join("Graphs", 
"Peakedness",str(subjet),title+".png")) + fig.write_html(os.path.join("Graphs", "Peakedness",str(subjet),title+".html")) + # fig.write_image() + else: + fig.show() + + return vars["hat_fr"], vars["Sk"], vars["t_aver"] + # return vars["hat_fr"], vars["Sk"], vars["bar_fr"],vars["t_aver"], vars["f"], vars["used"] \ No newline at end of file diff --git a/src/scripts/EEG_Segmenting.py b/src/scripts/EEG_Segmenting.py new file mode 100644 index 0000000..6ae7406 --- /dev/null +++ b/src/scripts/EEG_Segmenting.py @@ -0,0 +1,193 @@ +from binascii import Error +import numpy as np +import pandas as pd +import sys +import os +import matplotlib.pyplot as plt +import plotly.express as px +import plotly.graph_objects as go +import scipy.signal +from plotly.subplots import make_subplots +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import lib.helper_code as helper_code +import lib.EEG_functions as EEG_functions + +for hospital in ['I0002','I0006', 'I0004','I0007','S0001']: + print(f"Procesando hospital: {hospital}") + + if hospital == 'I0002' or hospital == 'I0006' or hospital == "S0001": + datapath = 'data/training_set/Physiological_data/'+hospital + else: + datapath = 'data/supplementary_set/Physiological_data/'+hospital + + channels = pd.read_csv("notebooks/channel_table.csv") + selectEEG = channels[channels['Category'].isin(['eeg'])] + demographics = pd.read_csv(os.path.join('data/training_set', "demographics.csv")) + selectresp = channels[channels['Category'].isin(['resp'])] + selectECG = channels[channels['Category'].isin(['ecg'])] + + # Datos = pd.DataFrame(columns=['File', 'Channel', 'Sampling_Frequency', 'Duration_sec']) + lista_dir = os.listdir(datapath) + results = [] + + for file in lista_dir: + # Cargar el archivo (sustituye por tu ruta real) + edf = helper_code.edfio.read_edf(os.path.join(datapath, file)) + + id = file[9:-10] # Asumiendo que el ID es el nombre del archivo sin la extensión + selEEG = [] + selECG = [] + selResp = [] + 
labels = [] + data = [] + HayECG = False + for i, sig in enumerate(edf.signals): + for index in selectECG.index: + if sig.label.lower() in selectECG['Channel_Names'][index].lower(): + HayECG = True + print(f"Canal seleccionado: {sig.label}") + selECG.append([i,sig]) + labels.append(sig.label) + data.append(sig.data) # Guardar la señal ECG sin filtrar para su posterior procesamiento + break + + HayResp = False + for i, sig in enumerate(edf.signals): + for index in selectresp.index: + if sig.label.lower() in selectresp['Channel_Names'][index].lower(): + HayResp = True + fs = sig.sampling_frequency + if sig.label == "O2": + print(f"Warning: {sig.label} is detected as respiratory signal but has a sampling frequency higher than 100 Hz. Check the data.") + else: + print(f"Canal seleccionado: {sig.label}") + selResp.append([i,sig]) + labels.append(sig.label) + + data.append(sig.data) # Guardar la señal RESP sin filtrar para su posterior procesamiento + break + + # Listar canales para identificar los de interés (ej: C3-M2, O1-M2) + # print("Canales detectados:") + + HayEEG = False + for i, sig in enumerate(edf.signals): + # print(f"[{i}] {sig.label}") + # print length fs and duration + # print(f"Length: {len(sig.data)}, Sampling Frequency: {sig.sampling_frequency} Hz, Duration: {len(sig.data)/sig.sampling_frequency:.2f} seconds") + for index in selectEEG.index: + if sig.label.lower() in selectEEG['Channel_Names'][index].lower(): + print(f"Canal seleccionado: {sig.label}") + selEEG.append([i,sig]) + # labels.append(sig.label) + HayEEG = True + break + # for i in range(len(edf.signals)): + # print(f"Longitud: {edf.signals[i].data.shape}, Canal: {edf.signals[i].label}, Frecuencia de muestreo: {edf.signals[i].sampling_frequency} Hz, Duración: {len(edf.signals[i].data)/edf.signals[i].sampling_frequency:.2f} segundos") + + if HayEEG and HayECG and HayResp: + + Bipolar = pd.DataFrame() + if all(label in labels for label in ["F3", "F4", "M1", "M2"]): + Bipolar['F3-M2'] = 
edf.signals[edf.labels.index("F3")].data - edf.signals[edf.labels.index("M2")].data + Bipolar['F4-M1'] = edf.signals[edf.labels.index("F4")].data - edf.signals[edf.labels.index("M1")].data + if all(label in labels for label in ["C3", "C4", "M1", "M2"]): + Bipolar['C3-M2'] = edf.signals[edf.labels.index("C3")].data - edf.signals[edf.labels.index("M2")].data + Bipolar['C4-M1'] = edf.signals[edf.labels.index("C4")].data - edf.signals[edf.labels.index("M1")].data + if all(label in labels for label in ["O2", "O1", "M1", "M2"]): + Bipolar['O2-M2'] = edf.signals[edf.labels.index("O1")].data - edf.signals[edf.labels.index("M2")].data + Bipolar['O1-M1'] = edf.signals[edf.labels.index("O2")].data - edf.signals[edf.labels.index("M1")].data + + # print(f"Archivo {file} tiene ECG, RESP y EEG. Se procesará con canales bipolares.") + if not Bipolar.empty: + for col in Bipolar.columns: + # print(f"Archivo: {file}, Canal: {col}, Frecuencia de muestreo: {sig.sampling_frequency} Hz, Duración: {len(Bipolar[col])/sig.sampling_frequency:.2f} segundos") + fs = edf.signals[edf.labels.index("O2")].sampling_frequency # Asumimos que todos los canales tienen la misma frecuencia de muestreo + time = np.linspace(0, len(Bipolar[col]) / fs, len(Bipolar[col])) + fil = EEG_functions.butter_bandpass_filter(Bipolar[col], lowcut=0.3, highcut=35, fs=fs, order=4) + norm = (fil-np.mean(fil))/np.std(fil) + + data.append(norm) # Restar la media para centrar la señal + labels.append(col) + # columns = Bipolar.columns.tolist() + else: + for i, (idx, sig) in enumerate(selEEG): + # print(f"Archivo: {file}, Canal: {sig.label}, Frecuencia de muestreo: {sig.sampling_frequency} Hz, Duración: {len(sig.data)/sig.sampling_frequency:.2f} segundos") + fs = sig.sampling_frequency + time = np.linspace(0, len(sig.data) / fs, len(sig.data)) + fil = EEG_functions.butter_bandpass_filter(sig.data, lowcut=0.3, highcut=35, fs=fs, order=4) + norm = (fil-np.mean(fil))/np.std(fil) + labels.append(sig.label) + data.append(norm) # 
Restar la media para centrar la señal + + # columns = [selEEG[i][1].label for i in range(len(selEEG))] + + + # for i in range(len(selResp)): + # columns.append(selResp[i][1].label) + demographicsID = demographics[demographics['BDSPPatientID'] == int(id)] + print(demographicsID) + + columnashoras = [] + for elec in labels: + for h in np.floor(np.arange(0, len(sig.data) / fs / 3600, 1)): + columnashoras.append(elec + f"_h{int(h)}") + epochs5min = pd.DataFrame(columns= columnashoras) + + for i, elec in enumerate(labels): + # Check fs of the current channel + if elec == 'O2_resp': + fs = edf.signals[edf.labels.index('O2')].sampling_frequency + else: + fs = edf.signals[edf.labels.index(labels[i])].sampling_frequency + + if fs != 200: + # print(f"Warning: Sampling frequency for channel {elec} in file {file} is {fs} Hz, expected 200 Hz. Check the data.") + duration = len(data[i]) / fs + time_original = np.linspace(0, duration, len(data[i])) + + num_samples_target = int(duration * 200 ) + time_target = np.linspace(0, duration, num_samples_target) + data[i] = np.interp(time_target, time_original, data[i]) + fs = 200 # Update fs to the target sampling frequency after resampling + + # Plot comparison of original and resampled signals + # lim = 50000 + # factor = len(filtered_data[0]) / num_samples_target + # plt.figure(figsize=(12, 6)) + # plt.plot(time_target[:int(lim/factor)], resampled_data[:int(lim/factor)], label='Resampled Signal') + # plt.plot(time_original[:lim], filtered_data[i][:lim], label='Original Signal') + # plt.title(f'Original vs Resampled Signal - {elec} in {file}') + # plt.show() + + + epoch_length = 300 # Duración de cada época en segundos + # epochs = EEG_functions.create_epochs(df[elec].values, fs, epoch_duration=epoch_length) + epochs = EEG_functions.create_epochs(data[i], fs, epoch_duration=epoch_length) + + # Coger los primeros 5min de cada hora + for h in np.floor(np.arange(0, len(epochs)*epoch_length/3600, 1)): + start_epoch = 
int(h*3600/epoch_length) + end_epoch = int((h*3600 + 5*60)/epoch_length) + if end_epoch > len(epochs): + end_epoch = len(epochs) + c = elec+'_h'+str(int(h)) + epochs5min.loc[:, c] = epochs[start_epoch:end_epoch][0] + + + # del epochs, fs # Liberar memoria + + # Plotly sublot epochs5min.iloc[:,::8].plot() + # h = 6 + # fig = make_subplots(rows=int(epochs5min.shape[1]/8), cols=1, subplot_titles=epochs5min.columns[h::8]) + # for i in range(h, epochs5min.shape[1], 8): + # print(f"Plotting channel: {epochs5min.columns[i]}") + # fig.add_trace(go.Scatter(x=epochs5min.index, y=epochs5min.iloc[:,i], mode='lines', name=epochs5min.columns[i]), row=int(i/8)+1, col=1) + # fig.update_layout(height=3000, width=1200, title_text=f"Epochs de 5 minutos para el archivo {file}") + # fig.show() + if epochs5min.shape[1] > 0 and epochs5min.shape[0] == 60000: + epochs5min.to_parquet(os.path.join('X:/bsicos01/__comun/Physionet/Data5min', f"{id}.parquet")) + else: + print(f"Error: No channels in epochs5min for file {file}") + # stop program if no channels are processed + raise ValueError(f"No channels in epochs5min for file {file}") \ No newline at end of file diff --git a/src/scripts/EEG_processing.py b/src/scripts/EEG_processing.py new file mode 100644 index 0000000..127a32e --- /dev/null +++ b/src/scripts/EEG_processing.py @@ -0,0 +1,189 @@ +"""EEG_processing.py + +Este módulo contiene funciones para procesar datos EEG de los +hospitales incluidos en el desafío CincChallenge 2026. La principal +función definida es `MetricasHospitlal`, que recorre los archivos EDF +correspondientes a un hospital concreto, extrae las señales EEG, +las filtra, normaliza, crea épocas y calcula potencias de banda y +complejidades. Los resultados se guardan en un CSV resumen por +hospital. + +Características principales: + +- Soporta datos tanto del conjunto de entrenamiento como del + conjunto suplementario. +- Selección automática de canales EEG a partir de la tabla + `notebooks/channel_table.csv`. 
+- Creación de canales bipolares si están disponibles. +- Filtrado de banda 0.3-35 Hz y normalización de la señal. +- Re-muestreo a 200 Hz si fuese necesario. +- Cálculo de potencias de banda y complejidades usando + funciones auxiliares (`lib/EEG_functions.py`). +- Exportación de resultados en `results_summaryEEG_{hospital}.csv`. + +Uso típico: + +>>> from src.scripts.EEG_processing import MetricasHospitlal +>>> MetricasHospitlal('I0002') + +El módulo depende de `numpy`, `pandas`, `matplotlib`, `plotly` y de +las utilidades definidas en `lib/helper_code` y `lib/EEG_functions`. +""" + +import numpy as np +import pandas as pd +import sys +import os +import matplotlib.pyplot as plt +import plotly.express as px +import plotly.graph_objects as go +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import lib.helper_code as helper_code +import lib.EEG_functions as EEG_functions + +def MetricasHospitlal(hospital): + + print(f"Procesando hospital: {hospital}") + + if hospital == 'I0002' or hospital == 'I0006' or hospital == "S0001": + datapath = 'data/training_set/Physiological_data/'+hospital + else: + datapath = 'data/supplementary_set/Physiological_data/'+hospital + + channels = pd.read_csv("notebooks/channel_table.csv") + selectEEG = channels[channels['Category'].isin(['eeg'])] + + demographics = pd.read_csv(os.path.join('C:/BSICoS/CincChallenge2026/CincChallenge_2026/data/training_set', "demographics.csv")) + + # Datos = pd.DataFrame(columns=['File', 'Channel', 'Sampling_Frequency', 'Duration_sec']) + lista_dir = os.listdir(datapath) + results = [] + + for file in lista_dir: + # Cargar el archivo (sustituye por tu ruta real) + edf = helper_code.edfio.read_edf(os.path.join(datapath, file)) + + id = file[9:-10] # Asumiendo que el ID es el nombre del archivo sin la extensión + + selEEG = [] + labels = [] + data = [] + + # Listar canales para identificar los de interés (ej: C3-M2, O1-M2) + HayEEG = False + for i, sig in 
enumerate(edf.signals): + # print(f"[{i}] {sig.label}") + # print length fs and duration + # print(f"Length: {len(sig.data)}, Sampling Frequency: {sig.sampling_frequency} Hz, Duration: {len(sig.data)/sig.sampling_frequency:.2f} seconds") + for index in selectEEG.index: + if sig.label.lower() in selectEEG['Channel_Names'][index].lower(): + print(f"Canal seleccionado: {sig.label}") + selEEG.append([i,sig]) + labels.append(sig.label) + HayEEG = True + break + # for i in range(len(edf.signals)): + # print(f"Longitud: {edf.signals[i].data.shape}, Canal: {edf.signals[i].label}, Frecuencia de muestreo: {edf.signals[i].sampling_frequency} Hz, Duración: {len(edf.signals[i].data)/edf.signals[i].sampling_frequency:.2f} segundos") + + if HayEEG: + + Bipolar = pd.DataFrame() + if all(label in labels for label in ["F3", "F4", "M1", "M2"]): + Bipolar['F3-M2'] = edf.signals[edf.labels.index("F3")].data - edf.signals[edf.labels.index("M2")].data + Bipolar['F4-M1'] = edf.signals[edf.labels.index("F4")].data - edf.signals[edf.labels.index("M1")].data + labels2 = ['F3-M2', 'F4-M1'] + if all(label in labels for label in ["C3", "C4", "M1", "M2"]): + Bipolar['C3-M2'] = edf.signals[edf.labels.index("C3")].data - edf.signals[edf.labels.index("M2")].data + Bipolar['C4-M1'] = edf.signals[edf.labels.index("C4")].data - edf.signals[edf.labels.index("M1")].data + labels2.append('C3-M2') + labels2.append('C4-M1') + if all(label in labels for label in ["O2", "O1", "M1", "M2"]): + Bipolar['O2-M2'] = edf.signals[edf.labels.index("O1")].data - edf.signals[edf.labels.index("M2")].data + Bipolar['O1-M1'] = edf.signals[edf.labels.index("O2")].data - edf.signals[edf.labels.index("M1")].data + labels2.append('O1-M1') + labels2.append('O2-M2') + # print(f"Archivo {file} tiene ECG, RESP y EEG. 
Se procesará con canales bipolares.") + + if not Bipolar.empty: + labels = [] + for col in Bipolar.columns: + # print(f"Archivo: {file}, Canal: {col}, Frecuencia de muestreo: {sig.sampling_frequency} Hz, Duración: {len(Bipolar[col])/sig.sampling_frequency:.2f} segundos") + fs = edf.signals[edf.labels.index("M2")].sampling_frequency # Asumimos que todos los canales tienen la misma frecuencia de muestreo + time = np.linspace(0, len(Bipolar[col]) / fs, len(Bipolar[col])) + fil = EEG_functions.butter_bandpass_filter(Bipolar[col], lowcut=0.3, highcut=35, fs=fs, order=4) + norm = (fil-np.mean(fil))/np.std(fil) + + data.append(norm) # Restar la media para centrar la señal + labels.append(col) + # columns = Bipolar.columns.tolist() + else: + for i, (idx, sig) in enumerate(selEEG): + # print(f"Archivo: {file}, Canal: {sig.label}, Frecuencia de muestreo: {sig.sampling_frequency} Hz, Duración: {len(sig.data)/sig.sampling_frequency:.2f} segundos") + fs = sig.sampling_frequency + time = np.linspace(0, len(sig.data) / fs, len(sig.data)) + fil = EEG_functions.butter_bandpass_filter(sig.data, lowcut=0.3, highcut=35, fs=fs, order=4) + norm = (fil-np.mean(fil))/np.std(fil) + labels.append(sig.label) + data.append(norm) # Restar la media para centrar la señal + + # columns = [selEEG[i][1].label for i in range(len(selEEG))] + + + demographics = demographics[demographics['BDSPPatientID'] == int(id)] + print(demographics) + + for i, elec in enumerate(labels): + epoch_length = 30 # Duración de cada época en segundos + if Bipolar.empty: + fs = edf.signals[edf.labels.index(labels[i])].sampling_frequency + else: + fs = edf.signals[edf.labels.index('M1')].sampling_frequency + + if fs != 200: + # print(f"Warning: Sampling frequency for channel {elec} in file {file} is {fs} Hz, expected 200 Hz. 
Check the data.") + duration = len(data[i]) / fs + time_original = np.linspace(0, duration, len(data[i])) + + num_samples_target = int(duration * 200 ) + time_target = np.linspace(0, duration, num_samples_target) + data[i] = np.interp(time_target, time_original, data[i]) + fs = 200 # Update fs to the target sampling frequency after resampling + + epochs = EEG_functions.create_epochs(data[i], fs, epoch_duration=epoch_length) + + band_powers, complexities = EEG_functions.extract_band_powers(epochs, fs, win_len=15) + print(f"Band powers for file {file}:") + + band_powers = band_powers.iloc[60:] # Eliminar las primeras 60 épocas (30 min) para evitar el tiempo despierto al inicio de la grabación + print(band_powers.head()) + + + # # Convertir de formato "ancho" a "largo" para Plotly + # df_melted = band_powers.melt(var_name='Banda', value_name='Potencia') + # # Creamos el boxplot + # fig = px.box(df_melted, x='Banda', y='Potencia', + # color='Banda', + # points="outliers", # Para ver si hay épocas muy extrañas + # title=f"{id} - {elec} - {demographics.Cognitive_Impairment.values[0]}", + # log_y=True) # Usamos escala logarítmica porque Delta suele ser mucho más potente que Beta + + # fig.update_layout(template="plotly_white", showlegend=False) + # # fig.write_html(f"graphs/BandasPersona/{id}_{elec}_{demographics.Cognitive_Impairment.values[0]}.html") # Guardar como HTML para visualización interactiva + # fig.show() + + # Ejecución + patient_summar = EEG_functions.get_patient_profile(band_powers) + # print(f"Resumen del perfil del paciente {id} - {elec}:") + # print(patient_summar) + d = complexities.iloc[:].std().to_dict() + results.append({ + 'File': file, + 'Channel': elec, + 'Patient_ID': id, + **d, + **patient_summar + }) + df_results = pd.DataFrame(results) + print(df_results.head()) + return df_results + # df_results.to_csv(f"results_summaryEEG_{hospital}.csv", index=False) \ No newline at end of file diff --git a/src/scripts/Resp_processing.py 
b/src/scripts/Resp_processing.py new file mode 100644 index 0000000..dfe6ec2 --- /dev/null +++ b/src/scripts/Resp_processing.py @@ -0,0 +1,114 @@ +import numpy as np +import pandas as pd +import os +import matplotlib.pyplot as plt +import plotly.express as px +import sys + +import plotly.graph_objects as go +from plotly.subplots import make_subplots + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import lib.helper_code as helper_code +import lib.EEG_functions as EEG_functions +import lib.Resp_features as Resp_features + +for hospital in ['I0006',"S0001",'I0004','I0007']:#'I0002', + print(f"Procesando hospital: {hospital}") + + if hospital == 'I0002' or hospital == 'I0006' or hospital == "S0001": + datapath = 'data/training_set/Physiological_data/'+hospital + else: + datapath = 'data/supplementary_set/Physiological_data/'+hospital + + channels = pd.read_csv("notebooks/channel_table.csv") + selectResp = channels[channels['Category'].isin(['resp'])] + + demographics = pd.read_csv(os.path.join('C:/BSICoS/CincChallenge2026/CincChallenge_2026/data/training_set', "demographics.csv")) + + # Datos = pd.DataFrame(columns=['File', 'Channel', 'Sampling_Frequency', 'Duration_sec']) + lista_dir = os.listdir(datapath) + results = [] + + for file in lista_dir: + # Cargar el archivo (sustituye por tu ruta real) + edf = helper_code.edfio.read_edf(os.path.join(datapath, file)) + + id = file[9:-10] # Asumiendo que el ID es el nombre del archivo sin la extensión + + selResp = [] + labels = [] + data = [] + + HayResp = False + for i, sig in enumerate(edf.signals): + for index in selectResp.index: + if sig.label.lower() in selectResp['Channel_Names'][index].lower(): + print(f"Canal seleccionado: {sig.label}") + selResp.append([i,sig]) + labels.append(sig.label) + HayResp = True + # plot en plotly la señal + go.Figure(data=go.Scattergl(x=np.arange(len(sig.data))/sig.sampling_frequency, y=sig.data, mode='lines', 
name=sig.label)).update_layout(title=f"Señal de {sig.label} - Archivo: {file}", xaxis_title="Tiempo (s)", yaxis_title="Amplitud").show() + # px.line(x=np.arange(len(sig.data))/sig.sampling_frequency, y=sig.data, title=f"Señal de {sig.label} - Archivo: {file}").show() + break + + if HayResp: + for i, (idx, sig) in enumerate(selResp): + print(f"Archivo: {file}, Canal: {sig.label}, Frecuencia de muestreo: {sig.sampling_frequency} Hz, Duración: {len(sig.data)/sig.sampling_frequency:.2f} segundos") + fs = sig.sampling_frequency + + if fs != 25: + duration = len(sig.data) / fs + time_original = np.linspace(0, duration, len(sig.data)) + num_samples_target = int(duration * 25 ) + time_target = np.linspace(0, duration, num_samples_target) + data = np.interp(time_target, time_original, sig.data) + fs = 25 # Update fs to the target sampling frequency after resampling + else: + data = sig.data + time_new = np.linspace(0, len(sig.data) / fs, len(sig.data)) + + # Check nan in sig.data + if np.isnan(sig.data).any(): + print(f"Warning: NaN values found in signal data for {sig.label}. 
Filling NaNs with zeros.") + data = np.nan_to_num(data) + + # if sig.label not in ["SpO2", "SaO2", "OSAT", "O2SAT", "O2 SAT", "O2-SAT", "O2-SATURATION"]: + # fil = EEG_functions.butter_bandpass_filter(data, lowcut=0.01, highcut=4, fs=fs, order=4) + # # norm = (fil-np.mean(fil))/np.std(fil) + # data.append(fil) # Restar la media para centrar la señal + + if sig.label.lower() in selectResp['Channel_Names'][28].lower() or sig.label.lower() in selectResp['Channel_Names'][29].lower(): + # EFFORT RESPIRATORY + elif sig.label.lower() in selectResp['Channel_Names'][30].lower() or sig.label.lower() in selectResp['Channel_Names'][31].lower(): + # RESPIRATORY Flujo + fil = EEG_functions.butter_bandpass_filter(data, lowcut=0.01, highcut=4, fs=fs, order=4) + Resp_features.peakedness_application(fil, stage=sig.label, plotflag = True, subjet =1) + elif sig.label.lower() in selectResp['Channel_Names'][32].lower() or sig.label.lower() in selectResp['Channel_Names'][33].lower(): + # CEPAP + elif sig.label.lower() in selectResp['Channel_Names'][34].lower(): + #O2 SATURATION + + + + # time_dt = pd.to_datetime(time_new, unit='s') + # # Plot raw and filtered signals + # fig = make_subplots(specs=[[{"secondary_y": True}]]) + # fig.add_trace(go.Scattergl(x=time_dt[::10], y=data[::10], name=sig.label, mode='lines'),secondary_y=False,row=1, col=1) + # fig.add_trace(go.Scattergl(x=time_dt[::10], y=fil[::10], name=f"Normalized {sig.label}", mode='lines'), secondary_y=True,row=1, col=1) + # fig.update_yaxes(title_text="Amplitud Original (uV)", secondary_y=False) + # fig.update_yaxes(title_text="Valor Normalizado (Z-score)", secondary_y=True) + # # update x axis to make time format + # fig.update_xaxes( + # tickformat="%H:%M:%S", # Formato de hora:minuto:segundo + # row=1, col=1 + # ) + # fig.show() + + # Plot spectrogram of raw and filtered signals + # fig = make_subplots(specs=[[{"secondary_y": True}]]) + # fig.add_trace(go.Scattergl(x=time_dt[::10], y=data[::10], name=sig.label, 
mode='lines'),secondary_y=False,row=1, col=1) + # fig.add_trace(go.Scattergl(x=time_dt[::10], y=fil[::10], name=f"Normalized {sig.label}", mode='lines'), secondary_y=True,row=1, col=1) + # fig.update_yaxes(title_text="Amplitud Original (uV)", secondary_y=False) + diff --git a/src/scripts/ResultsAnalysis.py b/src/scripts/ResultsAnalysis.py new file mode 100644 index 0000000..c2fef8b --- /dev/null +++ b/src/scripts/ResultsAnalysis.py @@ -0,0 +1,67 @@ +import numpy as np +import pandas as pd +import sys +import os +import plotly.express as px + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import lib.EEG_functions as EEG_functions +import seaborn as sns +import matplotlib.pyplot as plt +import plotly.express as px + +hospital = ["I0006","I0002","I0004","I0007", "S0001"] +results = pd.DataFrame() +for h in hospital: + results = pd.concat([results, pd.read_csv(f"results_summaryEEG_{h}.csv")], ignore_index=True) + +demographics = pd.read_csv(os.path.join('C:/BSICoS/CincChallenge2026/CincChallenge_2026/data/training_set', "demographics.csv")) +demographics = pd.concat([demographics, pd.read_csv(os.path.join('C:/BSICoS/CincChallenge2026/CincChallenge_2026/data/supplementary_set', "demographics.csv"))], ignore_index=True) + +for index, row in results.iterrows(): + patient_id = row['Patient_ID'] + hospital_id = row['File'][4:9] # Asumiendo que los primeros 5 caracteres del nombre del archivo indican el hospital + demographics_row = demographics[(demographics['BDSPPatientID'] == patient_id) & (demographics['SiteID'] == hospital_id)] + if not demographics_row.empty: + cognitive_impairment = demographics_row['Cognitive_Impairment'].values[0] + time_to_event = demographics_row['Time_to_Event'].values[0] + results.at[index, 'Hospital'] = hospital_id + results.at[index, 'CognitiveImpairment'] = cognitive_impairment + results.at[index, 'Time_to_Event'] = time_to_event + else: + results.at[index, 'Hospital'] = hospital_id + results.at[index, 
'CognitiveImpairment'] = np.nan # O cualquier valor que indique que no se encontró información + results.at[index, 'Time_to_Event'] = np.nan + +df = pd.DataFrame(results) +# Agrupar por electrodo +for elec in results['Channel'].unique(): + subset = results[results['Channel'] == elec] + + print(subset.Hospital.unique()) + # Hacer un boxplot de cada característica que separe entre pacientes con congnitive impairment y sin él + for col in subset.columns[3:-2]: + print(col) + + fig = px.box(subset, + x='CognitiveImpairment', + y=col, + color='CognitiveImpairment', + notched=True, + points="all", + hover_data=['Patient_ID', 'Channel'], + title=f"{elec} - Comparativa de {col} según Estado Cognitivo") + + fig.update_layout(template="plotly_white") + fig.write_html(f"graphs/ComparativaCognitiveImpairment/2segundos/PorHospital/html/{elec}_{col}.html") # Guardar como HTML para visualización interactiva + # fig.delete_traces([0]) # Eliminar la leyenda para que no se repita en cada gráfico + + # Generar el boxplot con Seaborn (Extremadamente rápido) + plt.figure(figsize=(10, 6)) + sns.boxplot(data=df, x='CognitiveImpairment', y=col, hue='CognitiveImpairment', notch=True) + sns.stripplot(data=df, x='CognitiveImpairment', y=col, color="black", alpha=0.3, size=3) # Equivalente a points="all" + + plt.title(f"Comparativa {elec} - {col}") + plt.savefig(f"graphs/ComparativaCognitiveImpairment/2segundos/PorHospital/{hospital_id}/png/{elec}_{col}.png", dpi=100) + # \graphs\ComparativaCognitiveImpairment\2segundos\PorHospital\I0006\png + plt.close() # ¡Importante! 
Para no saturar la memoria RAM \ No newline at end of file From 02a8229b006227596f5530bc74cc34b2b05dcfbb Mon Sep 17 00:00:00 2001 From: dcajal Date: Mon, 2 Mar 2026 18:02:19 +0100 Subject: [PATCH 05/38] Visual noise corrected --- src/scripts/{ResultsAnalysis.py => results_analysis.py} | 0 src/scripts/{EEG_Segmenting.py => segmentation.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename src/scripts/{ResultsAnalysis.py => results_analysis.py} (100%) rename src/scripts/{EEG_Segmenting.py => segmentation.py} (100%) diff --git a/src/scripts/ResultsAnalysis.py b/src/scripts/results_analysis.py similarity index 100% rename from src/scripts/ResultsAnalysis.py rename to src/scripts/results_analysis.py diff --git a/src/scripts/EEG_Segmenting.py b/src/scripts/segmentation.py similarity index 100% rename from src/scripts/EEG_Segmenting.py rename to src/scripts/segmentation.py From 0484d8ac91f30aa1045c5d80eba4e7c2a423472b Mon Sep 17 00:00:00 2001 From: dcajal Date: Tue, 3 Mar 2026 13:08:14 +0100 Subject: [PATCH 06/38] Refactor --- src/{scripts/EEG_processing.py => eeg_processing.py} | 0 src/{scripts/Resp_processing.py => resp_processing.py} | 0 src/{scripts => }/results_analysis.py | 0 src/{scripts => }/segmentation.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename src/{scripts/EEG_processing.py => eeg_processing.py} (100%) rename src/{scripts/Resp_processing.py => resp_processing.py} (100%) rename src/{scripts => }/results_analysis.py (100%) rename src/{scripts => }/segmentation.py (100%) diff --git a/src/scripts/EEG_processing.py b/src/eeg_processing.py similarity index 100% rename from src/scripts/EEG_processing.py rename to src/eeg_processing.py diff --git a/src/scripts/Resp_processing.py b/src/resp_processing.py similarity index 100% rename from src/scripts/Resp_processing.py rename to src/resp_processing.py diff --git a/src/scripts/results_analysis.py b/src/results_analysis.py similarity index 100% rename from src/scripts/results_analysis.py 
rename to src/results_analysis.py diff --git a/src/scripts/segmentation.py b/src/segmentation.py similarity index 100% rename from src/scripts/segmentation.py rename to src/segmentation.py From 9a73d0d2d40d71b14377c8ff730f9c2d8da7f527 Mon Sep 17 00:00:00 2001 From: dcajal Date: Tue, 3 Mar 2026 13:08:34 +0100 Subject: [PATCH 07/38] Add scripts for creating and managing smoke training datasets --- scripts/create_smoke.ps1 | 41 +++++++++++++ scripts/run.ps1 | 123 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 scripts/create_smoke.ps1 create mode 100644 scripts/run.ps1 diff --git a/scripts/create_smoke.ps1 b/scripts/create_smoke.ps1 new file mode 100644 index 0000000..bd39bcd --- /dev/null +++ b/scripts/create_smoke.ps1 @@ -0,0 +1,41 @@ +# ============================================ +# Create smoke training dataset +# ============================================ + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# IMPORTANT: +# Each team member must modify this path to +# match their local dataset location. +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +$FULL_DATA_PATH = "data/training_set" # <-- CHANGE THIS IF NEEDED + +$SMOKE_PATH = "data/training_smoke" +$N_RECORDS = 5 + +Write-Host "Creating smoke dataset..." 
+Write-Host "Source: $FULL_DATA_PATH" +Write-Host "Destination: $SMOKE_PATH" + +Remove-Item -Recurse -Force $SMOKE_PATH -ErrorAction SilentlyContinue +New-Item -ItemType Directory -Force -Path $SMOKE_PATH | Out-Null + +# Copy demographics +Copy-Item "$FULL_DATA_PATH/demographics.csv" "$SMOKE_PATH/demographics.csv" + +# Select first N EDF files +$edfs = Get-ChildItem "$FULL_DATA_PATH/physiological_data" -Recurse -Filter *.edf | + Sort-Object FullName | + Select-Object -First $N_RECORDS + +foreach ($f in $edfs) { + $rel = $f.FullName.Substring((Resolve-Path $FULL_DATA_PATH).Path.Length).TrimStart('\') + $target = Join-Path $SMOKE_PATH $rel + New-Item -ItemType Directory -Force -Path (Split-Path $target) | Out-Null + Copy-Item $f.FullName $target +} + +# Copy full annotation folders (simpler and robust) +Copy-Item "$FULL_DATA_PATH/algorithmic_annotations" "$SMOKE_PATH/algorithmic_annotations" -Recurse -ErrorAction SilentlyContinue +Copy-Item "$FULL_DATA_PATH/human_annotations" "$SMOKE_PATH/human_annotations" -Recurse -ErrorAction SilentlyContinue + +Write-Host "Smoke dataset created successfully." \ No newline at end of file diff --git a/scripts/run.ps1 b/scripts/run.ps1 new file mode 100644 index 0000000..de6122d --- /dev/null +++ b/scripts/run.ps1 @@ -0,0 +1,123 @@ +param( + [Parameter(Mandatory=$true)] + [ValidateSet("build","smoke","train","train-smoke","run","run-smoke","clean")] + [string]$Command +) + +# ============================================ +# CONFIGURATION +# ============================================ + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# IMPORTANTE: +# Si tu dataset no está en data/training_set, +# modifica esta ruta. 
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +$FULL_DATA="data/training_set" + +$SMOKE_DATA="data/training_smoke" + +$IMAGE_NAME="cinc2026" +$MODEL_FULL="model" +$MODEL_SMOKE="model_smoke" +$OUT_FULL="outputs" +$OUT_SMOKE="outputs_smoke" + +# ============================================ +# FUNCTIONS +# ============================================ + +function Build-Image { + docker build -t $IMAGE_NAME . +} + +function Create-Smoke { + Write-Host "Creating smoke dataset..." + powershell -ExecutionPolicy Bypass -File scripts/create_smoke.ps1 +} + +function Train-Full { + New-Item -ItemType Directory -Force -Path $MODEL_FULL | Out-Null + + docker run --rm ` + -v "${FULL_DATA}:/challenge/training_data:ro" ` + -v "${PWD}/${MODEL_FULL}:/challenge/model" ` + $IMAGE_NAME ` + python train_model.py -d training_data -m model -v +} + +function Train-Smoke { + New-Item -ItemType Directory -Force -Path $MODEL_SMOKE | Out-Null + + docker run --rm ` + -v "${SMOKE_DATA}:/challenge/training_data:ro" ` + -v "${PWD}/${MODEL_SMOKE}:/challenge/model" ` + $IMAGE_NAME ` + python train_model.py -d training_data -m model -v +} + +function Run-Full { + New-Item -ItemType Directory -Force -Path $OUT_FULL | Out-Null + + docker run --rm ` + -v "${FULL_DATA}:/challenge/holdout_data:ro" ` + -v "${PWD}/${MODEL_FULL}:/challenge/model:ro" ` + -v "${PWD}/${OUT_FULL}:/challenge/holdout_outputs" ` + $IMAGE_NAME ` + python run_model.py -d holdout_data -m model -o holdout_outputs -v +} + +function Run-Smoke { + New-Item -ItemType Directory -Force -Path $OUT_SMOKE | Out-Null + + docker run --rm ` + -v "${SMOKE_DATA}:/challenge/holdout_data:ro" ` + -v "${PWD}/${MODEL_SMOKE}:/challenge/model:ro" ` + -v "${PWD}/${OUT_SMOKE}:/challenge/holdout_outputs" ` + $IMAGE_NAME ` + python run_model.py -d holdout_data -m model -o holdout_outputs -v +} + +function Clean-All { + Remove-Item -Recurse -Force $MODEL_FULL -ErrorAction SilentlyContinue + Remove-Item -Recurse -Force $MODEL_SMOKE -ErrorAction 
SilentlyContinue + Remove-Item -Recurse -Force $OUT_FULL -ErrorAction SilentlyContinue + Remove-Item -Recurse -Force $OUT_SMOKE -ErrorAction SilentlyContinue + Write-Host "Cleaned model and output folders." +} + +# ============================================ +# COMMAND SWITCH +# ============================================ + +switch ($Command) { + + "build" { + Build-Image + } + + "smoke" { + Create-Smoke + } + + "train" { + Train-Full + } + + "train-smoke" { + Train-Smoke + } + + "run" { + Run-Full + } + + "run-smoke" { + Run-Smoke + } + + "clean" { + Clean-All + } + +} \ No newline at end of file From de17158bf7ff541ede1301d5af81b5bfbeecbe5e Mon Sep 17 00:00:00 2001 From: dcajal Date: Tue, 3 Mar 2026 13:08:45 +0100 Subject: [PATCH 08/38] Add documentation for project overview, Docker usage, smoke dataset, and unified run script --- docs/01_overview.md | 35 ++++++++++++ docs/02_docker.md | 53 ++++++++++++++++++ docs/03_smoke_dataset.md | 51 +++++++++++++++++ docs/04_run_script.md | 118 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 257 insertions(+) create mode 100644 docs/01_overview.md create mode 100644 docs/02_docker.md create mode 100644 docs/03_smoke_dataset.md create mode 100644 docs/04_run_script.md diff --git a/docs/01_overview.md b/docs/01_overview.md new file mode 100644 index 0000000..9805383 --- /dev/null +++ b/docs/01_overview.md @@ -0,0 +1,35 @@ +# CINC 2026 – Visión General del Proyecto + +Estamos participando en el Challenge 2026 de Computing in Cardiology. + +El objetivo es predecir deterioro cognitivo a partir de datos de polisomnografía (PSG). + +## Cómo nos evaluarán + +La organización: + +1. Construirá nuestra imagen Docker. +2. Ejecutará `train_model.py`. +3. Ejecutará `run_model.py`. +4. Evaluará las predicciones generadas. + +Por tanto, la reproducibilidad mediante Docker es obligatoria. + +Nuestro objetivo es garantizar que: +- El código se ejecuta sin intervención manual. +- El modelo se entrena correctamente. 
+- Las predicciones se generan en el formato requerido. + +## Qué se puede modificar y qué no + +❌ No modificar + +- `train_model.py` +- `run_model.py` +- `helper_code.py` +- `evaluate_model.py` + +✅ Modificar/Añadir + +- `team_code.py` <-- Toda la lógica científica y de modelado debe implementarse ahí. +- Helpers, scripts, métodos: añadir a voluntad en `src/` \ No newline at end of file diff --git a/docs/02_docker.md b/docs/02_docker.md new file mode 100644 index 0000000..eca7735 --- /dev/null +++ b/docs/02_docker.md @@ -0,0 +1,53 @@ +# Uso de Docker + +## Requisitos + +- Docker Desktop instalado (modo Linux containers) +- Dataset descargado desde Kaggle +- Se asume que el dataset está en: data/training_set/ + +Cada miembro del equipo puede tener el dataset en una ubicación diferente, pero en este repositorio asumimos que está dentro de `data/`. + +--- + +## Construir la imagen + +Desde la raíz del repositorio: + +```powershell +docker build -t cinc2026 . +``` + +## Entenar con el dataset completo + +```powershell +$DATA="data/training_set" +$MODEL="$PWD/model" + +docker run --rm ` + -v "${DATA}:/challenge/training_data:ro" ` + -v "${MODEL}:/challenge/model" ` + cinc2026 ` + python train_model.py -d training_data -m model -v +``` + +## Generar predicciones + +```powershell +$OUT="$PWD/outputs" + +docker run --rm ` + -v "${DATA}:/challenge/holdout_data:ro" ` + -v "${MODEL}:/challenge/model:ro" ` + -v "${OUT}:/challenge/holdout_outputs" ` + cinc2026 ` + python run_model.py -d holdout_data -m model -o holdout_outputs -v +``` + +## Resultado esperado + +En la carpeta `outputs/` se generará un `demographics.csv` con: + +- Columnas originales +- Cognitive_Impairment +- Cognitive_Impairment_Probability \ No newline at end of file diff --git a/docs/03_smoke_dataset.md b/docs/03_smoke_dataset.md new file mode 100644 index 0000000..a334088 --- /dev/null +++ b/docs/03_smoke_dataset.md @@ -0,0 +1,51 @@ +# Dataset Smoke (Desarrollo Rápido) + +Entrenar con el dataset completo 
tarda aproximadamente 30–40 minutos con el modelo de ejemplo. + +Para desarrollo utilizamos un dataset reducido (5 sujetos). + +--- + +## Crear dataset smoke + +Ejecutar: + +```powershell +powershell -ExecutionPolicy Bypass -File scripts/create_smoke.ps1 +``` + +Esto generará: `data/training_smoke/` + +## Entrenar con el smoke dataset + +```powershell +$DATA="data/training_smoke" +$MODEL="$PWD/model_smoke" + +docker run --rm ` + -v "${DATA}:/challenge/training_data:ro" ` + -v "${MODEL}:/challenge/model" ` + cinc2026 ` + python train_model.py -d training_data -m model -v +``` + +## Generar predicciones con smoke dataset + +```powershell +$OUT="$PWD/outputs_smoke" + +docker run --rm ` + -v "${DATA}:/challenge/holdout_data:ro" ` + -v "${MODEL}:/challenge/model:ro" ` + -v "${OUT}:/challenge/holdout_outputs" ` + cinc2026 ` + python run_model.py -d holdout_data -m model -o holdout_outputs -v +``` + +## ¿Cuándo usar smoke? + +- Desarrollo de nuevas features +- Comprobación rápida de que el código no rompe +- Validación de cambios en team_code.py + +Nunca usar smoke para evaluar rendimiento final. \ No newline at end of file diff --git a/docs/04_run_script.md b/docs/04_run_script.md new file mode 100644 index 0000000..694c4f3 --- /dev/null +++ b/docs/04_run_script.md @@ -0,0 +1,118 @@ +# Script unificado de ejecución (`run.ps1`) + +Para simplificar el trabajo del equipo hemos creado un único script que encapsula todos los comandos necesarios para: + +- Construir la imagen Docker +- Crear el dataset smoke +- Entrenar (completo o smoke) +- Generar predicciones +- Limpiar artefactos + +--- + +## Requisitos + +- Docker Desktop instalado +- Dataset descargado en: `data/training_set/` + + +⚠️ Si el dataset está en otra ubicación, modificar la variable `$FULL_DATA` dentro de `run.ps1`. 
+ +--- + +# Comandos disponibles + +Desde la raíz del repositorio: + +## 1️⃣ Construir la imagen Docker + +```powershell +.\run.ps1 build +``` + +Solo es necesario hacerlo: + +- La primera vez +- Cuando cambien dependencias o el Dockerfile + +## 2️⃣ Crear dataset smoke (5 sujetos) + +```powershell +.\run.ps1 smoke +``` + +Genera: `data/training_smoke/` + +Este dataset se usa para desarrollo rápido. + +## 3️⃣ Entrenar modelo + +### Smoke (rápido) + +```powershell +.\run.ps1 train-smoke +``` + + +### Completo + +```powershell +.\run.ps1 train +``` + +El modelo se guarda en: + +- model/ (full) +- model_smoke/ (smoke) + +## 4️⃣ Generar predicciones + +### Smoke + +```powershell +.\run.ps1 run-smoke +``` + +### Completo + +```powershell +.\run.ps1 run +``` + +Los resultados se generan en: + +- outputs/ +- outputs_smoke/ + +El archivo clave es: `demographics.csv` que contiene las predicciones añadidas. + +## 5️⃣ Limpiar artefactos + +```powershell +.\run.ps1 clean +``` + +Elimina: + +- model/ +- model_smoke/ +- outputs/ +- outputs_smoke/ + +No elimina datasets. 
+ +# Flujo recomendado para desarrollo + +```powershell +.\run.ps1 build +.\run.ps1 smoke +.\run.ps1 train-smoke +.\run.ps1 run-smoke +``` + +Solo cuando el modelo esté estable: + +```powershell +.\run.ps1 train +.\run.ps1 run +``` \ No newline at end of file From fb6216cca9fef188cb66b7617d88631370e608be Mon Sep 17 00:00:00 2001 From: dcajal Date: Tue, 3 Mar 2026 13:25:43 +0100 Subject: [PATCH 09/38] Add .dockerignore file to exclude datasets, artifacts, and OS/IDE files --- .dockerignore | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d8e589d --- /dev/null +++ b/.dockerignore @@ -0,0 +1,23 @@ +# Datasets +data/ +**/training_set/ +**/supplementary_set/ +**/*.edf + +# Artifacts +model/ +model_smoke/ +model_full_smoke/ +outputs/ +outputs_smoke/ +__pycache__/ +*.pyc +*.pkl +*.sav +*.joblib + +# OS / IDE +.DS_Store +Thumbs.db +.vscode/ +.idea/ \ No newline at end of file From 689fd97c9f55421d76cbd425df9d6b0866d817a1 Mon Sep 17 00:00:00 2001 From: dcajal Date: Tue, 3 Mar 2026 13:29:29 +0100 Subject: [PATCH 10/38] Update .gitignore to include datasets, model artifacts, and output directories --- .gitignore | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f298c64..7cc07cb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,29 @@ +# Dataset +data/ + +# Model artifacts +model/ +model_smoke/ +*.pkl +*.sav +*.joblib + +# Outputs +outputs/ +outputs_smoke/ + +# Python +__pycache__/ +*.pyc + +# OS +.DS_Store +Thumbs.db + +# IDE +.vscode/ +.idea/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[codz] @@ -49,7 +75,6 @@ coverage.xml .hypothesis/ .pytest_cache/ cover/ -data/ graphs/ graphs From d3a10dbcb9786b8de71e2dd9553a4cb4b89831b6 Mon Sep 17 00:00:00 2001 From: dcajal Date: Tue, 3 Mar 2026 13:47:18 +0100 Subject: [PATCH 11/38] Fix paths for smoke dataset training in 
documentation --- docs/03_smoke_dataset.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/03_smoke_dataset.md b/docs/03_smoke_dataset.md index a334088..f91249d 100644 --- a/docs/03_smoke_dataset.md +++ b/docs/03_smoke_dataset.md @@ -19,8 +19,8 @@ Esto generará: `data/training_smoke/` ## Entrenar con el smoke dataset ```powershell -$DATA="data/training_smoke" -$MODEL="$PWD/model_smoke" +$DATA="$PWD\data\training_smoke" +$MODEL="$PWD\model_smoke" docker run --rm ` -v "${DATA}:/challenge/training_data:ro" ` From 2f63033b3cd0145a60b8a5873f77f555871f208d Mon Sep 17 00:00:00 2001 From: dcajal Date: Tue, 3 Mar 2026 13:49:42 +0100 Subject: [PATCH 12/38] Add progress bar functionality and print filtering for model execution --- team_code.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/team_code.py b/team_code.py index a055222..e8152a8 100644 --- a/team_code.py +++ b/team_code.py @@ -12,6 +12,9 @@ import joblib import numpy as np import os +import atexit +import builtins +import re from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor import sys from tqdm import tqdm @@ -28,6 +31,46 @@ # Build the absolute path to the CSV file relative to the script location DEFAULT_CSV_PATH = os.path.join(SCRIPT_DIR, 'channel_table.csv') +# Progress bar state for run_model (initialized lazily) +RUN_MODEL_PBAR = None +RUN_MODEL_PBAR_TOTAL = None +ORIGINAL_PRINT = builtins.print +PRINT_FILTER_ACTIVE = False +RUN_PROGRESS_LINE_RE = re.compile(r'^-\s+\d+/\d+:\s') + + +def _close_run_model_pbar(): + global RUN_MODEL_PBAR + if RUN_MODEL_PBAR is not None: + RUN_MODEL_PBAR.close() + RUN_MODEL_PBAR = None + + +def _install_run_print_filter(): + global PRINT_FILTER_ACTIVE + if PRINT_FILTER_ACTIVE: + return + + def _filtered_print(*args, **kwargs): + message = kwargs.get('sep', ' ').join(str(a) for a in args) if args else '' + if RUN_PROGRESS_LINE_RE.match(message): + return + return 
ORIGINAL_PRINT(*args, **kwargs) + + builtins.print = _filtered_print + PRINT_FILTER_ACTIVE = True + + +def _restore_print(): + global PRINT_FILTER_ACTIVE + if PRINT_FILTER_ACTIVE: + builtins.print = ORIGINAL_PRINT + PRINT_FILTER_ACTIVE = False + + +atexit.register(_close_run_model_pbar) +atexit.register(_restore_print) + ################################################################################ # @@ -150,6 +193,9 @@ def train_model(data_folder, model_folder, verbose, csv_path=DEFAULT_CSV_PATH): # Load your trained models. This function is *required*. You should edit this function to add your code, but do *not* change the # arguments of this function. If you do not train one of the models, then you can return None for the model. def load_model(model_folder, verbose): + if verbose: + _install_run_print_filter() + model_filename = os.path.join(model_folder, 'model.sav') model = joblib.load(model_filename) return model @@ -157,6 +203,8 @@ def load_model(model_folder, verbose): # Run your trained model. This function is *required*. You should edit this function to add your code, but do *not* change the # arguments of this function. def run_model(model, record, data_folder, verbose): + global RUN_MODEL_PBAR, RUN_MODEL_PBAR_TOTAL + # Load the model. model = model['model'] @@ -165,6 +213,27 @@ def run_model(model, record, data_folder, verbose): site_id = record[HEADERS['site_id']] session_id = record[HEADERS['session_id']] + # Initialize tqdm progress bar lazily so it advances across run_model calls. 
+ if verbose and RUN_MODEL_PBAR is None: + patient_data_file = os.path.join(data_folder, DEMOGRAPHICS_FILE) + try: + RUN_MODEL_PBAR_TOTAL = len(find_patients(patient_data_file)) + except Exception: + RUN_MODEL_PBAR_TOTAL = None + + RUN_MODEL_PBAR = tqdm( + total=RUN_MODEL_PBAR_TOTAL, + desc="Running Model", + unit="record", + leave=True, + file=sys.stdout, + delay=0.5, + disable=not verbose + ) + + if verbose and RUN_MODEL_PBAR is not None: + RUN_MODEL_PBAR.set_postfix({"patient": patient_id}) + # Load the patient data. patient_data_file = os.path.join(data_folder, DEMOGRAPHICS_FILE) patient_data = load_demographics(patient_data_file, patient_id, session_id) @@ -195,6 +264,12 @@ def run_model(model, record, data_folder, verbose): binary_output = model.predict(features)[0] probability_output = model.predict_proba(features)[0][1] + if verbose and RUN_MODEL_PBAR is not None: + RUN_MODEL_PBAR.update(1) + if RUN_MODEL_PBAR_TOTAL is not None and RUN_MODEL_PBAR.n >= RUN_MODEL_PBAR_TOTAL: + RUN_MODEL_PBAR.close() + RUN_MODEL_PBAR = None + return binary_output, probability_output ################################################################################ From 5e05482d486213ccc5b55165c46ac30f8ade7ddf Mon Sep 17 00:00:00 2001 From: dcajal Date: Tue, 3 Mar 2026 13:56:15 +0100 Subject: [PATCH 13/38] Update run script and documentation to include new commands for development mode --- docs/04_run_script.md | 32 ++++---- scripts/run.ps1 | 169 +++++++++++++++++++++++++++++------------- 2 files changed, 136 insertions(+), 65 deletions(-) diff --git a/docs/04_run_script.md b/docs/04_run_script.md index 694c4f3..361df34 100644 --- a/docs/04_run_script.md +++ b/docs/04_run_script.md @@ -22,12 +22,18 @@ Para simplificar el trabajo del equipo hemos creado un único script que encapsu # Comandos disponibles +Nota: Si PowerShell bloquea la ejecución, ejecutar primero: + +```powershell +Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass +``` + Desde la raíz del repositorio: ## 
1️⃣ Construir la imagen Docker ```powershell -.\run.ps1 build +.\scripts\run.ps1 build ``` Solo es necesario hacerlo: @@ -38,7 +44,7 @@ Solo es necesario hacerlo: ## 2️⃣ Crear dataset smoke (5 sujetos) ```powershell -.\run.ps1 smoke +.\scripts\run.ps1 smoke ``` Genera: `data/training_smoke/` @@ -50,14 +56,14 @@ Este dataset se usa para desarrollo rápido. ### Smoke (rápido) ```powershell -.\run.ps1 train-smoke +.\scripts\run.ps1 train-smoke ``` ### Completo ```powershell -.\run.ps1 train +.\scripts\run.ps1 train ``` El modelo se guarda en: @@ -70,13 +76,13 @@ El modelo se guarda en: ### Smoke ```powershell -.\run.ps1 run-smoke +.\scripts\run.ps1 run-smoke ``` ### Completo ```powershell -.\run.ps1 run +.\scripts\run.ps1 run ``` Los resultados se generan en: @@ -89,7 +95,7 @@ El archivo clave es: `demographics.csv` que contiene las predicciones añadidas. ## 5️⃣ Limpiar artefactos ```powershell -.\run.ps1 clean +.\scripts\run.ps1 clean ``` Elimina: @@ -104,15 +110,15 @@ No elimina datasets. 
# Flujo recomendado para desarrollo ```powershell -.\run.ps1 build -.\run.ps1 smoke -.\run.ps1 train-smoke -.\run.ps1 run-smoke +.\scripts\run.ps1 build +.\scripts\run.ps1 smoke +.\scripts\run.ps1 train-smoke +.\scripts\run.ps1 run-smoke ``` Solo cuando el modelo esté estable: ```powershell -.\run.ps1 train -.\run.ps1 run +.\scripts\run.ps1 train +.\scripts\run.ps1 run ``` \ No newline at end of file diff --git a/scripts/run.ps1 b/scripts/run.ps1 index de6122d..41a9df7 100644 --- a/scripts/run.ps1 +++ b/scripts/run.ps1 @@ -1,11 +1,21 @@ param( [Parameter(Mandatory=$true)] - [ValidateSet("build","smoke","train","train-smoke","run","run-smoke","clean")] + [ValidateSet( + "build", + "smoke", + "train", + "train-smoke", + "run", + "run-smoke", + "train-dev", + "run-dev", + "clean" + )] [string]$Command ) # ============================================ -# CONFIGURATION +# CONFIGURACIÓN # ============================================ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -13,18 +23,33 @@ param( # Si tu dataset no está en data/training_set, # modifica esta ruta. 
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -$FULL_DATA="data/training_set" +$FULL_DATA_REL = "data/training_set" +$SMOKE_DATA_REL = "data/training_smoke" -$SMOKE_DATA="data/training_smoke" +$IMAGE_NAME = "cinc2026" -$IMAGE_NAME="cinc2026" -$MODEL_FULL="model" -$MODEL_SMOKE="model_smoke" -$OUT_FULL="outputs" -$OUT_SMOKE="outputs_smoke" +$MODEL_FULL_REL = "model" +$MODEL_SMOKE_REL = "model_smoke" + +$OUT_FULL_REL = "outputs" +$OUT_SMOKE_REL = "outputs_smoke" + +# ============================================ +# FUNCIONES AUXILIARES +# ============================================ + +function Get-AbsolutePath($relativePath) { + return (Resolve-Path $relativePath).Path +} + +function Ensure-Directory($path) { + if (!(Test-Path $path)) { + New-Item -ItemType Directory -Force -Path $path | Out-Null + } +} # ============================================ -# FUNCTIONS +# COMANDOS # ============================================ function Build-Image { @@ -32,92 +57,132 @@ function Build-Image { } function Create-Smoke { - Write-Host "Creating smoke dataset..." + Write-Host "Creando dataset smoke..." 
powershell -ExecutionPolicy Bypass -File scripts/create_smoke.ps1 } function Train-Full { - New-Item -ItemType Directory -Force -Path $MODEL_FULL | Out-Null + + $FULL_DATA = Get-AbsolutePath $FULL_DATA_REL + $MODEL_FULL = Join-Path (Get-AbsolutePath ".") $MODEL_FULL_REL + + Ensure-Directory $MODEL_FULL docker run --rm ` -v "${FULL_DATA}:/challenge/training_data:ro" ` - -v "${PWD}/${MODEL_FULL}:/challenge/model" ` + -v "${MODEL_FULL}:/challenge/model" ` $IMAGE_NAME ` python train_model.py -d training_data -m model -v } function Train-Smoke { - New-Item -ItemType Directory -Force -Path $MODEL_SMOKE | Out-Null + + $SMOKE_DATA = Get-AbsolutePath $SMOKE_DATA_REL + $MODEL_SMOKE = Join-Path (Get-AbsolutePath ".") $MODEL_SMOKE_REL + + Ensure-Directory $MODEL_SMOKE docker run --rm ` -v "${SMOKE_DATA}:/challenge/training_data:ro" ` - -v "${PWD}/${MODEL_SMOKE}:/challenge/model" ` + -v "${MODEL_SMOKE}:/challenge/model" ` $IMAGE_NAME ` python train_model.py -d training_data -m model -v } function Run-Full { - New-Item -ItemType Directory -Force -Path $OUT_FULL | Out-Null + + $FULL_DATA = Get-AbsolutePath $FULL_DATA_REL + $MODEL_FULL = Get-AbsolutePath $MODEL_FULL_REL + $OUT_FULL = Join-Path (Get-AbsolutePath ".") $OUT_FULL_REL + + Ensure-Directory $OUT_FULL docker run --rm ` -v "${FULL_DATA}:/challenge/holdout_data:ro" ` - -v "${PWD}/${MODEL_FULL}:/challenge/model:ro" ` - -v "${PWD}/${OUT_FULL}:/challenge/holdout_outputs" ` + -v "${MODEL_FULL}:/challenge/model:ro" ` + -v "${OUT_FULL}:/challenge/holdout_outputs" ` $IMAGE_NAME ` python run_model.py -d holdout_data -m model -o holdout_outputs -v } function Run-Smoke { - New-Item -ItemType Directory -Force -Path $OUT_SMOKE | Out-Null + + $SMOKE_DATA = Get-AbsolutePath $SMOKE_DATA_REL + $MODEL_SMOKE = Get-AbsolutePath $MODEL_SMOKE_REL + $OUT_SMOKE = Join-Path (Get-AbsolutePath ".") $OUT_SMOKE_REL + + Ensure-Directory $OUT_SMOKE docker run --rm ` -v "${SMOKE_DATA}:/challenge/holdout_data:ro" ` - -v 
"${PWD}/${MODEL_SMOKE}:/challenge/model:ro" ` - -v "${PWD}/${OUT_SMOKE}:/challenge/holdout_outputs" ` + -v "${MODEL_SMOKE}:/challenge/model:ro" ` + -v "${OUT_SMOKE}:/challenge/holdout_outputs" ` $IMAGE_NAME ` python run_model.py -d holdout_data -m model -o holdout_outputs -v } -function Clean-All { - Remove-Item -Recurse -Force $MODEL_FULL -ErrorAction SilentlyContinue - Remove-Item -Recurse -Force $MODEL_SMOKE -ErrorAction SilentlyContinue - Remove-Item -Recurse -Force $OUT_FULL -ErrorAction SilentlyContinue - Remove-Item -Recurse -Force $OUT_SMOKE -ErrorAction SilentlyContinue - Write-Host "Cleaned model and output folders." +# ====================== +# MODO DESARROLLO (SIN REBUILD) +# ====================== + +function Train-Dev { + + $CODE_PATH = Get-AbsolutePath "." + $SMOKE_DATA = Get-AbsolutePath $SMOKE_DATA_REL + $MODEL_SMOKE = Join-Path $CODE_PATH $MODEL_SMOKE_REL + + Ensure-Directory $MODEL_SMOKE + + docker run --rm ` + -v "${CODE_PATH}:/challenge" ` + -v "${SMOKE_DATA}:/challenge/training_data:ro" ` + -v "${MODEL_SMOKE}:/challenge/model" ` + $IMAGE_NAME ` + python train_model.py -d training_data -m model -v } -# ============================================ -# COMMAND SWITCH -# ============================================ +function Run-Dev { -switch ($Command) { + $CODE_PATH = Get-AbsolutePath "." 
+ $SMOKE_DATA = Get-AbsolutePath $SMOKE_DATA_REL + $MODEL_SMOKE = Get-AbsolutePath $MODEL_SMOKE_REL + $OUT_SMOKE = Join-Path $CODE_PATH $OUT_SMOKE_REL - "build" { - Build-Image - } + Ensure-Directory $OUT_SMOKE - "smoke" { - Create-Smoke - } + docker run --rm ` + -v "${CODE_PATH}:/challenge" ` + -v "${SMOKE_DATA}:/challenge/holdout_data:ro" ` + -v "${MODEL_SMOKE}:/challenge/model:ro" ` + -v "${OUT_SMOKE}:/challenge/holdout_outputs" ` + $IMAGE_NAME ` + python run_model.py -d holdout_data -m model -o holdout_outputs -v +} - "train" { - Train-Full - } +function Clean-All { - "train-smoke" { - Train-Smoke - } + Remove-Item -Recurse -Force $MODEL_FULL_REL -ErrorAction SilentlyContinue + Remove-Item -Recurse -Force $MODEL_SMOKE_REL -ErrorAction SilentlyContinue + Remove-Item -Recurse -Force $OUT_FULL_REL -ErrorAction SilentlyContinue + Remove-Item -Recurse -Force $OUT_SMOKE_REL -ErrorAction SilentlyContinue - "run" { - Run-Full - } + Write-Host "Modelos y outputs eliminados." +} - "run-smoke" { - Run-Smoke - } +# ============================================ +# SWITCH PRINCIPAL +# ============================================ - "clean" { - Clean-All - } +switch ($Command) { + + "build" { Build-Image } + "smoke" { Create-Smoke } + "train" { Train-Full } + "train-smoke" { Train-Smoke } + "run" { Run-Full } + "run-smoke" { Run-Smoke } + "train-dev" { Train-Dev } + "run-dev" { Run-Dev } + "clean" { Clean-All } } \ No newline at end of file From 4d080a199be4b15bcca201f1d04b23cd8eb040b0 Mon Sep 17 00:00:00 2001 From: dcajal Date: Tue, 3 Mar 2026 14:06:31 +0100 Subject: [PATCH 14/38] Add unified run script with commands for building, training, and running models --- docs/04_run_script.md | 169 ++++++++++++++++++++++++------------- scripts/run.ps1 => run.ps1 | 0 2 files changed, 112 insertions(+), 57 deletions(-) rename scripts/run.ps1 => run.ps1 (100%) diff --git a/docs/04_run_script.md b/docs/04_run_script.md index 361df34..61cccd3 100644 --- a/docs/04_run_script.md +++ 
b/docs/04_run_script.md @@ -1,124 +1,179 @@ # Script unificado de ejecución (`run.ps1`) -Para simplificar el trabajo del equipo hemos creado un único script que encapsula todos los comandos necesarios para: +Este script centraliza todos los comandos necesarios para trabajar en el proyecto: -- Construir la imagen Docker -- Crear el dataset smoke -- Entrenar (completo o smoke) -- Generar predicciones -- Limpiar artefactos +- Construir la imagen Docker +- Crear el dataset smoke +- Entrenar (modo desarrollo o completo) +- Generar predicciones +- Limpiar artefactos --- -## Requisitos +# Requisitos -- Docker Desktop instalado -- Dataset descargado en: `data/training_set/` +- Docker Desktop instalado +- Dataset descargado en: +``` +data/training_set/ +``` -⚠️ Si el dataset está en otra ubicación, modificar la variable `$FULL_DATA` dentro de `run.ps1`. +⚠️ Si el dataset está en otra ubicación, modificar la variable `$FULL_DATA_REL` +dentro de `run.ps1`. --- # Comandos disponibles -Nota: Si PowerShell bloquea la ejecución, ejecutar primero: - -```powershell -Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass -``` - Desde la raíz del repositorio: +--- + ## 1️⃣ Construir la imagen Docker ```powershell -.\scripts\run.ps1 build +.\run.ps1 build ``` Solo es necesario hacerlo: -- La primera vez -- Cuando cambien dependencias o el Dockerfile +- La primera vez +- Cuando cambien `requirements.txt` +- Cuando cambie el `Dockerfile` + +No es necesario al modificar `team_code.py` en modo desarrollo. + +--- ## 2️⃣ Crear dataset smoke (5 sujetos) ```powershell -.\scripts\run.ps1 smoke +.\run.ps1 smoke +``` + +Genera: + +``` +data/training_smoke/ ``` -Genera: `data/training_smoke/` +Este dataset se utiliza exclusivamente para desarrollo rápido. + +--- + +# 🚀 Modo desarrollo (rápido) -Este dataset se usa para desarrollo rápido. 
+Estos comandos: -## 3️⃣ Entrenar modelo +- Usan el dataset smoke +- Montan el código como volumen +- No requieren rebuild al modificar Python -### Smoke (rápido) +--- + +## 3️⃣ Entrenar en modo desarrollo ```powershell -.\scripts\run.ps1 train-smoke +.\run.ps1 train-dev ``` +Utiliza: + +- `data/training_smoke` +- `model_smoke/` -### Completo +--- + +## 4️⃣ Generar predicciones en modo desarrollo ```powershell -.\scripts\run.ps1 train +.\run.ps1 run-dev ``` -El modelo se guarda en: +Genera resultados en: -- model/ (full) -- model_smoke/ (smoke) +``` +outputs_smoke/ +``` -## 4️⃣ Generar predicciones +--- -### Smoke +## 🔁 Flujo recomendado de desarrollo ```powershell -.\scripts\run.ps1 run-smoke +.\run.ps1 build # solo la primera vez +.\run.ps1 smoke # solo si no existe +.\run.ps1 train-dev +.\run.ps1 run-dev ``` -### Completo +Este flujo debe usarse para: + +- Probar nuevas features +- Ajustar el modelo +- Depurar errores +- Iterar rápidamente + +--- + +# 🧪 Entrenamiento completo + +Solo cuando el modelo esté estable. + +--- + +## 5️⃣ Entrenar con dataset completo ```powershell -.\scripts\run.ps1 run +.\run.ps1 train ``` -Los resultados se generan en: +Guarda el modelo en: -- outputs/ -- outputs_smoke/ +``` +model/ +``` -El archivo clave es: `demographics.csv` que contiene las predicciones añadidas. +--- -## 5️⃣ Limpiar artefactos +## 6️⃣ Generar predicciones completas ```powershell -.\scripts\run.ps1 clean +.\run.ps1 run ``` -Elimina: +Genera resultados en: -- model/ -- model_smoke/ -- outputs/ -- outputs_smoke/ +``` +outputs/ +``` -No elimina datasets. 
+--- -# Flujo recomendado para desarrollo +# 🧹 Limpiar artefactos ```powershell -.\scripts\run.ps1 build -.\scripts\run.ps1 smoke -.\scripts\run.ps1 train-smoke -.\scripts\run.ps1 run-smoke +.\run.ps1 clean ``` -Solo cuando el modelo esté estable: +Elimina: -```powershell -.\scripts\run.ps1 train -.\scripts\run.ps1 run -``` \ No newline at end of file +- `model/` +- `model_smoke/` +- `outputs/` +- `outputs_smoke/` + +No elimina datasets. + +# Estrategia recomendada del equipo + +1. Desarrollar siempre en modo `*-dev`. +2. Entrenar en full solo antes de: + - Hacer merge a `main` + - Generar submission +3. Antes de enviar al challenge: + - Ejecutar `build` + - Ejecutar `train` + - Ejecutar `run` + - Verificar que funciona sin modo dev diff --git a/scripts/run.ps1 b/run.ps1 similarity index 100% rename from scripts/run.ps1 rename to run.ps1 From a2a52f33cdc79e849d534d8efab4c195247f193b Mon Sep 17 00:00:00 2001 From: dcajal Date: Tue, 3 Mar 2026 14:10:48 +0100 Subject: [PATCH 15/38] Add note for PowerShell script execution policy in run script documentation --- docs/04_run_script.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/04_run_script.md b/docs/04_run_script.md index 61cccd3..c7aba9e 100644 --- a/docs/04_run_script.md +++ b/docs/04_run_script.md @@ -22,6 +22,11 @@ data/training_set/ ⚠️ Si el dataset está en otra ubicación, modificar la variable `$FULL_DATA_REL` dentro de `run.ps1`. 
+⚠️ Si PowerShell bloquea la ejecución de scripts, ejecutar (aplica solo para la sesión actual): + +```powershell +Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass +``` --- # Comandos disponibles From ec74b589319e6a6ecf3768e8e1239d09774dad02 Mon Sep 17 00:00:00 2001 From: dcajal Date: Tue, 3 Mar 2026 16:46:26 +0100 Subject: [PATCH 16/38] Add bash run script and create smoke dataset script for model training and execution --- run.sh | 173 ++++++++++++++++++++++++++++++++++++++++ scripts/create_smoke.sh | 47 +++++++++++ 2 files changed, 220 insertions(+) create mode 100644 run.sh create mode 100644 scripts/create_smoke.sh diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..eadf715 --- /dev/null +++ b/run.sh @@ -0,0 +1,173 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 " + exit 1 +fi + +COMMAND="$1" + +# ============================================ +# CONFIGURATION +# ============================================ + +FULL_DATA_REL="data/training_set" +SMOKE_DATA_REL="data/training_smoke" + +IMAGE_NAME="cinc2026" + +MODEL_FULL_REL="model" +MODEL_SMOKE_REL="model_smoke" + +OUT_FULL_REL="outputs" +OUT_SMOKE_REL="outputs_smoke" + +# ============================================ +# HELPERS +# ============================================ + +get_absolute_path() { + local rel_path="$1" + (cd "$rel_path" && pwd) +} + +ensure_directory() { + local dir_path="$1" + mkdir -p "$dir_path" +} + +build_image() { + docker build -t "$IMAGE_NAME" . +} + +create_smoke() { + echo "Creating smoke dataset..." 
+ bash scripts/create_smoke.sh +} + +train_full() { + local full_data model_full + + full_data="$(get_absolute_path "$FULL_DATA_REL")" + model_full="$(get_absolute_path ".")/${MODEL_FULL_REL}" + + ensure_directory "$model_full" + + docker run --rm \ + -v "${full_data}:/challenge/training_data:ro" \ + -v "${model_full}:/challenge/model" \ + "$IMAGE_NAME" \ + python train_model.py -d training_data -m model -v +} + +train_smoke() { + local smoke_data model_smoke + + smoke_data="$(get_absolute_path "$SMOKE_DATA_REL")" + model_smoke="$(get_absolute_path ".")/${MODEL_SMOKE_REL}" + + ensure_directory "$model_smoke" + + docker run --rm \ + -v "${smoke_data}:/challenge/training_data:ro" \ + -v "${model_smoke}:/challenge/model" \ + "$IMAGE_NAME" \ + python train_model.py -d training_data -m model -v +} + +run_full() { + local full_data model_full out_full + + full_data="$(get_absolute_path "$FULL_DATA_REL")" + model_full="$(get_absolute_path "$MODEL_FULL_REL")" + out_full="$(get_absolute_path ".")/${OUT_FULL_REL}" + + ensure_directory "$out_full" + + docker run --rm \ + -v "${full_data}:/challenge/holdout_data:ro" \ + -v "${model_full}:/challenge/model:ro" \ + -v "${out_full}:/challenge/holdout_outputs" \ + "$IMAGE_NAME" \ + python run_model.py -d holdout_data -m model -o holdout_outputs -v +} + +run_smoke() { + local smoke_data model_smoke out_smoke + + smoke_data="$(get_absolute_path "$SMOKE_DATA_REL")" + model_smoke="$(get_absolute_path "$MODEL_SMOKE_REL")" + out_smoke="$(get_absolute_path ".")/${OUT_SMOKE_REL}" + + ensure_directory "$out_smoke" + + docker run --rm \ + -v "${smoke_data}:/challenge/holdout_data:ro" \ + -v "${model_smoke}:/challenge/model:ro" \ + -v "${out_smoke}:/challenge/holdout_outputs" \ + "$IMAGE_NAME" \ + python run_model.py -d holdout_data -m model -o holdout_outputs -v +} + +# ===================== +# DEVELOPMENT MODE (NO REBUILD) +# ===================== + +train_dev() { + local code_path smoke_data model_smoke + + code_path="$(get_absolute_path 
".")" + smoke_data="$(get_absolute_path "$SMOKE_DATA_REL")" + model_smoke="${code_path}/${MODEL_SMOKE_REL}" + + ensure_directory "$model_smoke" + + docker run --rm \ + -v "${code_path}:/challenge" \ + -v "${smoke_data}:/challenge/training_data:ro" \ + -v "${model_smoke}:/challenge/model" \ + "$IMAGE_NAME" \ + python train_model.py -d training_data -m model -v +} + +run_dev() { + local code_path smoke_data model_smoke out_smoke + + code_path="$(get_absolute_path ".")" + smoke_data="$(get_absolute_path "$SMOKE_DATA_REL")" + model_smoke="$(get_absolute_path "$MODEL_SMOKE_REL")" + out_smoke="${code_path}/${OUT_SMOKE_REL}" + + ensure_directory "$out_smoke" + + docker run --rm \ + -v "${code_path}:/challenge" \ + -v "${smoke_data}:/challenge/holdout_data:ro" \ + -v "${model_smoke}:/challenge/model:ro" \ + -v "${out_smoke}:/challenge/holdout_outputs" \ + "$IMAGE_NAME" \ + python run_model.py -d holdout_data -m model -o holdout_outputs -v +} + +clean_all() { + rm -rf "$MODEL_FULL_REL" "$MODEL_SMOKE_REL" "$OUT_FULL_REL" "$OUT_SMOKE_REL" + echo "Models and outputs removed." +} + +case "$COMMAND" in + build) build_image ;; + smoke) create_smoke ;; + train) train_full ;; + train-smoke) train_smoke ;; + run) run_full ;; + run-smoke) run_smoke ;; + train-dev) train_dev ;; + run-dev) run_dev ;; + clean) clean_all ;; + *) + echo "Invalid command: $COMMAND" + echo "Valid commands: build, smoke, train, train-smoke, run, run-smoke, train-dev, run-dev, clean" + exit 1 + ;; +esac diff --git a/scripts/create_smoke.sh b/scripts/create_smoke.sh new file mode 100644 index 0000000..34bd020 --- /dev/null +++ b/scripts/create_smoke.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +set -euo pipefail + +# ============================================ +# Create smoke training dataset +# ============================================ + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# IMPORTANT: +# Each team member can modify this path to +# match their local dataset location. 
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +FULL_DATA_PATH="${FULL_DATA_PATH:-data/training_set}" # Override with env var if needed + +SMOKE_PATH="data/training_smoke" +N_RECORDS="${N_RECORDS:-5}" + +echo "Creating smoke dataset..." +echo "Source: ${FULL_DATA_PATH}" +echo "Destination: ${SMOKE_PATH}" + +rm -rf "${SMOKE_PATH}" +mkdir -p "${SMOKE_PATH}" + +# Copy demographics +cp "${FULL_DATA_PATH}/demographics.csv" "${SMOKE_PATH}/demographics.csv" + +# Select first N EDF files +while IFS= read -r file_path; do + rel_path="${file_path#${FULL_DATA_PATH}/}" + target_path="${SMOKE_PATH}/${rel_path}" + mkdir -p "$(dirname "${target_path}")" + cp "${file_path}" "${target_path}" +done < <( + find "${FULL_DATA_PATH}/physiological_data" -type f -name "*.edf" | sort | head -n "${N_RECORDS}" +) + +# Copy full annotation folders (simpler and robust) +if [[ -d "${FULL_DATA_PATH}/algorithmic_annotations" ]]; then + cp -R "${FULL_DATA_PATH}/algorithmic_annotations" "${SMOKE_PATH}/algorithmic_annotations" +fi + +if [[ -d "${FULL_DATA_PATH}/human_annotations" ]]; then + cp -R "${FULL_DATA_PATH}/human_annotations" "${SMOKE_PATH}/human_annotations" +fi + +echo "Smoke dataset created successfully." From 67d9b636afe1f2bfaad8a5019ed7e96ac13e7621 Mon Sep 17 00:00:00 2001 From: dcajal Date: Tue, 3 Mar 2026 16:52:17 +0100 Subject: [PATCH 17/38] Update Docker and smoke dataset documentation for clarity and structure --- docs/02_docker.md | 55 ++++++------ docs/03_smoke_dataset.md | 51 +++++------ docs/04_run_script.md | 177 +++++++++++---------------------------- 3 files changed, 91 insertions(+), 192 deletions(-) diff --git a/docs/02_docker.md b/docs/02_docker.md index eca7735..c24ced2 100644 --- a/docs/02_docker.md +++ b/docs/02_docker.md @@ -1,53 +1,46 @@ # Uso de Docker +Este documento define el contexto de ejecución con Docker. 
+ ## Requisitos - Docker Desktop instalado (modo Linux containers) - Dataset descargado desde Kaggle -- Se asume que el dataset está en: data/training_set/ +- Dataset completo disponible en `data/training_set/` (ruta por defecto del proyecto) -Cada miembro del equipo puede tener el dataset en una ubicación diferente, pero en este repositorio asumimos que está dentro de `data/`. +Si tu dataset está en otra ubicación, actualiza la variable de ruta en el script de ejecución. ---- +## Estructura de trabajo -## Construir la imagen +Entradas: -Desde la raíz del repositorio: +- `data/training_set/` (dataset completo) +- `data/training_smoke/` (dataset reducido para modo desarrollo (smoke)) -```powershell -docker build -t cinc2026 . -``` +Salidas: -## Entenar con el dataset completo +- `model/` y `outputs/` (flujo completo) +- `model_smoke/` y `outputs_smoke/` (flujo smoke/desarrollo) -```powershell -$DATA="data/training_set" -$MODEL="$PWD/model" +## Orden recomendado de ejecución -docker run --rm ` - -v "${DATA}:/challenge/training_data:ro" ` - -v "${MODEL}:/challenge/model" ` - cinc2026 ` - python train_model.py -d training_data -m model -v -``` +1. Construir imagen Docker (`build`) +2. Preparar dataset smoke (`smoke`) +3. Iterar en modo desarrollo (smoke) (`train-dev` / `run-dev`) +4. Ejecutar validación completa (`train` / `run`) +5. Limpiar artefactos cuando corresponda (`clean`) -## Generar predicciones +La guía paso a paso está en `docs/04_run_script.md`. -```powershell -$OUT="$PWD/outputs" +## Compatibilidad de scripts -docker run --rm ` - -v "${DATA}:/challenge/holdout_data:ro" ` - -v "${MODEL}:/challenge/model:ro" ` - -v "${OUT}:/challenge/holdout_outputs" ` - cinc2026 ` - python run_model.py -d holdout_data -m model -o holdout_outputs -v -``` +El flujo principal del equipo está documentado con `run.sh` (Git Bash). +También existen equivalentes en PowerShell: `run.ps1` y `scripts/create_smoke.ps1`. 
## Resultado esperado -En la carpeta `outputs/` se generará un `demographics.csv` con: +Tras ejecutar la generación de predicciones (inferencia) completa, en `outputs/` se genera un `demographics.csv` con: - Columnas originales -- Cognitive_Impairment -- Cognitive_Impairment_Probability \ No newline at end of file +- `Cognitive_Impairment` +- `Cognitive_Impairment_Probability` \ No newline at end of file diff --git a/docs/03_smoke_dataset.md b/docs/03_smoke_dataset.md index f91249d..5ba827b 100644 --- a/docs/03_smoke_dataset.md +++ b/docs/03_smoke_dataset.md @@ -1,51 +1,40 @@ -# Dataset Smoke (Desarrollo Rápido) +# Dataset smoke (Modo desarrollo) Entrenar con el dataset completo tarda aproximadamente 30–40 minutos con el modelo de ejemplo. Para desarrollo utilizamos un dataset reducido (5 sujetos). ---- - -## Crear dataset smoke +Este documento describe cuándo y por qué usar smoke. +Los comandos de ejecución están centralizados en `docs/04_run_script.md`. -Ejecutar: +--- -```powershell -powershell -ExecutionPolicy Bypass -File scripts/create_smoke.ps1 -``` +## Qué incluye -Esto generará: `data/training_smoke/` +- Muestra reducida del dataset (5 sujetos) +- Estructura compatible con el flujo oficial del proyecto +- Directorio de salida en `data/training_smoke/` -## Entrenar con el smoke dataset +## Para qué se usa -```powershell -$DATA="$PWD\data\training_smoke" -$MODEL="$PWD\model_smoke" +- Validar cambios de código rápidamente +- Detectar errores de integración antes del entrenamiento completo +- Iterar en modo desarrollo (smoke) sin esperar ciclos largos -docker run --rm ` - -v "${DATA}:/challenge/training_data:ro" ` - -v "${MODEL}:/challenge/model" ` - cinc2026 ` - python train_model.py -d training_data -m model -v -``` +## Artefactos asociados -## Generar predicciones con smoke dataset +- Entrenamiento smoke: `model_smoke/` +- Predicciones (inferencia) smoke: `outputs_smoke/` -```powershell -$OUT="$PWD/outputs_smoke" +## Relación con el flujo principal -docker 
run --rm ` - -v "${DATA}:/challenge/holdout_data:ro" ` - -v "${MODEL}:/challenge/model:ro" ` - -v "${OUT}:/challenge/holdout_outputs" ` - cinc2026 ` - python run_model.py -d holdout_data -m model -o holdout_outputs -v -``` +El dataset smoke se crea al inicio del ciclo de desarrollo y se usa junto con `train-dev` y `run-dev`. +El orden detallado de ejecución está en `docs/04_run_script.md`. ## ¿Cuándo usar smoke? -- Desarrollo de nuevas features +- Desarrollo de nuevas funcionalidades - Comprobación rápida de que el código no rompe -- Validación de cambios en team_code.py +- Validación de cambios en `team_code.py` Nunca usar smoke para evaluar rendimiento final. \ No newline at end of file diff --git a/docs/04_run_script.md b/docs/04_run_script.md index c7aba9e..2ee0517 100644 --- a/docs/04_run_script.md +++ b/docs/04_run_script.md @@ -1,12 +1,7 @@ -# Script unificado de ejecución (`run.ps1`) +# Script unificado de ejecución (`run.sh`) -Este script centraliza todos los comandos necesarios para trabajar en el proyecto: - -- Construir la imagen Docker -- Crear el dataset smoke -- Entrenar (modo desarrollo o completo) -- Generar predicciones -- Limpiar artefactos +Este documento es la guía operativa única para ejecutar el proyecto. +Aquí se define el orden recomendado y los comandos asociados. --- @@ -20,165 +15,87 @@ data/training_set/ ``` ⚠️ Si el dataset está en otra ubicación, modificar la variable `$FULL_DATA_REL` -dentro de `run.ps1`. - -⚠️ Si PowerShell bloquea la ejecución de scripts, ejecutar (aplica solo para la sesión actual): - -```powershell -Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass -``` ---- +dentro de `run.sh`. -# Comandos disponibles +⚠️ Ejecutar los comandos desde Git Bash. -Desde la raíz del repositorio: +ℹ️ Existen scripts equivalentes en PowerShell (`run.ps1` y `scripts/create_smoke.ps1`) para quienes prefieran ese entorno. +ℹ️ Para contexto general y definición de artefactos, ver `docs/02_docker.md` y `docs/03_smoke_dataset.md`. 
--- -## 1️⃣ Construir la imagen Docker - -```powershell -.\run.ps1 build -``` - -Solo es necesario hacerlo: - -- La primera vez -- Cuando cambien `requirements.txt` -- Cuando cambie el `Dockerfile` +# Orden de ejecución recomendado -No es necesario al modificar `team_code.py` en modo desarrollo. +Desde la raíz del repositorio. ---- +## 1) Preparar entorno -## 2️⃣ Crear dataset smoke (5 sujetos) +### Construir imagen Docker -```powershell -.\run.ps1 smoke +```bash +./run.sh build ``` -Genera: +Ejecutar la primera vez y cada vez que cambien `requirements.txt` o `Dockerfile`. -``` -data/training_smoke/ -``` +### Crear dataset smoke (5 sujetos) -Este dataset se utiliza exclusivamente para desarrollo rápido. - ---- - -# 🚀 Modo desarrollo (rápido) - -Estos comandos: - -- Usan el dataset smoke -- Montan el código como volumen -- No requieren rebuild al modificar Python - ---- - -## 3️⃣ Entrenar en modo desarrollo - -```powershell -.\run.ps1 train-dev +```bash +./run.sh smoke ``` -Utiliza: - -- `data/training_smoke` -- `model_smoke/` - ---- - -## 4️⃣ Generar predicciones en modo desarrollo +Genera `data/training_smoke/`. -```powershell -.\run.ps1 run-dev -``` +## 2) Ciclo en modo desarrollo (smoke) -Genera resultados en: +### Entrenar en modo desarrollo (smoke) -``` -outputs_smoke/ +```bash +./run.sh train-dev ``` ---- +Usa `data/training_smoke/` y guarda modelo en `model_smoke/`. -## 🔁 Flujo recomendado de desarrollo +### Generar predicciones (inferencia) en modo desarrollo (smoke) -```powershell -.\run.ps1 build # solo la primera vez -.\run.ps1 smoke # solo si no existe -.\run.ps1 train-dev -.\run.ps1 run-dev +```bash +./run.sh run-dev ``` -Este flujo debe usarse para: - -- Probar nuevas features -- Ajustar el modelo -- Depurar errores -- Iterar rápidamente - ---- - -# 🧪 Entrenamiento completo - -Solo cuando el modelo esté estable. - ---- +Genera resultados en `outputs_smoke/`. 
-## 5️⃣ Entrenar con dataset completo +### Secuencia típica en modo desarrollo (smoke) -```powershell -.\run.ps1 train +```bash +./run.sh build # solo la primera vez +./run.sh smoke # solo si no existe +./run.sh train-dev +./run.sh run-dev ``` -Guarda el modelo en: +## 3) Validación completa -``` -model/ -``` - ---- - -## 6️⃣ Generar predicciones completas +### Entrenar con dataset completo -```powershell -.\run.ps1 run +```bash +./run.sh train ``` -Genera resultados en: +Guarda el modelo en `model/`. -``` -outputs/ +### Generar predicciones (inferencia) completas + +```bash +./run.sh run ``` ---- +Genera resultados en `outputs/`. -# 🧹 Limpiar artefactos +## 4) Limpieza de artefactos -```powershell -.\run.ps1 clean +```bash +./run.sh clean ``` -Elimina: - -- `model/` -- `model_smoke/` -- `outputs/` -- `outputs_smoke/` - +Elimina `model/`, `model_smoke/`, `outputs/` y `outputs_smoke/`. No elimina datasets. - -# Estrategia recomendada del equipo - -1. Desarrollar siempre en modo `*-dev`. -2. Entrenar en full solo antes de: - - Hacer merge a `main` - - Generar submission -3. 
Antes de enviar al challenge: - - Ejecutar `build` - - Ejecutar `train` - - Ejecutar `run` - - Verificar que funciona sin modo dev From 819ca187fbc500381eff1a511fe3c58fcebc983d Mon Sep 17 00:00:00 2001 From: dcajal Date: Tue, 3 Mar 2026 18:28:37 +0100 Subject: [PATCH 18/38] Refactor Docker commands in run script for improved path handling and consistency --- run.sh | 86 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 28 deletions(-) diff --git a/run.sh b/run.sh index eadf715..fed6e06 100644 --- a/run.sh +++ b/run.sh @@ -37,8 +37,22 @@ ensure_directory() { mkdir -p "$dir_path" } +to_docker_path() { + local host_path="$1" + + if command -v cygpath >/dev/null 2>&1; then + cygpath -m "$host_path" + else + echo "$host_path" + fi +} + +docker_cli() { + MSYS_NO_PATHCONV=1 MSYS2_ARG_CONV_EXCL="*" docker "$@" +} + build_image() { - docker build -t "$IMAGE_NAME" . + docker_cli build -t "$IMAGE_NAME" . } create_smoke() { @@ -48,64 +62,78 @@ create_smoke() { train_full() { local full_data model_full + local full_data_docker model_full_docker full_data="$(get_absolute_path "$FULL_DATA_REL")" model_full="$(get_absolute_path ".")/${MODEL_FULL_REL}" + full_data_docker="$(to_docker_path "$full_data")" + model_full_docker="$(to_docker_path "$model_full")" ensure_directory "$model_full" - docker run --rm \ - -v "${full_data}:/challenge/training_data:ro" \ - -v "${model_full}:/challenge/model" \ + docker_cli run --rm \ + -v "${full_data_docker}:/challenge/training_data:ro" \ + -v "${model_full_docker}:/challenge/model" \ "$IMAGE_NAME" \ python train_model.py -d training_data -m model -v } train_smoke() { local smoke_data model_smoke + local smoke_data_docker model_smoke_docker smoke_data="$(get_absolute_path "$SMOKE_DATA_REL")" model_smoke="$(get_absolute_path ".")/${MODEL_SMOKE_REL}" + smoke_data_docker="$(to_docker_path "$smoke_data")" + model_smoke_docker="$(to_docker_path "$model_smoke")" ensure_directory "$model_smoke" - docker run 
--rm \ - -v "${smoke_data}:/challenge/training_data:ro" \ - -v "${model_smoke}:/challenge/model" \ + docker_cli run --rm \ + -v "${smoke_data_docker}:/challenge/training_data:ro" \ + -v "${model_smoke_docker}:/challenge/model" \ "$IMAGE_NAME" \ python train_model.py -d training_data -m model -v } run_full() { local full_data model_full out_full + local full_data_docker model_full_docker out_full_docker full_data="$(get_absolute_path "$FULL_DATA_REL")" model_full="$(get_absolute_path "$MODEL_FULL_REL")" out_full="$(get_absolute_path ".")/${OUT_FULL_REL}" + full_data_docker="$(to_docker_path "$full_data")" + model_full_docker="$(to_docker_path "$model_full")" + out_full_docker="$(to_docker_path "$out_full")" ensure_directory "$out_full" - docker run --rm \ - -v "${full_data}:/challenge/holdout_data:ro" \ - -v "${model_full}:/challenge/model:ro" \ - -v "${out_full}:/challenge/holdout_outputs" \ + docker_cli run --rm \ + -v "${full_data_docker}:/challenge/holdout_data:ro" \ + -v "${model_full_docker}:/challenge/model:ro" \ + -v "${out_full_docker}:/challenge/holdout_outputs" \ "$IMAGE_NAME" \ python run_model.py -d holdout_data -m model -o holdout_outputs -v } run_smoke() { local smoke_data model_smoke out_smoke + local smoke_data_docker model_smoke_docker out_smoke_docker smoke_data="$(get_absolute_path "$SMOKE_DATA_REL")" model_smoke="$(get_absolute_path "$MODEL_SMOKE_REL")" out_smoke="$(get_absolute_path ".")/${OUT_SMOKE_REL}" + smoke_data_docker="$(to_docker_path "$smoke_data")" + model_smoke_docker="$(to_docker_path "$model_smoke")" + out_smoke_docker="$(to_docker_path "$out_smoke")" ensure_directory "$out_smoke" - docker run --rm \ - -v "${smoke_data}:/challenge/holdout_data:ro" \ - -v "${model_smoke}:/challenge/model:ro" \ - -v "${out_smoke}:/challenge/holdout_outputs" \ + docker_cli run --rm \ + -v "${smoke_data_docker}:/challenge/holdout_data:ro" \ + -v "${model_smoke_docker}:/challenge/model:ro" \ + -v "${out_smoke_docker}:/challenge/holdout_outputs" \ 
"$IMAGE_NAME" \ python run_model.py -d holdout_data -m model -o holdout_outputs -v } @@ -116,38 +144,40 @@ run_smoke() { train_dev() { local code_path smoke_data model_smoke + local code_path_docker smoke_data_docker code_path="$(get_absolute_path ".")" smoke_data="$(get_absolute_path "$SMOKE_DATA_REL")" model_smoke="${code_path}/${MODEL_SMOKE_REL}" + code_path_docker="$(to_docker_path "$code_path")" + smoke_data_docker="$(to_docker_path "$smoke_data")" ensure_directory "$model_smoke" - docker run --rm \ - -v "${code_path}:/challenge" \ - -v "${smoke_data}:/challenge/training_data:ro" \ - -v "${model_smoke}:/challenge/model" \ + docker_cli run --rm \ + -v "${code_path_docker}:/challenge" \ + -v "${smoke_data_docker}:/challenge/data_smoke:ro" \ "$IMAGE_NAME" \ - python train_model.py -d training_data -m model -v + python train_model.py -d /challenge/data_smoke -m /challenge/model_smoke -v } run_dev() { - local code_path smoke_data model_smoke out_smoke + local code_path smoke_data out_smoke + local code_path_docker smoke_data_docker code_path="$(get_absolute_path ".")" smoke_data="$(get_absolute_path "$SMOKE_DATA_REL")" - model_smoke="$(get_absolute_path "$MODEL_SMOKE_REL")" out_smoke="${code_path}/${OUT_SMOKE_REL}" + code_path_docker="$(to_docker_path "$code_path")" + smoke_data_docker="$(to_docker_path "$smoke_data")" ensure_directory "$out_smoke" - docker run --rm \ - -v "${code_path}:/challenge" \ - -v "${smoke_data}:/challenge/holdout_data:ro" \ - -v "${model_smoke}:/challenge/model:ro" \ - -v "${out_smoke}:/challenge/holdout_outputs" \ + docker_cli run --rm \ + -v "${code_path_docker}:/challenge" \ + -v "${smoke_data_docker}:/challenge/data_smoke:ro" \ "$IMAGE_NAME" \ - python run_model.py -d holdout_data -m model -o holdout_outputs -v + python run_model.py -d /challenge/data_smoke -m /challenge/model_smoke -o /challenge/outputs_smoke -v } clean_all() { From 1ebe7a86a58f5c4e891bfaefd1374cd5933cb5cc Mon Sep 17 00:00:00 2001 From: dcajal Date: Tue, 3 Mar 2026 
18:29:24 +0100 Subject: [PATCH 19/38] Move lolai_models to src --- {models => src}/lolai_models.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {models => src}/lolai_models.py (100%) diff --git a/models/lolai_models.py b/src/lolai_models.py similarity index 100% rename from models/lolai_models.py rename to src/lolai_models.py From b7cc2ea87c41fe29de934b0c32dbbc6369122a22 Mon Sep 17 00:00:00 2001 From: dcajal Date: Mon, 9 Mar 2026 13:12:46 +0100 Subject: [PATCH 20/38] Add evaluation commands for existing predictions in run scripts and documentation --- docs/04_run_script.md | 29 ++++++++++++++-- run.ps1 | 55 +++++++++++++++++++++++++++++ run.sh | 80 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 160 insertions(+), 4 deletions(-) diff --git a/docs/04_run_script.md b/docs/04_run_script.md index 2ee0517..e5f5907 100644 --- a/docs/04_run_script.md +++ b/docs/04_run_script.md @@ -62,7 +62,15 @@ Usa `data/training_smoke/` y guarda modelo en `model_smoke/`. ./run.sh run-dev ``` -Genera resultados en `outputs_smoke/`. +Genera resultados en `outputs_smoke/` y luego imprime métricas de evaluación en consola. + +### Evaluar predicciones existentes en modo desarrollo (smoke) + +```bash +./run.sh eval-dev +``` + +Reutiliza `outputs_smoke/demographics.csv` y muestra AUROC, AUPRC, Accuracy y F-measure sin volver a ejecutar inferencia. ### Secuencia típica en modo desarrollo (smoke) @@ -71,6 +79,7 @@ Genera resultados en `outputs_smoke/`. ./run.sh smoke # solo si no existe ./run.sh train-dev ./run.sh run-dev +./run.sh eval-dev # opcional: reevaluar sin correr inferencia ``` ## 3) Validación completa @@ -89,7 +98,23 @@ Guarda el modelo en `model/`. ./run.sh run ``` -Genera resultados en `outputs/`. +Genera resultados en `outputs/` y luego imprime métricas de evaluación en consola. 
+ +### Evaluar predicciones existentes completas + +```bash +./run.sh eval +``` + +Reutiliza `outputs/demographics.csv` y muestra AUROC, AUPRC, Accuracy y F-measure sin volver a ejecutar inferencia. + +### Evaluar predicciones existentes del dataset smoke + +```bash +./run.sh eval-smoke +``` + +Reutiliza `outputs_smoke/demographics.csv` y muestra AUROC, AUPRC, Accuracy y F-measure sin volver a ejecutar inferencia. ## 4) Limpieza de artefactos diff --git a/run.ps1 b/run.ps1 index 41a9df7..439eade 100644 --- a/run.ps1 +++ b/run.ps1 @@ -7,8 +7,11 @@ param( "train-smoke", "run", "run-smoke", + "eval", + "eval-smoke", "train-dev", "run-dev", + "eval-dev", "clean" )] [string]$Command @@ -33,6 +36,7 @@ $MODEL_SMOKE_REL = "model_smoke" $OUT_FULL_REL = "outputs" $OUT_SMOKE_REL = "outputs_smoke" +$DEMOGRAPHICS_FILE = "demographics.csv" # ============================================ # FUNCIONES AUXILIARES @@ -48,6 +52,24 @@ function Ensure-Directory($path) { } } +function Invoke-Evaluation($DataPath, $OutputPath, $Label) { + Write-Host "Evaluating $Label predictions..." + docker run --rm ` + -v "${DataPath}:/challenge/eval_data:ro" ` + -v "${OutputPath}:/challenge/eval_outputs:ro" ` + $IMAGE_NAME ` + python evaluate_model.py -d "/challenge/eval_data/$DEMOGRAPHICS_FILE" -o "/challenge/eval_outputs/$DEMOGRAPHICS_FILE" +} + +function Invoke-EvaluationDev($CodePath, $DataPath, $OutputPath, $Label) { + Write-Host "Evaluating $Label predictions..." 
+ docker run --rm ` + -v "${CodePath}:/challenge" ` + -v "${DataPath}:/challenge/eval_data:ro" ` + $IMAGE_NAME ` + python evaluate_model.py -d "/challenge/eval_data/$DEMOGRAPHICS_FILE" -o "$OutputPath/$DEMOGRAPHICS_FILE" +} + # ============================================ # COMANDOS # ============================================ @@ -103,6 +125,8 @@ function Run-Full { -v "${OUT_FULL}:/challenge/holdout_outputs" ` $IMAGE_NAME ` python run_model.py -d holdout_data -m model -o holdout_outputs -v + + Invoke-Evaluation $FULL_DATA $OUT_FULL "full-dataset" } function Run-Smoke { @@ -119,6 +143,24 @@ function Run-Smoke { -v "${OUT_SMOKE}:/challenge/holdout_outputs" ` $IMAGE_NAME ` python run_model.py -d holdout_data -m model -o holdout_outputs -v + + Invoke-Evaluation $SMOKE_DATA $OUT_SMOKE "smoke" +} + +function Eval-Full { + + $FULL_DATA = Get-AbsolutePath $FULL_DATA_REL + $OUT_FULL = Get-AbsolutePath $OUT_FULL_REL + + Invoke-Evaluation $FULL_DATA $OUT_FULL "full-dataset" +} + +function Eval-Smoke { + + $SMOKE_DATA = Get-AbsolutePath $SMOKE_DATA_REL + $OUT_SMOKE = Get-AbsolutePath $OUT_SMOKE_REL + + Invoke-Evaluation $SMOKE_DATA $OUT_SMOKE "smoke" } # ====================== @@ -157,6 +199,16 @@ function Run-Dev { -v "${OUT_SMOKE}:/challenge/holdout_outputs" ` $IMAGE_NAME ` python run_model.py -d holdout_data -m model -o holdout_outputs -v + + Invoke-EvaluationDev $CODE_PATH $SMOKE_DATA "/challenge/holdout_outputs" "development smoke" +} + +function Eval-Dev { + + $CODE_PATH = Get-AbsolutePath "." 
+ $SMOKE_DATA = Get-AbsolutePath $SMOKE_DATA_REL + + Invoke-EvaluationDev $CODE_PATH $SMOKE_DATA "/challenge/holdout_outputs" "development smoke" } function Clean-All { @@ -181,8 +233,11 @@ switch ($Command) { "train-smoke" { Train-Smoke } "run" { Run-Full } "run-smoke" { Run-Smoke } + "eval" { Eval-Full } + "eval-smoke" { Eval-Smoke } "train-dev" { Train-Dev } "run-dev" { Run-Dev } + "eval-dev" { Eval-Dev } "clean" { Clean-All } } \ No newline at end of file diff --git a/run.sh b/run.sh index fed6e06..df73527 100644 --- a/run.sh +++ b/run.sh @@ -2,7 +2,7 @@ set -euo pipefail if [[ $# -lt 1 ]]; then - echo "Usage: $0 " + echo "Usage: $0 " exit 1 fi @@ -22,6 +22,7 @@ MODEL_SMOKE_REL="model_smoke" OUT_FULL_REL="outputs" OUT_SMOKE_REL="outputs_smoke" +DEMOGRAPHICS_FILE="demographics.csv" # ============================================ # HELPERS @@ -51,6 +52,45 @@ docker_cli() { MSYS_NO_PATHCONV=1 MSYS2_ARG_CONV_EXCL="*" docker "$@" } +evaluate_predictions() { + local data_dir="$1" + local output_dir="$2" + local label="$3" + local data_dir_docker output_dir_docker + + data_dir_docker="$(to_docker_path "$data_dir")" + output_dir_docker="$(to_docker_path "$output_dir")" + + echo "Evaluating ${label} predictions..." + docker_cli run --rm \ + -v "${data_dir_docker}:/challenge/eval_data:ro" \ + -v "${output_dir_docker}:/challenge/eval_outputs:ro" \ + "$IMAGE_NAME" \ + python evaluate_model.py \ + -d "/challenge/eval_data/${DEMOGRAPHICS_FILE}" \ + -o "/challenge/eval_outputs/${DEMOGRAPHICS_FILE}" +} + +evaluate_predictions_dev() { + local code_path="$1" + local data_path="$2" + local output_path="$3" + local label="$4" + local code_path_docker data_path_docker + + code_path_docker="$(to_docker_path "$code_path")" + data_path_docker="$(to_docker_path "$data_path")" + + echo "Evaluating ${label} predictions..." 
+ docker_cli run --rm \ + -v "${code_path_docker}:/challenge" \ + -v "${data_path_docker}:/challenge/eval_data:ro" \ + "$IMAGE_NAME" \ + python evaluate_model.py \ + -d "/challenge/eval_data/${DEMOGRAPHICS_FILE}" \ + -o "$output_path/${DEMOGRAPHICS_FILE}" +} + build_image() { docker_cli build -t "$IMAGE_NAME" . } @@ -115,6 +155,8 @@ run_full() { -v "${out_full_docker}:/challenge/holdout_outputs" \ "$IMAGE_NAME" \ python run_model.py -d holdout_data -m model -o holdout_outputs -v + + evaluate_predictions "$full_data" "$out_full" "full-dataset" } run_smoke() { @@ -136,6 +178,26 @@ run_smoke() { -v "${out_smoke_docker}:/challenge/holdout_outputs" \ "$IMAGE_NAME" \ python run_model.py -d holdout_data -m model -o holdout_outputs -v + + evaluate_predictions "$smoke_data" "$out_smoke" "smoke" +} + +eval_full() { + local full_data out_full + + full_data="$(get_absolute_path "$FULL_DATA_REL")" + out_full="$(get_absolute_path "$OUT_FULL_REL")" + + evaluate_predictions "$full_data" "$out_full" "full-dataset" +} + +eval_smoke() { + local smoke_data out_smoke + + smoke_data="$(get_absolute_path "$SMOKE_DATA_REL")" + out_smoke="$(get_absolute_path "$OUT_SMOKE_REL")" + + evaluate_predictions "$smoke_data" "$out_smoke" "smoke" } # ===================== @@ -178,6 +240,17 @@ run_dev() { -v "${smoke_data_docker}:/challenge/data_smoke:ro" \ "$IMAGE_NAME" \ python run_model.py -d /challenge/data_smoke -m /challenge/model_smoke -o /challenge/outputs_smoke -v + + evaluate_predictions_dev "$code_path" "$smoke_data" "/challenge/outputs_smoke" "development smoke" +} + +eval_dev() { + local code_path smoke_data + + code_path="$(get_absolute_path ".")" + smoke_data="$(get_absolute_path "$SMOKE_DATA_REL")" + + evaluate_predictions_dev "$code_path" "$smoke_data" "/challenge/outputs_smoke" "development smoke" } clean_all() { @@ -192,12 +265,15 @@ case "$COMMAND" in train-smoke) train_smoke ;; run) run_full ;; run-smoke) run_smoke ;; + eval) eval_full ;; + eval-smoke) eval_smoke ;; train-dev) 
train_dev ;; run-dev) run_dev ;; + eval-dev) eval_dev ;; clean) clean_all ;; *) echo "Invalid command: $COMMAND" - echo "Valid commands: build, smoke, train, train-smoke, run, run-smoke, train-dev, run-dev, clean" + echo "Valid commands: build, smoke, train, train-smoke, run, run-smoke, eval, eval-smoke, train-dev, run-dev, eval-dev, clean" exit 1 ;; esac From 03fc272d88c6b4b171894593d353ccf74b1363b0 Mon Sep 17 00:00:00 2001 From: dcajal Date: Fri, 27 Mar 2026 11:00:43 +0100 Subject: [PATCH 21/38] Refactor dataset handling in run scripts to support supplementary data and improve evaluation logic --- docs/04_run_script.md | 8 ++++++-- run.ps1 | 37 ++++++++++++++++++++++++++++--------- run.sh | 38 +++++++++++++++++++++++++++----------- 3 files changed, 61 insertions(+), 22 deletions(-) diff --git a/docs/04_run_script.md b/docs/04_run_script.md index e5f5907..310bb5c 100644 --- a/docs/04_run_script.md +++ b/docs/04_run_script.md @@ -12,9 +12,10 @@ Aquí se define el orden recomendado y los comandos asociados. ``` data/training_set/ +data/supplementary_set/ ``` -⚠️ Si el dataset está en otra ubicación, modificar la variable `$FULL_DATA_REL` +⚠️ Si el dataset está en otra ubicación, modificar las variables `$TRAIN_DATA_REL` y `$RUN_DATA_REL` dentro de `run.sh`. ⚠️ Ejecutar los comandos desde Git Bash. @@ -98,7 +99,8 @@ Guarda el modelo en `model/`. ./run.sh run ``` -Genera resultados en `outputs/` y luego imprime métricas de evaluación en consola. +Genera resultados en `outputs/` usando `data/supplementary_set/`. +Si el dataset no tiene etiquetas (como en `supplementary_set`), el script omite la evaluación automáticamente. ### Evaluar predicciones existentes completas @@ -107,6 +109,8 @@ Genera resultados en `outputs/` y luego imprime métricas de evaluación en cons ``` Reutiliza `outputs/demographics.csv` y muestra AUROC, AUPRC, Accuracy y F-measure sin volver a ejecutar inferencia. +Evalúa contra `data/supplementary_set/`. 
+Si no hay etiquetas en ese set, el script omite la evaluación automáticamente. ### Evaluar predicciones existentes del dataset smoke diff --git a/run.ps1 b/run.ps1 index 439eade..b480a55 100644 --- a/run.ps1 +++ b/run.ps1 @@ -23,10 +23,11 @@ param( # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # IMPORTANTE: -# Si tu dataset no está en data/training_set, -# modifica esta ruta. +# Si tu dataset no está en data/training_set o data/supplementary_set, +# modifica estas rutas. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -$FULL_DATA_REL = "data/training_set" +$TRAIN_DATA_REL = "data/training_set" +$RUN_DATA_REL = "data/supplementary_set" $SMOKE_DATA_REL = "data/training_smoke" $IMAGE_NAME = "cinc2026" @@ -70,6 +71,16 @@ function Invoke-EvaluationDev($CodePath, $DataPath, $OutputPath, $Label) { python evaluate_model.py -d "/challenge/eval_data/$DEMOGRAPHICS_FILE" -o "$OutputPath/$DEMOGRAPHICS_FILE" } +function Test-DatasetHasLabels($DataPath) { + $demographicsPath = Join-Path $DataPath $DEMOGRAPHICS_FILE + if (!(Test-Path $demographicsPath)) { + return $false + } + + $header = Get-Content -Path $demographicsPath -TotalCount 1 + return $header -match "Cognitive_Impairment" +} + # ============================================ # COMANDOS # ============================================ @@ -85,7 +96,7 @@ function Create-Smoke { function Train-Full { - $FULL_DATA = Get-AbsolutePath $FULL_DATA_REL + $FULL_DATA = Get-AbsolutePath $TRAIN_DATA_REL $MODEL_FULL = Join-Path (Get-AbsolutePath ".") $MODEL_FULL_REL Ensure-Directory $MODEL_FULL @@ -113,20 +124,24 @@ function Train-Smoke { function Run-Full { - $FULL_DATA = Get-AbsolutePath $FULL_DATA_REL + $RUN_DATA = Get-AbsolutePath $RUN_DATA_REL $MODEL_FULL = Get-AbsolutePath $MODEL_FULL_REL $OUT_FULL = Join-Path (Get-AbsolutePath ".") $OUT_FULL_REL Ensure-Directory $OUT_FULL docker run --rm ` - -v "${FULL_DATA}:/challenge/holdout_data:ro" ` + -v "${RUN_DATA}:/challenge/holdout_data:ro" ` -v 
"${MODEL_FULL}:/challenge/model:ro" ` -v "${OUT_FULL}:/challenge/holdout_outputs" ` $IMAGE_NAME ` python run_model.py -d holdout_data -m model -o holdout_outputs -v - Invoke-Evaluation $FULL_DATA $OUT_FULL "full-dataset" + if (Test-DatasetHasLabels $RUN_DATA) { + Invoke-Evaluation $RUN_DATA $OUT_FULL "run-dataset" + } else { + Write-Host "Skipping evaluation for run dataset (labels not present in $RUN_DATA_REL/$DEMOGRAPHICS_FILE)." + } } function Run-Smoke { @@ -149,10 +164,14 @@ function Run-Smoke { function Eval-Full { - $FULL_DATA = Get-AbsolutePath $FULL_DATA_REL + $RUN_DATA = Get-AbsolutePath $RUN_DATA_REL $OUT_FULL = Get-AbsolutePath $OUT_FULL_REL - Invoke-Evaluation $FULL_DATA $OUT_FULL "full-dataset" + if (Test-DatasetHasLabels $RUN_DATA) { + Invoke-Evaluation $RUN_DATA $OUT_FULL "run-dataset" + } else { + Write-Host "Skipping evaluation for run dataset (labels not present in $RUN_DATA_REL/$DEMOGRAPHICS_FILE)." + } } function Eval-Smoke { diff --git a/run.sh b/run.sh index df73527..18b7e1e 100644 --- a/run.sh +++ b/run.sh @@ -12,7 +12,8 @@ COMMAND="$1" # CONFIGURATION # ============================================ -FULL_DATA_REL="data/training_set" +TRAIN_DATA_REL="data/training_set" +RUN_DATA_REL="data/supplementary_set" SMOKE_DATA_REL="data/training_smoke" IMAGE_NAME="cinc2026" @@ -91,6 +92,13 @@ evaluate_predictions_dev() { -o "$output_path/${DEMOGRAPHICS_FILE}" } +dataset_has_labels() { + local data_dir="$1" + local demographics_path="$data_dir/$DEMOGRAPHICS_FILE" + + [[ -f "$demographics_path" ]] && head -n 1 "$demographics_path" | grep -q "Cognitive_Impairment" +} + build_image() { docker_cli build -t "$IMAGE_NAME" . 
} @@ -104,7 +112,7 @@ train_full() { local full_data model_full local full_data_docker model_full_docker - full_data="$(get_absolute_path "$FULL_DATA_REL")" + full_data="$(get_absolute_path "$TRAIN_DATA_REL")" model_full="$(get_absolute_path ".")/${MODEL_FULL_REL}" full_data_docker="$(to_docker_path "$full_data")" model_full_docker="$(to_docker_path "$model_full")" @@ -137,26 +145,30 @@ train_smoke() { } run_full() { - local full_data model_full out_full - local full_data_docker model_full_docker out_full_docker + local run_data model_full out_full + local run_data_docker model_full_docker out_full_docker - full_data="$(get_absolute_path "$FULL_DATA_REL")" + run_data="$(get_absolute_path "$RUN_DATA_REL")" model_full="$(get_absolute_path "$MODEL_FULL_REL")" out_full="$(get_absolute_path ".")/${OUT_FULL_REL}" - full_data_docker="$(to_docker_path "$full_data")" + run_data_docker="$(to_docker_path "$run_data")" model_full_docker="$(to_docker_path "$model_full")" out_full_docker="$(to_docker_path "$out_full")" ensure_directory "$out_full" docker_cli run --rm \ - -v "${full_data_docker}:/challenge/holdout_data:ro" \ + -v "${run_data_docker}:/challenge/holdout_data:ro" \ -v "${model_full_docker}:/challenge/model:ro" \ -v "${out_full_docker}:/challenge/holdout_outputs" \ "$IMAGE_NAME" \ python run_model.py -d holdout_data -m model -o holdout_outputs -v - evaluate_predictions "$full_data" "$out_full" "full-dataset" + if dataset_has_labels "$run_data"; then + evaluate_predictions "$run_data" "$out_full" "run-dataset" + else + echo "Skipping evaluation for run dataset (labels not present in ${RUN_DATA_REL}/${DEMOGRAPHICS_FILE})." 
+ fi } run_smoke() { @@ -183,12 +195,16 @@ run_smoke() { } eval_full() { - local full_data out_full + local run_data out_full - full_data="$(get_absolute_path "$FULL_DATA_REL")" + run_data="$(get_absolute_path "$RUN_DATA_REL")" out_full="$(get_absolute_path "$OUT_FULL_REL")" - evaluate_predictions "$full_data" "$out_full" "full-dataset" + if dataset_has_labels "$run_data"; then + evaluate_predictions "$run_data" "$out_full" "run-dataset" + else + echo "Skipping evaluation for run dataset (labels not present in ${RUN_DATA_REL}/${DEMOGRAPHICS_FILE})." + fi } eval_smoke() { From 922c1093ec97aad29dd6afbccad9e808848cd434 Mon Sep 17 00:00:00 2001 From: dcajal Date: Fri, 27 Mar 2026 12:52:56 +0100 Subject: [PATCH 22/38] Update run scripts to replace references from supplementary_set to test_set --- docs/04_run_script.md | 6 +++--- run.ps1 | 4 ++-- run.sh | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/04_run_script.md b/docs/04_run_script.md index 310bb5c..87eef6d 100644 --- a/docs/04_run_script.md +++ b/docs/04_run_script.md @@ -99,8 +99,8 @@ Guarda el modelo en `model/`. ./run.sh run ``` -Genera resultados en `outputs/` usando `data/supplementary_set/`. -Si el dataset no tiene etiquetas (como en `supplementary_set`), el script omite la evaluación automáticamente. +Genera resultados en `outputs/` usando `data/test_set/`. +Si el dataset no tiene etiquetas (como en `test_set`), el script omite la evaluación automáticamente. ### Evaluar predicciones existentes completas @@ -109,7 +109,7 @@ Si el dataset no tiene etiquetas (como en `supplementary_set`), el script omite ``` Reutiliza `outputs/demographics.csv` y muestra AUROC, AUPRC, Accuracy y F-measure sin volver a ejecutar inferencia. -Evalúa contra `data/supplementary_set/`. +Evalúa contra `data/test_set/`. Si no hay etiquetas en ese set, el script omite la evaluación automáticamente. 
### Evaluar predicciones existentes del dataset smoke diff --git a/run.ps1 b/run.ps1 index b480a55..8d2141b 100644 --- a/run.ps1 +++ b/run.ps1 @@ -23,11 +23,11 @@ param( # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # IMPORTANTE: -# Si tu dataset no está en data/training_set o data/supplementary_set, +# Si tu dataset no está en data/training_set o data/test_set, # modifica estas rutas. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> $TRAIN_DATA_REL = "data/training_set" -$RUN_DATA_REL = "data/supplementary_set" +$RUN_DATA_REL = "data/test_set" $SMOKE_DATA_REL = "data/training_smoke" $IMAGE_NAME = "cinc2026" diff --git a/run.sh b/run.sh index 18b7e1e..dcabdbf 100644 --- a/run.sh +++ b/run.sh @@ -13,7 +13,7 @@ COMMAND="$1" # ============================================ TRAIN_DATA_REL="data/training_set" -RUN_DATA_REL="data/supplementary_set" +RUN_DATA_REL="data/test_set" SMOKE_DATA_REL="data/training_smoke" IMAGE_NAME="cinc2026" From ebe970c1d37eb5f530d993fa901cd4f1f17ac919 Mon Sep 17 00:00:00 2001 From: dcajal Date: Fri, 27 Mar 2026 13:08:56 +0100 Subject: [PATCH 23/38] Add optimization tracking documentation and improve training performance with caching and parallel processing --- docs/05_optimization_tracking.md | 128 ++++++++++++++++++++++++++ team_code.py | 153 +++++++++++++++++++------------ 2 files changed, 224 insertions(+), 57 deletions(-) create mode 100644 docs/05_optimization_tracking.md diff --git a/docs/05_optimization_tracking.md b/docs/05_optimization_tracking.md new file mode 100644 index 0000000..db352fa --- /dev/null +++ b/docs/05_optimization_tracking.md @@ -0,0 +1,128 @@ +# Seguimiento De Optimizaciones + +Este documento registra las optimizaciones de tiempo de entrenamiento aplicadas sobre el código de ejemplo original del PhysioNet Challenge en `team_code.py`. 
+ +## Objetivo + +Mantener un registro claro de los cambios respecto a la base proporcionada por la organización para que el equipo pueda: + +- entender qué optimizaciones se probaron; +- medir su efecto sobre el flujo smoke; +- identificar qué cambios merece la pena conservar; +- revertir cambios concretos si la submission se comporta distinto en el entorno del Challenge. + +## Línea Base + +- Fuente de la línea base: implementación de ejemplo proporcionada por la organización en `team_code.py`. +- Tiempo observado de entrenamiento smoke con `./run.sh train-dev`: alrededor de 22 segundos. +- Comportamiento base: extracción secuencial de features, lecturas repetidas de CSV, recarga repetida de reglas de renombrado de canales y carga no utilizada de anotaciones humanas durante entrenamiento. + +## Cambios Aplicados + +### 1. Eliminación de la carga no utilizada de anotaciones humanas en entrenamiento + +Cambio: + +- Se eliminó la carga de `human_annotations` dentro de `train_model`. +- Se dejó intacta la función auxiliar `extract_human_annotations_features`. + +Motivo: + +- El vector final de entrenamiento solo concatenaba features demográficas, fisiológicas y algorítmicas. +- Las features de anotaciones humanas se calculaban, pero nunca se incluían en el `np.hstack(...)` que se pasaba al clasificador. + +Efecto observado: + +- El tiempo de entrenamiento smoke pasó de unos 22.0 s a 21.891 s. +- Conclusión: la limpieza es correcta a nivel lógico, pero su impacto en tiempo es despreciable en el dataset smoke. + +Riesgo: + +- Bajo. Solo elimina trabajo muerto. + +### 2. Caché de reglas de renombrado de canales + +Cambio: + +- Se añadió una caché en proceso para las reglas de renombrado cargadas desde `channel_table.csv`. +- Se sustituyeron las llamadas repetidas a `load_rename_rules(os.path.abspath(csv_path))` por una consulta a la caché. + +Motivo: + +- `extract_physiological_features` estaba cargando y parseando el mismo CSV para cada registro. 
+ +Efecto observado: + +- El tiempo smoke medido en la siguiente ejecución fue 22.040 s. +- Conclusión: la optimización es correcta, pero no ataca un cuello de botella relevante en smoke. + +Riesgo: + +- Bajo. El comportamiento no cambia salvo por reutilizar reglas ya parseadas. + +### 3. Caché de demographics y etiquetas para entrenamiento + +Cambio: + +- Se añadió una lectura única de `demographics.csv` al inicio de `train_model`. +- Se construyeron: + - una caché de demographics indexada por `(patient_id, session_id)`; + - una caché de diagnósticos indexada por `patient_id`. +- Se reemplazaron las llamadas por registro a `load_demographics(...)` y `load_diagnoses(...)` durante entrenamiento. + +Motivo: + +- El bucle original de entrenamiento releía el mismo CSV para cada registro. + +Efecto observado: + +- El tiempo smoke bajó a 20.837 s. +- Conclusión: es una mejora real, aunque moderada. + +Riesgo: + +- Bajo a medio. +- Asume que las etiquetas de entrenamiento son estables a nivel de paciente cuando se cachean por `patient_id`, igual que hacía el comportamiento original de `load_diagnoses(...)`. + +### 4. Paralelización de la extracción de features en entrenamiento + +Cambio: + +- Se añadió procesamiento paralelo por registro con `ThreadPoolExecutor` dentro de `train_model`. +- Se movió la lógica de extracción por registro a `process_training_record(...)`. +- Se limitó el número de workers con: + +```python +MAX_TRAIN_WORKERS = max(1, min(4, os.cpu_count() or 1)) +``` + +Motivo: + +- Cada registro de entrenamiento se procesa de forma independiente. +- El pipeline mezcla lecturas de archivos EDF y trabajo con NumPy, así que un pool pequeño de hilos puede reducir el tiempo total. + +Efecto observado: + +- El tiempo smoke bajó a 9.578 s en la primera ejecución tras paralelizar. +- Las ejecuciones de seguimiento midieron 9.762 s y 9.655 s. +- Conclusión: esta es la optimización dominante. + +Riesgo: + +- Medio. 
+- El acceso paralelo a archivos puede comportarse distinto en discos más lentos o en una infraestructura más limitada del Challenge. + +## Plan De Rollback + +Si la submission se comporta distinto en el entorno del Challenge, revertir en este orden: + +1. Eliminar la extracción con hilos y restaurar el bucle secuencial original en `train_model`. +2. Eliminar las cachés de metadata de entrenamiento y volver a `load_demographics(...)` / `load_diagnoses(...)`. +3. Eliminar la caché de reglas de renombrado y volver a las llamadas directas a `load_rename_rules(...)`. +4. Rehabilitar la carga de anotaciones humanas solo si el vector de entrenamiento se modifica explícitamente para usar esas features. + +Este orden de rollback elimina primero la optimización de mayor riesgo y deja para el final los cambios de comportamiento más pequeños. + +## Archivos Modificados + +- `team_code.py` \ No newline at end of file diff --git a/team_code.py b/team_code.py index 2b18023..2185a52 100644 --- a/team_code.py +++ b/team_code.py @@ -14,7 +14,9 @@ import os import atexit import builtins +import pandas as pd import re +from concurrent.futures import ThreadPoolExecutor from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor import sys from tqdm import tqdm @@ -37,6 +39,77 @@ ORIGINAL_PRINT = builtins.print PRINT_FILTER_ACTIVE = False RUN_PROGRESS_LINE_RE = re.compile(r'^-\s+\d+/\d+:\s') +RENAME_RULES_CACHE = {} +MAX_TRAIN_WORKERS = max(1, min(4, os.cpu_count() or 1)) + + +def build_training_metadata_cache(patient_data_file): + metadata = pd.read_csv(patient_data_file) + demographics_cache = {} + diagnosis_cache = {} + + for row in metadata.to_dict('records'): + patient_id = row[HEADERS['bids_folder']] + session_id = row[HEADERS['session_id']] + demographics_cache[(patient_id, session_id)] = row + diagnosis_cache[patient_id] = load_label(row) + + return demographics_cache, diagnosis_cache + + +def get_rename_rules(csv_path): + normalized_csv_path = 
os.path.abspath(csv_path) + rename_rules = RENAME_RULES_CACHE.get(normalized_csv_path) + if rename_rules is None: + rename_rules = load_rename_rules(normalized_csv_path) + RENAME_RULES_CACHE[normalized_csv_path] = rename_rules + return rename_rules + + +def process_training_record(record, data_folder, demographics_cache, diagnosis_cache, csv_path): + patient_id = record[HEADERS['bids_folder']] + site_id = record[HEADERS['site_id']] + session_id = record[HEADERS['session_id']] + + try: + patient_data = demographics_cache.get((patient_id, session_id), {}) + demographic_features = extract_demographic_features(patient_data) + + physiological_data_file = os.path.join( + data_folder, + PHYSIOLOGICAL_DATA_SUBFOLDER, + site_id, + f"{patient_id}_ses-{session_id}.edf" + ) + if not os.path.exists(physiological_data_file): + return patient_id, None, None, f"Missing physiological data for {patient_id}. Skipping..." + + physiological_data, physiological_fs = load_signal_data(physiological_data_file) + physiological_features = extract_physiological_features( + physiological_data, + physiological_fs, + csv_path=csv_path + ) + + algorithmic_annotations_file = os.path.join( + data_folder, + ALGORITHMIC_ANNOTATIONS_SUBFOLDER, + site_id, + f"{patient_id}_ses-{session_id}_caisr_annotations.edf" + ) + algorithmic_annotations, algorithmic_fs = load_signal_data(algorithmic_annotations_file) + algorithmic_features = extract_algorithmic_annotations_features(algorithmic_annotations) + + label = diagnosis_cache.get(patient_id) + + if label == 0 or label == 1: + feature_vector = np.hstack([demographic_features, physiological_features, algorithmic_features]) + return patient_id, feature_vector, label, None + + return patient_id, None, None, f"Invalid label for {patient_id}. Skipping..." 
+ + except Exception as e: + return patient_id, None, None, f"Error processing {patient_id}: {e}" def _close_run_model_pbar(): @@ -89,6 +162,7 @@ def train_model(data_folder, model_folder, verbose, csv_path=DEFAULT_CSV_PATH): patient_data_file = os.path.join(data_folder, DEMOGRAPHICS_FILE) patient_metadata_list = find_patients(patient_data_file) + demographics_cache, diagnosis_cache = build_training_metadata_cache(patient_data_file) num_records = len(patient_metadata_list) if num_records == 0: @@ -98,69 +172,34 @@ def train_model(data_folder, model_folder, verbose, csv_path=DEFAULT_CSV_PATH): if verbose: print('Extracting features and labels from the data...') - # Iterate over the records to extract the features and labels. features = list() labels = list() - - pbar = tqdm(range(num_records), desc="Extracting Features", unit="record", disable=not verbose) - for i in pbar: - try: - # Extract identifiers for this specific record - record = patient_metadata_list[i] - patient_id = record[HEADERS['bids_folder']] - site_id = record[HEADERS['site_id']] - session_id = record[HEADERS['session_id']] + with ThreadPoolExecutor(max_workers=MAX_TRAIN_WORKERS) as executor: + results = executor.map( + lambda record: process_training_record( + record, + data_folder, + demographics_cache, + diagnosis_cache, + csv_path + ), + patient_metadata_list + ) + + pbar = tqdm(results, total=num_records, desc="Extracting Features", unit="record", disable=not verbose) + for patient_id, feature_vector, label, message in pbar: if verbose: pbar.set_postfix({"patient": patient_id}) - # Load the patient data. - patient_data_file = os.path.join(data_folder, DEMOGRAPHICS_FILE) - patient_data = load_demographics(patient_data_file, patient_id, session_id) - demographic_features = extract_demographic_features(patient_data) - - # Load signal data. - - # Load the physiological signal. 
- physiological_data_file = os.path.join(data_folder, PHYSIOLOGICAL_DATA_SUBFOLDER, site_id, f"{patient_id}_ses-{session_id}.edf") - # --- Check if the file actually exists before proceeding --- - if not os.path.exists(physiological_data_file): - if verbose: - print(f" ! Missing physiological data for {patient_id}. Skipping...") - continue # skip record - physiological_data, physiological_fs = load_signal_data(physiological_data_file) - physiological_features = extract_physiological_features(physiological_data, physiological_fs, csv_path=csv_path) # This function can rename, re-reference, resample, etc. the signal data. - - # Load the algorithmic annotations. - algorithmic_annotations_file = os.path.join(data_folder, ALGORITHMIC_ANNOTATIONS_SUBFOLDER, site_id, f"{patient_id}_ses-{session_id}_caisr_annotations.edf") - algorithmic_annotations, algorithmic_fs = load_signal_data(algorithmic_annotations_file) - algorithmic_features = extract_algorithmic_annotations_features(algorithmic_annotations) - - # Load the human annotations; these data will not be available in the hidden validation and test sets. - human_annotations_file = os.path.join(data_folder, HUMAN_ANNOTATIONS_SUBFOLDER, site_id, f"{patient_id}_ses-{session_id}_expert_annotations.edf") - human_annotations, human_fs = load_signal_data(human_annotations_file) - human_features = extract_human_annotations_features(human_annotations) - - # Load the diagnoses; these data will not be available in the hidden validation and test sets. - diagnosis_file = os.path.join(data_folder, DEMOGRAPHICS_FILE) - label = load_diagnoses(diagnosis_file, patient_id) - - # Store the features and labels, but - # the human annotations are not available on the hidden validation and test sets, but you - # may want to consider how to use them for training. 
- if label == 0 or label == 1: - features.append(np.hstack([demographic_features, physiological_features, algorithmic_features])) - labels.append(label) - - if 'physiological_data' in locals(): del physiological_data - if 'algorithmic_annotations' in locals(): del algorithmic_annotations - - except Exception as e: - # If an error occurs (e.g., a record is corrupted), log it and move to the next - tqdm.write(f" !!! Error processing record {i+1} ({patient_id}): {e}") - continue + if message is not None: + tqdm.write(f" ! {message}") + continue + + features.append(feature_vector) + labels.append(label) - pbar.close() + pbar.close() features = np.asarray(features, dtype=np.float32) labels = np.asarray(labels, dtype=bool) @@ -332,7 +371,7 @@ def extract_physiological_features(physiological_data, physiological_fs, csv_pat # Step 1: Load rules and standardize names # Note: Use script-relative path or absolute path for robustness - rename_rules = load_rename_rules(os.path.abspath(csv_path)) + rename_rules = get_rename_rules(csv_path) rename_map, cols_to_drop = standardize_channel_names_rename_only(original_labels, rename_rules) # Step 2: Apply renaming to BOTH signals and their corresponding FS From 8f121b784bca971fa6b8d105d7c9aa3cd526e817 Mon Sep 17 00:00:00 2001 From: rolopu1 <718694@unizar.es> Date: Fri, 27 Mar 2026 13:52:07 +0100 Subject: [PATCH 24/38] Refactor respiratory processing to improve signal handling and feature extraction, including updates to sampling frequency and NaN handling. 
--- src/eeg_processing.py | 226 +++++++++++++++---------------------- src/lib/Resp_features.py | 92 ++++++++++++++- src/resp_processing.py | 236 +++++++++++++++++++++------------------ 3 files changed, 306 insertions(+), 248 deletions(-) diff --git a/src/eeg_processing.py b/src/eeg_processing.py index 127a32e..760b474 100644 --- a/src/eeg_processing.py +++ b/src/eeg_processing.py @@ -29,161 +29,113 @@ El módulo depende de `numpy`, `pandas`, `matplotlib`, `plotly` y de las utilidades definidas en `lib/helper_code` y `lib/EEG_functions`. """ - -import numpy as np -import pandas as pd -import sys +import sys import os -import matplotlib.pyplot as plt -import plotly.express as px -import plotly.graph_objects as go -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -import lib.helper_code as helper_code +import pandas as pd +import numpy as np +import helper_code as helper_code import lib.EEG_functions as EEG_functions -def MetricasHospitlal(hospital): - - print(f"Procesando hospital: {hospital}") - if hospital == 'I0002' or hospital == 'I0006' or hospital == "S0001": - datapath = 'data/training_set/Physiological_data/'+hospital - else: - datapath = 'data/supplementary_set/Physiological_data/'+hospital - - channels = pd.read_csv("notebooks/channel_table.csv") - selectEEG = channels[channels['Category'].isin(['eeg'])] +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) - demographics = pd.read_csv(os.path.join('C:/BSICoS/CincChallenge2026/CincChallenge_2026/data/training_set', "demographics.csv")) +def processEEG(physiological_data, physiological_fs, csv_path): - # Datos = pd.DataFrame(columns=['File', 'Channel', 'Sampling_Frequency', 'Duration_sec']) - lista_dir = os.listdir(datapath) - results = [] + channels = pd.read_csv(csv_path) + selectEEG = channels[channels['Category'].isin(['eeg'])] - for file in lista_dir: - # Cargar el archivo (sustituye por tu ruta real) - edf = 
helper_code.edfio.read_edf(os.path.join(datapath, file)) + for label in original_labels: + fs = physiological_fs[label] - id = file[9:-10] # Asumiendo que el ID es el nombre del archivo sin la extensión - - selEEG = [] - labels = [] data = [] + original_labels = list(physiological_data.keys()) # Listar canales para identificar los de interés (ej: C3-M2, O1-M2) HayEEG = False - for i, sig in enumerate(edf.signals): - # print(f"[{i}] {sig.label}") - # print length fs and duration - # print(f"Length: {len(sig.data)}, Sampling Frequency: {sig.sampling_frequency} Hz, Duration: {len(sig.data)/sig.sampling_frequency:.2f} seconds") + for i, label in enumerate(original_labels): for index in selectEEG.index: - if sig.label.lower() in selectEEG['Channel_Names'][index].lower(): - print(f"Canal seleccionado: {sig.label}") - selEEG.append([i,sig]) - labels.append(sig.label) + if label.lower() in selectEEG['Channel_Names'][index].lower(): + print(f"Canal seleccionado: {label}") + labels.append(label) HayEEG = True break - # for i in range(len(edf.signals)): - # print(f"Longitud: {edf.signals[i].data.shape}, Canal: {edf.signals[i].label}, Frecuencia de muestreo: {edf.signals[i].sampling_frequency} Hz, Duración: {len(edf.signals[i].data)/edf.signals[i].sampling_frequency:.2f} segundos") + + results = [] + labels2 = [] + if HayEEG: + Bipolar = pd.DataFrame() + if all(label in labels for label in ["F3", "F4", "M1", "M2"]): + Bipolar['F3-M2'] = physiological_data["F3"] - physiological_data["M2"] + Bipolar['F4-M1'] = physiological_data["F4"] - physiological_data["M1"] + labels2.append('F3-M2') + labels2.append('F4-M1') + if all(label in labels for label in ["C3", "C4", "M1", "M2"]): + Bipolar['C3-M2'] = physiological_data["C3"] - physiological_data["M2"] + Bipolar['C4-M1'] = physiological_data["C4"] - physiological_data["M1"] + labels2.append('C3-M2') + labels2.append('C4-M1') + if all(label in labels for label in ["O2", "O1", "M1", "M2"]): + Bipolar['O2-M2'] = physiological_data["O1"] 
- physiological_data["M2"] + Bipolar['O1-M1'] = physiological_data["O2"] - physiological_data["M1"] + labels2.append('O1-M1') + labels2.append('O2-M2') + # print(f"Archivo {file} tiene ECG, RESP y EEG. Se procesará con canales bipolares.") - if HayEEG: - - Bipolar = pd.DataFrame() - if all(label in labels for label in ["F3", "F4", "M1", "M2"]): - Bipolar['F3-M2'] = edf.signals[edf.labels.index("F3")].data - edf.signals[edf.labels.index("M2")].data - Bipolar['F4-M1'] = edf.signals[edf.labels.index("F4")].data - edf.signals[edf.labels.index("M1")].data - labels2 = ['F3-M2', 'F4-M1'] - if all(label in labels for label in ["C3", "C4", "M1", "M2"]): - Bipolar['C3-M2'] = edf.signals[edf.labels.index("C3")].data - edf.signals[edf.labels.index("M2")].data - Bipolar['C4-M1'] = edf.signals[edf.labels.index("C4")].data - edf.signals[edf.labels.index("M1")].data - labels2.append('C3-M2') - labels2.append('C4-M1') - if all(label in labels for label in ["O2", "O1", "M1", "M2"]): - Bipolar['O2-M2'] = edf.signals[edf.labels.index("O1")].data - edf.signals[edf.labels.index("M2")].data - Bipolar['O1-M1'] = edf.signals[edf.labels.index("O2")].data - edf.signals[edf.labels.index("M1")].data - labels2.append('O1-M1') - labels2.append('O2-M2') - # print(f"Archivo {file} tiene ECG, RESP y EEG. 
Se procesará con canales bipolares.") - - if not Bipolar.empty: - labels = [] - for col in Bipolar.columns: - # print(f"Archivo: {file}, Canal: {col}, Frecuencia de muestreo: {sig.sampling_frequency} Hz, Duración: {len(Bipolar[col])/sig.sampling_frequency:.2f} segundos") - fs = edf.signals[edf.labels.index("M2")].sampling_frequency # Asumimos que todos los canales tienen la misma frecuencia de muestreo - time = np.linspace(0, len(Bipolar[col]) / fs, len(Bipolar[col])) - fil = EEG_functions.butter_bandpass_filter(Bipolar[col], lowcut=0.3, highcut=35, fs=fs, order=4) - norm = (fil-np.mean(fil))/np.std(fil) - - data.append(norm) # Restar la media para centrar la señal - labels.append(col) - # columns = Bipolar.columns.tolist() + if not Bipolar.empty: + labels = [] + for col in Bipolar.columns: + # print(f"Archivo: {file}, Canal: {col}, Frecuencia de muestreo: {sig.sampling_frequency} Hz, Duración: {len(Bipolar[col])/sig.sampling_frequency:.2f} segundos") + fs = physiological_data["M2"].sampling_frequency # Asumimos que todos los canales tienen la misma frecuencia de muestreo + fil = EEG_functions.butter_bandpass_filter(Bipolar[col], lowcut=0.3, highcut=35, fs=fs, order=4) + norm = (fil-np.mean(fil))/np.std(fil) + + data.append(norm) # Restar la media para centrar la señal + labels.append(col) + # columns = Bipolar.columns.tolist() + else: + labels = [] + for l in labels: + # print(f"Archivo: {file}, Canal: {sig.label}, Frecuencia de muestreo: {sig.sampling_frequency} Hz, Duración: {len(sig.data)/sig.sampling_frequency:.2f} segundos") + fs = physiological_fs[l] + fil = EEG_functions.butter_bandpass_filter(physiological_data[l], lowcut=0.3, highcut=35, fs=fs, order=4) + norm = (fil-np.mean(fil))/np.std(fil) + labels.append(l) + data.append(norm) # Restar la media para centrar la señal + + # columns = [selEEG[i][1].label for i in range(len(selEEG))] + + + for i, elec in enumerate(labels): + epoch_length = 30 # Duración de cada época en segundos + if Bipolar.empty: + fs = 
physiological_fs[l] else: - for i, (idx, sig) in enumerate(selEEG): - # print(f"Archivo: {file}, Canal: {sig.label}, Frecuencia de muestreo: {sig.sampling_frequency} Hz, Duración: {len(sig.data)/sig.sampling_frequency:.2f} segundos") - fs = sig.sampling_frequency - time = np.linspace(0, len(sig.data) / fs, len(sig.data)) - fil = EEG_functions.butter_bandpass_filter(sig.data, lowcut=0.3, highcut=35, fs=fs, order=4) - norm = (fil-np.mean(fil))/np.std(fil) - labels.append(sig.label) - data.append(norm) # Restar la media para centrar la señal - - # columns = [selEEG[i][1].label for i in range(len(selEEG))] - + fs = physiological_fs['M1'] + + if fs != 200: + # print(f"Warning: Sampling frequency for channel {elec} in file {file} is {fs} Hz, expected 200 Hz. Check the data.") + duration = len(data[i]) / fs + time_original = np.linspace(0, duration, len(data[i])) - demographics = demographics[demographics['BDSPPatientID'] == int(id)] - print(demographics) + num_samples_target = int(duration * 200 ) + time_target = np.linspace(0, duration, num_samples_target) + data[i] = np.interp(time_target, time_original, data[i]) + fs = 200 # Update fs to the target sampling frequency after resampling - for i, elec in enumerate(labels): - epoch_length = 30 # Duración de cada época en segundos - if Bipolar.empty: - fs = edf.signals[edf.labels.index(labels[i])].sampling_frequency - else: - fs = edf.signals[edf.labels.index('M1')].sampling_frequency - - if fs != 200: - # print(f"Warning: Sampling frequency for channel {elec} in file {file} is {fs} Hz, expected 200 Hz. 
Check the data.") - duration = len(data[i]) / fs - time_original = np.linspace(0, duration, len(data[i])) - - num_samples_target = int(duration * 200 ) - time_target = np.linspace(0, duration, num_samples_target) - data[i] = np.interp(time_target, time_original, data[i]) - fs = 200 # Update fs to the target sampling frequency after resampling - - epochs = EEG_functions.create_epochs(data[i], fs, epoch_duration=epoch_length) + epochs = EEG_functions.create_epochs(data[i], fs, epoch_duration=epoch_length) - band_powers, complexities = EEG_functions.extract_band_powers(epochs, fs, win_len=15) - print(f"Band powers for file {file}:") - - band_powers = band_powers.iloc[60:] # Eliminar las primeras 60 épocas (30 min) para evitar el tiempo despierto al inicio de la grabación - print(band_powers.head()) - - - # # Convertir de formato "ancho" a "largo" para Plotly - # df_melted = band_powers.melt(var_name='Banda', value_name='Potencia') - # # Creamos el boxplot - # fig = px.box(df_melted, x='Banda', y='Potencia', - # color='Banda', - # points="outliers", # Para ver si hay épocas muy extrañas - # title=f"{id} - {elec} - {demographics.Cognitive_Impairment.values[0]}", - # log_y=True) # Usamos escala logarítmica porque Delta suele ser mucho más potente que Beta - - # fig.update_layout(template="plotly_white", showlegend=False) - # # fig.write_html(f"graphs/BandasPersona/{id}_{elec}_{demographics.Cognitive_Impairment.values[0]}.html") # Guardar como HTML para visualización interactiva - # fig.show() - - # Ejecución - patient_summar = EEG_functions.get_patient_profile(band_powers) - # print(f"Resumen del perfil del paciente {id} - {elec}:") - # print(patient_summar) - d = complexities.iloc[:].std().to_dict() - results.append({ - 'File': file, - 'Channel': elec, - 'Patient_ID': id, - **d, - **patient_summar - }) + band_powers, complexities = EEG_functions.extract_band_powers(epochs, fs, win_len=15) + band_powers = band_powers.iloc[60:] # Eliminar las primeras 60 épocas (30 min) 
para evitar el tiempo despierto al inicio de la grabación + + + # Ejecución + patient_summar = EEG_functions.get_patient_profile(band_powers) + + d = complexities.iloc[:].std().to_dict() + results.append({ + 'Channel': elec, + **d, + **patient_summar + }) df_results = pd.DataFrame(results) - print(df_results.head()) - return df_results - # df_results.to_csv(f"results_summaryEEG_{hospital}.csv", index=False) \ No newline at end of file + return df_results \ No newline at end of file diff --git a/src/lib/Resp_features.py b/src/lib/Resp_features.py index 25c4706..553c9fa 100644 --- a/src/lib/Resp_features.py +++ b/src/lib/Resp_features.py @@ -33,27 +33,109 @@ def plot_resp(Data, subjet = 1, DownPrinting = 2): def peakedness_application(Data, stage, plotflag = False, subjet = 1): # print("Compute BR") - fs = 100 + fs = 25 Setup = {} Setup["K"] = 5 Setup["DT"] = 5 Setup["Ts"] = 60 #interval length of Welch periodograms (s) Setup["Tm"] = 20 #interval length of subintervals for Welch periodograms (s) # Setup["d"] = 0.1 #interval length of subintervals for Welch periodograms (s) - Setup["Omega_r"] = np.array([5, 20])/60 #respiratory rate range in Hz + Setup["Omega_r"] = np.array([5, 25])/60 #respiratory rate range in Hz Setup["plotflag"] = plotflag - Setup["Nfft"] = np.power(2,14) + Setup["Nfft"] = np.power(2,13) tsBR = np.arange(0,Data.shape[0]/fs,1/fs) if tsBR.shape[0] != Data.shape[0]: # print(f"tsBR.shape[0]: {tsBR.shape[0]}, Data.shape[0]: {Data.shape[0]}") tsBR = np.arange(0,Data.shape[0]/fs,1/fs)[:Data.shape[0]] - hat_Br, Sk_Br, t_aver = peakednessCost(Data, tsBR, fs, Setup, title = stage, storeGraph = False, subjet = subjet) + hat_Br, Sk_Br, t_aver, used = peakednessCost(Data, tsBR, fs, Setup, title = stage, storeGraph = False, subjet = subjet) # print(f"hat_Br: {hat_Br}, Sk_Br: {Sk_Br}, bar_Br: {bar_Br}, t_aver_Br: {t_aver_Br}, f_Br: {f_Br}, used_Br: {used_Br}") # print(hat_Br) - return hat_Br, Sk_Br, t_aver + return hat_Br, Sk_Br, t_aver, used + +def 
ODI_application(data, fs, plotflag=True, subjet=1): + """Detecta desaturaciones de más del 3 % en la señal de saturación de + oxígeno (SpO2) y devuelve estadísticas básicas de los eventos. + + El índice de desaturación de oxígeno (ODI) se define como el número de + episodios en los que la saturación cae al menos un 3 % respecto a una + línea de base móvil, normalizado por hora de grabación. Aquí se calcula + una línea base mediante la mediana móvil de 60 segundos y se agrupan + los índices consecutivos que cumplen el criterio en eventos únicos. + + Args: + data (array-like): valores de SpO2 (0‑100). + fs (float): frecuencia de muestreo en Hz. + plotflag (bool): si True, dibuja la señal y marca los eventos. + subjet (int): identificador de sujeto (utilizado en títulos de gráficas). + + Returns: + tuple: + * odi_mean (float): número de desaturaciones normalizado por hora. + * odi_std (float): desviación estándar de las magnitudes de caída + entre eventos (en porcentaje). + """ + # convertir a serie para comodidad + sp = pd.Series(data) + if len(sp) == 0 or fs <= 0: + return 0.0, 0.0 + + # base móvil de 60 segundos (median para ser robusto). 
ventana en muestras + window = int(fs * 60) + if window < 1: + window = 1 + baseline = sp.rolling(window, min_periods=1, center=True).median() + + # diferencia de base menos señal; buscamos caídas >=3 + diff = baseline - sp + mask = diff >= 3 + + # juntar índices contiguos en eventos + events = [] # lista de (start_idx, end_idx) + in_event = False + for idx, flag in mask.items(): + if flag and not in_event: + start = idx + in_event = True + elif not flag and in_event: + end = prev_idx + events.append((start, end)) + in_event = False + prev_idx = idx + if in_event: + events.append((start, prev_idx)) + + num_events = len(events) + duration_hours = len(sp) / fs / 3600.0 + odi_mean = num_events / duration_hours if duration_hours > 0 else 0.0 + + # calcular magnitudes de caída en cada evento (tomando el valor más bajo) + magnitudes = [] + for start, end in events: + mag = diff.loc[start:end].max() + magnitudes.append(mag) + odi_deepness = np.mean(magnitudes) if magnitudes else 0.0 + + if plotflag: + import matplotlib.pyplot as plt + times = np.arange(len(sp)) / fs / 60.0 # minutos + plt.figure(figsize=(10, 4)) + plt.plot(times, sp.values, label='SpO2') + plt.plot(times, baseline.values, label='Baseline (60s med)') + for (start, end) in events: + t0 = start / fs / 60.0 + t1 = end / fs / 60.0 + plt.axvspan(t0, t1, color='red', alpha=0.3) + plt.xlabel('Tiempo (min)') + plt.ylabel('SpO2 (%)') + plt.title(f'Sujeto {subjet} - ODI detectado: {odi_mean:.2f} eventos/h') + plt.legend() + plt.tight_layout() + plt.show() + + return odi_mean, odi_deepness # Butterworth low-pass filter def lowpass_filter(signal, fs, cutoff=2.0, order=4): diff --git a/src/resp_processing.py b/src/resp_processing.py index dfe6ec2..d8fa8bf 100644 --- a/src/resp_processing.py +++ b/src/resp_processing.py @@ -1,114 +1,138 @@ -import numpy as np -import pandas as pd -import os -import matplotlib.pyplot as plt -import plotly.express as px +import lib.Resp_features as Resp_features import sys - -import 
plotly.graph_objects as go -from plotly.subplots import make_subplots +import os +import pandas as pd +import numpy as np sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -import lib.helper_code as helper_code -import lib.EEG_functions as EEG_functions -import lib.Resp_features as Resp_features -for hospital in ['I0006',"S0001",'I0004','I0007']:#'I0002', - print(f"Procesando hospital: {hospital}") +def processResp(physiological_data, physiological_fs, csv_path): - if hospital == 'I0002' or hospital == 'I0006' or hospital == "S0001": - datapath = 'data/training_set/Physiological_data/'+hospital - else: - datapath = 'data/supplementary_set/Physiological_data/'+hospital - - channels = pd.read_csv("notebooks/channel_table.csv") + channels = pd.read_csv(csv_path) selectResp = channels[channels['Category'].isin(['resp'])] - demographics = pd.read_csv(os.path.join('C:/BSICoS/CincChallenge2026/CincChallenge_2026/data/training_set', "demographics.csv")) - - # Datos = pd.DataFrame(columns=['File', 'Channel', 'Sampling_Frequency', 'Duration_sec']) - lista_dir = os.listdir(datapath) - results = [] - - for file in lista_dir: - # Cargar el archivo (sustituye por tu ruta real) - edf = helper_code.edfio.read_edf(os.path.join(datapath, file)) - - id = file[9:-10] # Asumiendo que el ID es el nombre del archivo sin la extensión - - selResp = [] - labels = [] - data = [] - - HayResp = False - for i, sig in enumerate(edf.signals): - for index in selectResp.index: - if sig.label.lower() in selectResp['Channel_Names'][index].lower(): - print(f"Canal seleccionado: {sig.label}") - selResp.append([i,sig]) - labels.append(sig.label) - HayResp = True - # plot en plotly la señal - go.Figure(data=go.Scattergl(x=np.arange(len(sig.data))/sig.sampling_frequency, y=sig.data, mode='lines', name=sig.label)).update_layout(title=f"Señal de {sig.label} - Archivo: {file}", xaxis_title="Tiempo (s)", yaxis_title="Amplitud").show() - # 
px.line(x=np.arange(len(sig.data))/sig.sampling_frequency, y=sig.data, title=f"Señal de {sig.label} - Archivo: {file}").show() - break - - if HayResp: - for i, (idx, sig) in enumerate(selResp): - print(f"Archivo: {file}, Canal: {sig.label}, Frecuencia de muestreo: {sig.sampling_frequency} Hz, Duración: {len(sig.data)/sig.sampling_frequency:.2f} segundos") - fs = sig.sampling_frequency - - if fs != 25: - duration = len(sig.data) / fs - time_original = np.linspace(0, duration, len(sig.data)) - num_samples_target = int(duration * 25 ) - time_target = np.linspace(0, duration, num_samples_target) - data = np.interp(time_target, time_original, sig.data) - fs = 25 # Update fs to the target sampling frequency after resampling + resultados = {} + UsedFlow = 0 + UsedChest = 0 + UsedAbdomen = 0 + UsedSpO2 = 0 + UsedNasal = 0 + UsedCepap = 0 + + data = [] + original_labels = list(physiological_data.keys()) + + for label in original_labels: + fs = physiological_fs[label] + sig = physiological_data[label] + if fs != 25: + duration = len(sig) / fs + time_original = np.linspace(0, duration, len(sig)) + num_samples_target = int(duration * 25 ) + time_target = np.linspace(0, duration, num_samples_target) + data = np.interp(time_target, time_original, sig) + fs = 25 # Update fs to the target sampling frequency after resampling + else: + data = sig + + # Check nan in sig.data + if np.isnan(sig).any(): + print(f"Warning: NaN values found in signal data for {label}. 
Filling NaNs with zeros.") + data = np.nan_to_num(data) + + name = "" + if label.lower() not in selectResp['Channel_Names'][34].lower(): + d = Resp_features.peakedness_application(data, stage=label, plotflag = False, subjet =label) + if label.lower() in selectResp['Channel_Names'][28].lower(): + name = "Chest" + # EFFORT RESPIRATORY Chest + elif label.lower() in selectResp['Channel_Names'][29].lower(): + # EFFORT RESPIRATORY Abdomen + name = "Abdomen" + elif label.lower() in selectResp['Channel_Names'][30].lower(): + # RESPIRATORY NASAL + name = "Nasal" + elif label.lower() in selectResp['Channel_Names'][31].lower(): + # RESPIRATORY FLOW + name = "Flow" + elif label.lower() in selectResp['Channel_Names'][32].lower(): + # CEPAP + if np.all(data == 0) or np.std(data) < 5: + print(f"Warning: All values in the signal data for {label} are zero. Skipping feature extraction for this channel.") else: - data = sig.data - time_new = np.linspace(0, len(sig.data) / fs, len(sig.data)) - - # Check nan in sig.data - if np.isnan(sig.data).any(): - print(f"Warning: NaN values found in signal data for {sig.label}. 
Filling NaNs with zeros.") - data = np.nan_to_num(data) - - # if sig.label not in ["SpO2", "SaO2", "OSAT", "O2SAT", "O2 SAT", "O2-SAT", "O2-SATURATION"]: - # fil = EEG_functions.butter_bandpass_filter(data, lowcut=0.01, highcut=4, fs=fs, order=4) - # # norm = (fil-np.mean(fil))/np.std(fil) - # data.append(fil) # Restar la media para centrar la señal - - if sig.label.lower() in selectResp['Channel_Names'][28].lower() or sig.label.lower() in selectResp['Channel_Names'][29].lower(): - # EFFORT RESPIRATORY - elif sig.label.lower() in selectResp['Channel_Names'][30].lower() or sig.label.lower() in selectResp['Channel_Names'][31].lower(): - # RESPIRATORY Flujo - fil = EEG_functions.butter_bandpass_filter(data, lowcut=0.01, highcut=4, fs=fs, order=4) - Resp_features.peakedness_application(fil, stage=sig.label, plotflag = True, subjet =1) - elif sig.label.lower() in selectResp['Channel_Names'][32].lower() or sig.label.lower() in selectResp['Channel_Names'][33].lower(): - # CEPAP - elif sig.label.lower() in selectResp['Channel_Names'][34].lower(): - #O2 SATURATION - - - - # time_dt = pd.to_datetime(time_new, unit='s') - # # Plot raw and filtered signals - # fig = make_subplots(specs=[[{"secondary_y": True}]]) - # fig.add_trace(go.Scattergl(x=time_dt[::10], y=data[::10], name=sig.label, mode='lines'),secondary_y=False,row=1, col=1) - # fig.add_trace(go.Scattergl(x=time_dt[::10], y=fil[::10], name=f"Normalized {sig.label}", mode='lines'), secondary_y=True,row=1, col=1) - # fig.update_yaxes(title_text="Amplitud Original (uV)", secondary_y=False) - # fig.update_yaxes(title_text="Valor Normalizado (Z-score)", secondary_y=True) - # # update x axis to make time format - # fig.update_xaxes( - # tickformat="%H:%M:%S", # Formato de hora:minuto:segundo - # row=1, col=1 - # ) - # fig.show() - - # Plot spectrogram of raw and filtered signals - # fig = make_subplots(specs=[[{"secondary_y": True}]]) - # fig.add_trace(go.Scattergl(x=time_dt[::10], y=data[::10], name=sig.label, 
mode='lines'),secondary_y=False,row=1, col=1) - # fig.add_trace(go.Scattergl(x=time_dt[::10], y=fil[::10], name=f"Normalized {sig.label}", mode='lines'), secondary_y=True,row=1, col=1) - # fig.update_yaxes(title_text="Amplitud Original (uV)", secondary_y=False) - + name = "" + elif label.lower() in selectResp['Channel_Names'][33].lower(): + # CEPAP + name = "" + + if name != "": + DSinNan = d[0][~np.isnan(d[0])] # Eliminar NaN antes de calcular min y max + if len(DSinNan) != 0: + maximo = DSinNan.max() + minimo = DSinNan.min() + media = np.mean(DSinNan) + mediana = np.median(DSinNan) + std = DSinNan.std() + write = False + if name == "Nasal" and UsedNasal< d[-1]: + UsedNasal = d[-1] + write = True + elif name == "Chest" and UsedChest< d[-1]: + UsedChest = d[-1] + write = True + elif name == "Abdomen" and UsedAbdomen< d[-1]: + UsedAbdomen = d[-1] + write = True + elif name == "Flow" and UsedFlow< d[-1]: + UsedFlow = d[-1] + write = True + elif name == "SpO2" and UsedSpO2< d[-1]: + UsedSpO2 = d[-1] + write = True + elif name == "CEPAP" and UsedCepap < d[-1]: + UsedCepap = d[-1] + write = True + if write: + resultados.update({ + name+"_Peakedness_Max": maximo, + name+"_Peakedness_Min": minimo, + name+"_Peakedness_Mean": media, + name+"_Peakedness_Median": mediana, + name+"_Peakedness_Std": std + }) + + elif label.lower() in selectResp['Channel_Names'][34].lower(): + #O2 SATURATION + if np.max(data) < 2: + data = np.round((data/1.055)*100) + + lim = 0.7 + # Quitar los valores por debajo de lim y sus 10 valores anteriores y posteriores para quedarnos solo con los eventos de desaturación + dataReal = data.copy() + for i in range(len(data)): + if data[i] < lim: + start = int(max(0, i-fs*2)) + end = int(min(len(data), i+fs*2)) + dataReal[start:end] = np.nan # Marcar los valores por debajo del límite y sus alrededores como NaN + + CET90 = dataReal[dataReal < 90] + # CET90SinNan = CET90[~np.isnan(CET90)] # Eliminar NaN antes de calcular min y max + CET90 = 
len(CET90)/len(data) + dataRealSinNan = dataReal[~np.isnan(dataReal)] # Eliminar NaN antes de calcular min y max + if len(dataRealSinNan)>0: + maximo = dataRealSinNan.max() + minimo = dataRealSinNan.min() + std = dataRealSinNan.std() + media = dataRealSinNan.mean() + ODI_mean, ODI_deepness = Resp_features.ODI_application(dataReal, fs, plotflag=False, subjet=1) + + resultados.update({"SpO2_Max": maximo, + "SpO2_Min": minimo, + "SpO2_Mean": media, + "SpO2_Std": std, + "CET90": CET90, + "ODI_Mean": ODI_mean, + "ODI_deepness": ODI_deepness, + }) + + return pd.DataFrame(resultados) From de1a5cfbf2da2151cd2cbdae825b08fa2f9013e5 Mon Sep 17 00:00:00 2001 From: rolopu1 <718694@unizar.es> Date: Fri, 27 Mar 2026 13:58:12 +0100 Subject: [PATCH 25/38] Cambio de tipo de datos a np.array y concatenar features --- src/eeg_processing.py | 2 +- src/resp_processing.py | 2 +- team_code.py | 15 +++++++++++++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/eeg_processing.py b/src/eeg_processing.py index 760b474..df1303f 100644 --- a/src/eeg_processing.py +++ b/src/eeg_processing.py @@ -137,5 +137,5 @@ def processEEG(physiological_data, physiological_fs, csv_path): **d, **patient_summar }) - df_results = pd.DataFrame(results) + df_results = np.array(results) return df_results \ No newline at end of file diff --git a/src/resp_processing.py b/src/resp_processing.py index d8fa8bf..8bc284c 100644 --- a/src/resp_processing.py +++ b/src/resp_processing.py @@ -135,4 +135,4 @@ def processResp(physiological_data, physiological_fs, csv_path): "ODI_deepness": ODI_deepness, }) - return pd.DataFrame(resultados) + return np.array(resultados) diff --git a/team_code.py b/team_code.py index 2185a52..7296cc1 100644 --- a/team_code.py +++ b/team_code.py @@ -22,7 +22,8 @@ from tqdm import tqdm from helper_code import * - +from src.resp_processing import processResp +from src.eeg_processing import processEEG ################################################################################ 
# Path & Constant Configuration (Added for Robustness) ################################################################################ @@ -90,7 +91,17 @@ def process_training_record(record, data_folder, demographics_cache, diagnosis_c physiological_fs, csv_path=csv_path ) - + resp_features = processResp( + physiological_data, + physiological_fs, + csv_path=csv_path + ) + eeg_features = processEEG( + physiological_data, + physiological_fs, + csv_path=csv_path + ) + physiological_features = np.concatenate([physiological_features, resp_features, eeg_features], axis = 1) algorithmic_annotations_file = os.path.join( data_folder, ALGORITHMIC_ANNOTATIONS_SUBFOLDER, From 6ea8ddf4caf583b377cdcb66c7dfb74521a0b7d7 Mon Sep 17 00:00:00 2001 From: rolopu1 <718694@unizar.es> Date: Fri, 27 Mar 2026 14:14:44 +0100 Subject: [PATCH 26/38] =?UTF-8?q?A=C3=B1adir=20ficheros=20Sofia?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/lib/compute_hrv_hrf.py | 155 +++++++++++++++++++++++++++ src/lib/interpolate_NN.py | 40 +++++++ src/lib/openECGfunction.py | 39 +++++++ src/lib/pan_tompkins.py | 184 +++++++++++++++++++++++++++++++++ src/lib/remove_ectopic_beat.py | 41 ++++++++ src/main_ECG_ver2.py | 152 +++++++++++++++++++++++++++ 6 files changed, 611 insertions(+) create mode 100644 src/lib/compute_hrv_hrf.py create mode 100644 src/lib/interpolate_NN.py create mode 100644 src/lib/openECGfunction.py create mode 100644 src/lib/pan_tompkins.py create mode 100644 src/lib/remove_ectopic_beat.py create mode 100644 src/main_ECG_ver2.py diff --git a/src/lib/compute_hrv_hrf.py b/src/lib/compute_hrv_hrf.py new file mode 100644 index 0000000..15b0422 --- /dev/null +++ b/src/lib/compute_hrv_hrf.py @@ -0,0 +1,155 @@ +import numpy as np +from scipy.signal import lombscargle + +def compute_HRV_HRF(NN, SF): + """ + NN: array of NN intervals in seconds + SF: sampling frequency (Hz) + """ + + NN = np.asarray(NN).flatten() + + # =============================== 
+ # ΔNN + # =============================== + dNN = np.diff(NN) + + n = 1 + thr = n / SF + + # Classification + acc = dNN <= -thr + dec = dNN >= thr + noch = (dNN > -thr) & (dNN < thr) + + # Sign representation + sign_dNN = np.zeros_like(dNN) + sign_dNN[acc] = -1 + sign_dNN[dec] = 1 + + N = len(dNN) + + # =============================== + # PIP (Inflection Points) + # =============================== + inflection = 0 + + for i in range(N - 1): + if (dNN[i+1] * dNN[i] <= 0) and (dNN[i+1] != dNN[i]): + inflection += 1 + + PIP = (inflection / (N - 1)) * 100 if N > 1 else np.nan + + # =============================== + # Segment Detection + # =============================== + segments = [] + if N > 0: + current_seg = sign_dNN[0] + length_seg = 1 + + for i in range(1, N): + if sign_dNN[i] == current_seg and sign_dNN[i] != 0: + length_seg += 1 + else: + if current_seg != 0: + segments.append(length_seg) + current_seg = sign_dNN[i] + length_seg = 1 + + # Add last segment + if current_seg != 0: + segments.append(length_seg) + + segments = np.array(segments) + + # =============================== + # PNNLS & PNNSS + # =============================== + if len(segments) > 0: + long_segments = segments[segments >= 3] + short_segments = segments[segments < 3] + + PNNLS = np.sum(long_segments) / N * 100 + PNNSS = np.sum(short_segments) / np.sum(segments) * 100 + else: + PNNLS = np.nan + PNNSS = np.nan + + # =============================== + # Time-domain HRV + # =============================== + win_length = 300 # seconds + + time = np.cumsum(NN) + + AVNN_all = [] + SDNN_all = [] + RMSSD_all = [] + + i = 0 + while i < len(NN): + t_start = time[i] + t_end = t_start + win_length + + idx = np.where((time >= t_start) & (time < t_end))[0] + + if len(idx) >= 150: + NN_win = NN[idx] + + AVNN_all.append(np.nanmean(NN_win)) + SDNN_all.append(np.nanstd(NN_win, ddof=1)) + + diffNN = np.diff(NN_win) + RMSSD_all.append(np.sqrt(np.nanmean(diffNN**2))) + + next_i = np.where(time >= t_end)[0] + if 
len(next_i) == 0: + break + i = next_i[0] + + AVNN = np.nanmean(AVNN_all) if len(AVNN_all) > 0 else np.nan + SDNN = np.nanmean(SDNN_all) if len(SDNN_all) > 0 else np.nan + RMSSD = np.nanmean(RMSSD_all) if len(RMSSD_all) > 0 else np.nan + + # =============================== + # Frequency-domain (HF) + # =============================== + HF_all = [] + + for _ in range(len(AVNN_all)): + # NOTE: simplified like MATLAB version + NN_win = NN.copy() + t_win = np.cumsum(NN_win) + + # Convert to angular frequency + f = np.linspace(0.01, 0.5, 1000) + angular_f = 2 * np.pi * f + + # Remove mean (important for Lomb) + NN_detrended = NN_win - np.mean(NN_win) + + Pxx = lombscargle(t_win, NN_detrended, angular_f, normalize=True) + + HF_band = (f >= 0.15) & (f <= 0.4) + + HF_power = np.trapezoid(Pxx[HF_band], f[HF_band]) + + HF_all.append(HF_power) + + HF = np.nanmean(HF_all) if len(HF_all) > 0 else np.nan + + # =============================== + # OUTPUT + # =============================== + results = { + "PIP": PIP, + "PNNLS": PNNLS, + "PNNSS": PNNSS, + "AVNN": AVNN, + "SDNN": SDNN, + "RMSSD": RMSSD, + "HF": HF + } + + return results \ No newline at end of file diff --git a/src/lib/interpolate_NN.py b/src/lib/interpolate_NN.py new file mode 100644 index 0000000..987019f --- /dev/null +++ b/src/lib/interpolate_NN.py @@ -0,0 +1,40 @@ +import numpy as np +from scipy.interpolate import PchipInterpolator + +def interpolate_NN_pchip(NN, maxGap): + """ + NN: array of NN intervals (seconds) + maxGap: max number of consecutive NaNs allowed for interpolation + """ + + NN = np.asarray(NN).flatten() + NN_interp = NN.copy() + + nan_idx = np.isnan(NN) + + # Find NaN segments + d = np.diff(np.concatenate(([0], nan_idx.astype(int), [0]))) + start_idx = np.where(d == 1)[0] + end_idx = np.where(d == -1)[0] - 1 + + for k in range(len(start_idx)): + seg_len = end_idx[k] - start_idx[k] + 1 + + if seg_len <= maxGap: + left = start_idx[k] - 1 + right = end_idx[k] + 1 + + # Check bounds + if (left >= 0 
def openECG(physiological_data_file, patient_id):
    """Run the ECG pipeline on the first ECG channel of an EDF recording.

    Parameters
    ----------
    physiological_data_file
        Path of the EDF file; passed unchanged to ``pyedflib.EdfReader``.
    patient_id
        Identifier forwarded unchanged to ``ECGprocessing``.

    Returns
    -------
    pandas.DataFrame
        The rows produced by ``ECGprocessing`` re-indexed from 0, or an
        empty DataFrame when ``ECGprocessing`` returns ``None``.

    Raises
    ------
    ValueError
        If no channel label contains ``"ecg"`` or ``"ekg"``
        (case-insensitive).
    """
    ecg_keywords = ['ecg', 'ekg']

    f = pyedflib.EdfReader(physiological_data_file)
    try:
        signal_labels = f.getSignalLabels()
        print(signal_labels)

        idx = None
        for i, label in enumerate(signal_labels):
            label_clean = label.lower().strip()

            # Substring match: the first label mentioning ECG/EKG wins.
            if any(keyword in label_clean for keyword in ecg_keywords):
                idx = i
                break

        if idx is None:
            raise ValueError("No ECG channel found")

        print("ECG channel:", signal_labels[idx])

        ecg_signal = f.readSignal(idx)
        fs = f.getSampleFrequency(idx)
    finally:
        # Release the file handle on every path, including the
        # "no ECG channel" error above.
        f.close()

    all_results = ECGprocessing(ecg_signal, fs, patient_id)

    # The original read an accumulator that was never initialised, which
    # raised UnboundLocalError on every call. Return the per-recording
    # result directly instead.
    if all_results is None:
        return pd.DataFrame()
    return pd.concat([all_results], ignore_index=True)
== 200: + # Low-pass + b, a = butter(3, 12*2/fs, btype='low') + ecg_l = filtfilt(b, a, ecg) + ecg_l = ecg_l / np.max(np.abs(ecg_l)) + + # High-pass + b, a = butter(3, 5*2/fs, btype='high') + ecg_h = filtfilt(b, a, ecg_l) + ecg_h = ecg_h / np.max(np.abs(ecg_h)) + else: + b, a = butter(3, [5*2/fs, 15*2/fs], btype='band') + ecg_h = filtfilt(b, a, ecg) + ecg_h = ecg_h / np.max(np.abs(ecg_h)) + + # ===================== DERIVATIVE ===================== # + if fs != 200: + int_c = int((5 - 1) / (fs * (1/40))) + base = np.array([1, 2, 0, -2, -1]) * (1/8) * fs + x_old = np.linspace(1, 5, 5) + x_new = np.linspace(1, 5, int_c) + b = np.interp(x_new, x_old, base) + else: + b = np.array([1, 2, 0, -2, -1]) * (1/8) * fs + + ecg_d = filtfilt(b, [1], ecg_h) + ecg_d = ecg_d / np.max(np.abs(ecg_d)) + + # ===================== SQUARING ===================== # + ecg_s = ecg_d ** 2 + + # ===================== MOVING WINDOW ===================== # + win = int(round(0.150 * fs)) + ecg_m = np.convolve(ecg_s, np.ones(win)/win, mode='same') + delay += win // 2 + + # ===================== PEAK DETECTION ===================== # + locs, _ = find_peaks(ecg_m, distance=int(0.2 * fs)) + pks = ecg_m[locs] + + LLp = len(pks) + + qrs_i = [] + qrs_c = [] + qrs_i_raw = [] + qrs_amp_raw = [] + + nois_i = [] + nois_c = [] + + # Threshold initialization + THR_SIG = np.max(ecg_m[:2*fs]) / 3 + THR_NOISE = np.mean(ecg_m[:2*fs]) / 2 + SIG_LEV = THR_SIG + NOISE_LEV = THR_NOISE + + THR_SIG1 = np.max(ecg_h[:2*fs]) / 3 + THR_NOISE1 = np.mean(ecg_h[:2*fs]) / 2 + SIG_LEV1 = THR_SIG1 + NOISE_LEV1 = THR_NOISE1 + + Beat_C = 0 + Beat_C1 = 0 + + for i in range(LLp): + + loc = locs[i] + + # Find peak in filtered signal + left = max(0, loc - int(0.150 * fs)) + right = loc + + if right < len(ecg_h): + segment = ecg_h[left:right+1] + if len(segment) > 0: + y_i = np.max(segment) + x_i = np.argmax(segment) + else: + continue + else: + continue + + # RR interval update + if len(qrs_i) >= 9: + diffRR = np.diff(qrs_i[-8:]) + 
mean_RR = np.mean(diffRR) + comp = qrs_i[-1] - qrs_i[-2] + + if comp <= 0.92 * mean_RR or comp >= 1.16 * mean_RR: + THR_SIG *= 0.5 + THR_SIG1 *= 0.5 + else: + m_selected_RR = mean_RR + + test_m = m_selected_RR if m_selected_RR else mean_RR + + # ===================== SEARCH BACK ===================== # + if test_m and len(qrs_i) > 0: + if (loc - qrs_i[-1]) >= int(1.66 * test_m): + + sb_left = qrs_i[-1] + int(0.2 * fs) + sb_right = loc - int(0.2 * fs) + + if sb_right > sb_left: + segment = ecg_m[sb_left:sb_right] + if len(segment) > 0: + pks_temp = np.max(segment) + locs_temp = sb_left + np.argmax(segment) + + if pks_temp > THR_NOISE: + qrs_c.append(pks_temp) + qrs_i.append(locs_temp) + + seg = ecg_h[max(0, locs_temp-int(0.150*fs)):locs_temp] + if len(seg) > 0: + y_i_t = np.max(seg) + x_i_t = np.argmax(seg) + + if y_i_t > THR_NOISE1: + qrs_i_raw.append(locs_temp - int(0.150*fs) + x_i_t) + qrs_amp_raw.append(y_i_t) + SIG_LEV1 = 0.25*y_i_t + 0.75*SIG_LEV1 + + SIG_LEV = 0.25*pks_temp + 0.75*SIG_LEV + + # ===================== CLASSIFICATION ===================== # + if pks[i] >= THR_SIG: + + # T-wave rejection + if len(qrs_i) >= 3: + if (loc - qrs_i[-1]) <= int(0.36 * fs): + + slope1 = np.mean(np.diff(ecg_m[max(0, loc-int(0.075*fs)):loc])) + slope2 = np.mean(np.diff(ecg_m[max(0, qrs_i[-1]-int(0.075*fs)):qrs_i[-1]])) + + if abs(slope1) <= 0.5 * abs(slope2): + NOISE_LEV1 = 0.125*y_i + 0.875*NOISE_LEV1 + NOISE_LEV = 0.125*pks[i] + 0.875*NOISE_LEV + continue + + # Accept QRS + qrs_c.append(pks[i]) + qrs_i.append(loc) + + if y_i >= THR_SIG1: + qrs_i_raw.append(loc - int(0.150*fs) + x_i) + qrs_amp_raw.append(y_i) + SIG_LEV1 = 0.125*y_i + 0.875*SIG_LEV1 + + SIG_LEV = 0.125*pks[i] + 0.875*SIG_LEV + + elif THR_NOISE <= pks[i] < THR_SIG: + NOISE_LEV1 = 0.125*y_i + 0.875*NOISE_LEV1 + NOISE_LEV = 0.125*pks[i] + 0.875*NOISE_LEV + + else: + NOISE_LEV1 = 0.125*y_i + 0.875*NOISE_LEV1 + NOISE_LEV = 0.125*pks[i] + 0.875*NOISE_LEV + + # Update thresholds + THR_SIG = NOISE_LEV + 0.25 * 
def remove_ectopic_beats(NN, window_size, threshold):
    """Replace ectopic NN intervals with the median of a local window.

    Each non-NaN sample is compared with the median of the non-NaN samples
    in a window of ``window_size // 2`` samples on either side, clipped at
    the array edges. A sample is ectopic when its absolute deviation from
    that median exceeds ``threshold * median``; it is then replaced by the
    median. Medians are always taken from the original, uncorrected series.

    Parameters
    ----------
    NN
        Array-like of NN intervals; NaN entries are left untouched and are
        not counted as valid samples.
    window_size
        Total width of the local window, in samples.
    threshold
        Relative deviation from the local median above which a sample is
        replaced (e.g. ``0.10`` for 10 %).

    Returns
    -------
    tuple
        ``(NN_corrected, ectopic_perc)``: a float array of the corrected
        series, and the percentage of valid (non-NaN) samples that were
        replaced. ``ectopic_perc`` is NaN when there are no valid samples.
    """
    # Work in floating point: with an integer input the fractional median
    # written back below would be silently truncated.
    NN = np.asarray(NN, dtype=float).flatten()
    NN_corrected = NN.copy()

    n_samples = NN.size
    half_win = window_size // 2
    ectopic_count = 0
    valid_count = 0

    for i, value in enumerate(NN):
        if np.isnan(value):
            continue

        valid_count += 1

        # Local window, clipped to the array (slice end is exclusive).
        left = max(0, i - half_win)
        right = min(n_samples, i + half_win + 1)

        local_segment = NN[left:right]
        local_segment = local_segment[~np.isnan(local_segment)]

        # Only reachable when window_size is negative (empty slice).
        if local_segment.size == 0:
            continue

        med_val = np.median(local_segment)

        if abs(value - med_val) > threshold * med_val:
            NN_corrected[i] = med_val
            ectopic_count += 1

    # Percentage over valid (non-NaN) samples only.
    if valid_count > 0:
        ectopic_perc = (ectopic_count / valid_count) * 100
    else:
        ectopic_perc = np.nan

    return NN_corrected, ectopic_perc
=============================== + target_fs = 200 + + if fs != target_fs: + num_samples = int(len(ecg_signal) * target_fs / fs) + ecg_signal = resample(ecg_signal, num_samples) + fs = target_fs + + # =============================== + # SEGMENT INTO 5-MIN WINDOWS + # =============================== + win_sec = 300 + win_samples = int(win_sec * fs) + + N = len(ecg_signal) + n_windows = N // win_samples + + if n_windows == 0: + print("Signal too short.") + return None + + # =============================== + # FIND VALID WINDOWS + # =============================== + valid_windows = [] + + for w in range(n_windows): + + idx_start = w * win_samples + idx_end = (w + 1) * win_samples + + ecg_win = ecg_signal[idx_start:idx_end] + + # Quality check + if np.sum(np.isnan(ecg_win)) != 0 or np.sum(ecg_win == 0) > 0.2 * len(ecg_win): + continue + + valid_windows.append(w) + + # =============================== + # PROCESS WINDOWS + # =============================== + HRV_all = [] + + for w in valid_windows: + + idx_start = w * win_samples + idx_end = (w + 1) * win_samples + + ecg_win = ecg_signal[idx_start:idx_end] + + ecg_win = ecg_win - np.mean(ecg_win) + + # --- Filtering --- + # Notch + b, a = butter(3, [59.5/(fs/2), 60.5/(fs/2)], btype='bandstop') + ecg_win = filtfilt(b, a, ecg_win) + + # High-pass + b, a = butter(3, 0.5/(fs/2), btype='high') + ecg_win = filtfilt(b, a, ecg_win) + + # Low-pass + b, a = butter(3, 45/(fs/2), btype='low') + ecg_win = filtfilt(b, a, ecg_win) + + # =============================== + # QRS DETECTION + # =============================== + qrs_amp_raw, R_locs, delay = pan_tompkin(ecg_win, fs, 0) + + if len(R_locs) < 150: + continue + + # =============================== + # NN INTERVALS + # =============================== + NN = np.diff(R_locs) / fs + + # =============================== + # HRV PREPROCESSING + # =============================== + NN, ectopic_perc = remove_ectopic_beats(NN, 40, 0.10) + + NN = interpolate_NN_pchip(NN, 2) + + valid_ratio = 
np.sum(~np.isnan(NN)) / len(NN) + NN = NN[~np.isnan(NN)] + + if valid_ratio < 0.75: + continue + + # =============================== + # HRV + HRF METRICS + # =============================== + res = compute_HRV_HRF(NN, fs) + + meanNN = np.mean(NN) + + HRV_all.append([ + meanNN, + res["PIP"], res["PNNLS"], res["PNNSS"], + res["AVNN"], res["SDNN"], res["RMSSD"], res["HF"], + ectopic_perc + ]) + + # =============================== + # SUBJECT-LEVEL METRICS + # =============================== + if len(HRV_all) == 0: + print("No valid windows.") + return None + + HRV_all = np.array(HRV_all) + + median_vals = np.nanmedian(HRV_all, axis=0) + std_vals = np.nanstd(HRV_all, axis=0) + + # =============================== + # SAVE RESULTS (DataFrame row) + # =============================== + row = pd.DataFrame([{ + "ID": patient_id, + "mNNmed": median_vals[0], "mNNstd": std_vals[0], + "PIP_med": median_vals[1], "PIP_std": std_vals[1], + "PNNLS_med": median_vals[2], "PNNLS_std": std_vals[2], + "PNNSS_med": median_vals[3], "PNNSS_std": std_vals[3], + "AVNN_med": median_vals[4], "AVNN_std": std_vals[4], + "SDNN_med": median_vals[5], "SDNN_std": std_vals[5], + "RMSSD_med": median_vals[6], "RMSSD_std": std_vals[6], + "HF_med": median_vals[7], "HF_std": std_vals[7], + "ECTOPIC_med": median_vals[8], "ECTOPIC_std": std_vals[8], + }]) + + all_results = pd.concat([all_results, row], ignore_index=True) + + return all_results \ No newline at end of file From 14e0672a7d70a8aac0e7d7c16b79846125828eb9 Mon Sep 17 00:00:00 2001 From: dcajal Date: Fri, 27 Mar 2026 16:02:46 +0100 Subject: [PATCH 27/38] Fix import errors --- src/__init__.py | 1 + src/eeg_processing.py | 2 +- src/lib/EEG_functions.py | 20 +++++++++++++++++--- src/lib/Resp_features.py | 23 +++++++++++++++++++---- src/lib/__init__.py | 1 + src/lib/openECGfunction.py | 2 +- src/lib/peakedness.py | 19 ++++++++++++++++--- src/main_ECG_ver2.py | 8 ++++---- src/resp_processing.py | 2 +- src/results_analysis.py | 2 +- 10 files changed, 62 
insertions(+), 18 deletions(-) create mode 100644 src/__init__.py create mode 100644 src/lib/__init__.py diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..6319343 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1 @@ +"""Project source package.""" \ No newline at end of file diff --git a/src/eeg_processing.py b/src/eeg_processing.py index df1303f..04147c6 100644 --- a/src/eeg_processing.py +++ b/src/eeg_processing.py @@ -34,7 +34,7 @@ import pandas as pd import numpy as np import helper_code as helper_code -import lib.EEG_functions as EEG_functions +from .lib import EEG_functions sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) diff --git a/src/lib/EEG_functions.py b/src/lib/EEG_functions.py index 5427f60..83e8ef6 100644 --- a/src/lib/EEG_functions.py +++ b/src/lib/EEG_functions.py @@ -1,12 +1,21 @@ from scipy.signal import butter, filtfilt import numpy as np -import plotly.graph_objects as go -from plotly.subplots import make_subplots from scipy.signal import welch import pandas as pd from scipy import signal from scipy.stats import kurtosis, entropy -import matplotlib.pyplot as plt + +try: + import plotly.graph_objects as go + from plotly.subplots import make_subplots +except ModuleNotFoundError: + go = None + make_subplots = None + +try: + import matplotlib.pyplot as plt +except ModuleNotFoundError: + plt = None def butter_bandpass_filter(data, lowcut, highcut, fs, order=4): nyq = 0.5 * fs # Frecuencia de Nyquist @@ -32,6 +41,8 @@ def butter_bandpass_filter(data, lowcut, highcut, fs, order=4): return y def plot_EEG(df, columns, fs = 200): + if go is None or make_subplots is None: + raise ModuleNotFoundError("plotly is required for plot_EEG") fig = make_subplots(rows=len(columns), cols=1, shared_xaxes=True, @@ -55,6 +66,9 @@ def plot_EEG(df, columns, fs = 200): fig.show() def plot_EEG_sel(sel, name = "EEG_plot_raw.html"): + if go is None or make_subplots is None: + raise ModuleNotFoundError("plotly is 
required for plot_EEG_sel") + fig = make_subplots(rows=len(sel), cols=1, shared_xaxes=True, vertical_spacing=0.02, diff --git a/src/lib/Resp_features.py b/src/lib/Resp_features.py index 553c9fa..216b3ab 100644 --- a/src/lib/Resp_features.py +++ b/src/lib/Resp_features.py @@ -1,18 +1,29 @@ import pandas as pd import numpy as np -import plotly.graph_objs as go -from lib.peakedness import peakednessCost +from .peakedness import peakednessCost from scipy.interpolate import interp1d -import matplotlib.pyplot as plt from scipy.stats import kruskal from scipy.signal import resample, detrend import scipy.fft as fft from scipy.signal import butter, filtfilt +try: + import plotly.graph_objs as go +except ModuleNotFoundError: + go = None + +try: + import matplotlib.pyplot as plt +except ModuleNotFoundError: + plt = None + def plot_resp(Data, subjet = 1, DownPrinting = 2): """ Plot resp data using Plotly. """ + if go is None: + raise ModuleNotFoundError("plotly is required for plot_resp") + if type(Data) == dict: Data = pd.DataFrame(Data[str(subjet)]) Data = Data.iloc[::DownPrinting, :] @@ -119,7 +130,8 @@ def ODI_application(data, fs, plotflag=True, subjet=1): odi_deepness = np.mean(magnitudes) if magnitudes else 0.0 if plotflag: - import matplotlib.pyplot as plt + if plt is None: + raise ModuleNotFoundError("matplotlib is required when plotflag=True") times = np.arange(len(sp)) / fs / 60.0 # minutos plt.figure(figsize=(10, 4)) plt.plot(times, sp.values, label='SpO2') @@ -292,6 +304,9 @@ def Significance_tests(RespData): results = results.reset_index() results.to_excel('./Graphs/kruskal_results.xlsx', index=False) + if plt is None: + raise ModuleNotFoundError("matplotlib is required for Significance_tests plotting") + plt.plot(results['index'], results['p_value']) plt.axhline(y=0.05, color='r', linestyle='--') plt.xlabel('Métrica') diff --git a/src/lib/__init__.py b/src/lib/__init__.py new file mode 100644 index 0000000..665f43f --- /dev/null +++ b/src/lib/__init__.py @@ -0,0 
+1 @@ +"""Signal-processing helper package.""" \ No newline at end of file diff --git a/src/lib/openECGfunction.py b/src/lib/openECGfunction.py index 5937f76..745b1ac 100644 --- a/src/lib/openECGfunction.py +++ b/src/lib/openECGfunction.py @@ -1,5 +1,5 @@ import pyedflib -from main_ECG_ver2 import ECGprocessing +from ..main_ECG_ver2 import ECGprocessing import pandas as pd def openECG(physiological_data_file, patient_id): diff --git a/src/lib/peakedness.py b/src/lib/peakedness.py index 6323aca..9c47f96 100644 --- a/src/lib/peakedness.py +++ b/src/lib/peakedness.py @@ -3,12 +3,21 @@ from numpy.fft import fftshift from numpy.fft import fft from scipy.signal import detrend, find_peaks -import matplotlib.pyplot as plt -import plotly.graph_objs as go -from plotly import subplots from time import time import os +try: + import matplotlib.pyplot as plt +except ModuleNotFoundError: + plt = None + +try: + import plotly.graph_objs as go + from plotly import subplots +except ModuleNotFoundError: + go = None + subplots = None + def setParamFr(Setup): if 'DT' not in Setup.keys(): Setup["DT"] = 5 @@ -265,6 +274,8 @@ def init_module(kk,vars,param, plotflag): # vars["bar_fr"][kk] = f[fj] if plotflag: + if plt is None: + raise ModuleNotFoundError("matplotlib is required when plotflag=True") plt.plot(f, averS) plt.plot(f[fj], averS[fj], '-') plt.title('Initialization - Averaged Spectrum') @@ -605,6 +616,8 @@ def peakednessCost(signals, ts, fs, Setup = {}, title = "", storeGraph = False, t_fin = time() if plotflag: + if go is None or subplots is None: + raise ModuleNotFoundError("plotly is required when plotflag=True") fig = subplots.make_subplots(rows=2,shared_xaxes=True, subplot_titles=('Peak-condition averaged EDR Spectra in '+title,"EDR/RESP signals"), row_heights=[0.7, 0.3]) diff --git a/src/main_ECG_ver2.py b/src/main_ECG_ver2.py index 03aaa63..6206e23 100644 --- a/src/main_ECG_ver2.py +++ b/src/main_ECG_ver2.py @@ -1,10 +1,10 @@ import numpy as np import pandas as pd from 
scipy.signal import butter, filtfilt, resample -from lib.pan_tompkins import pan_tompkin -from lib.compute_hrv_hrf import compute_HRV_HRF -from lib.interpolate_NN import interpolate_NN_pchip -from lib.remove_ectopic_beat import remove_ectopic_beats +from .lib.pan_tompkins import pan_tompkin +from .lib.compute_hrv_hrf import compute_HRV_HRF +from .lib.interpolate_NN import interpolate_NN_pchip +from .lib.remove_ectopic_beat import remove_ectopic_beats def ECGprocessing(ecg_signal, fs, patient_id): all_results = pd.DataFrame() diff --git a/src/resp_processing.py b/src/resp_processing.py index 8bc284c..c158d2b 100644 --- a/src/resp_processing.py +++ b/src/resp_processing.py @@ -1,4 +1,4 @@ -import lib.Resp_features as Resp_features +from .lib import Resp_features import sys import os import pandas as pd diff --git a/src/results_analysis.py b/src/results_analysis.py index c2fef8b..b415a4b 100644 --- a/src/results_analysis.py +++ b/src/results_analysis.py @@ -5,7 +5,7 @@ import plotly.express as px sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -import lib.EEG_functions as EEG_functions +from src.lib import EEG_functions import seaborn as sns import matplotlib.pyplot as plt import plotly.express as px From 6c4f780e55f3221b9f02856c084abac67534fe1d Mon Sep 17 00:00:00 2001 From: rolopu1 <718694@unizar.es> Date: Fri, 27 Mar 2026 17:33:41 +0100 Subject: [PATCH 28/38] Bypass CPAP --- src/resp_processing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/resp_processing.py b/src/resp_processing.py index c158d2b..245144a 100644 --- a/src/resp_processing.py +++ b/src/resp_processing.py @@ -61,6 +61,7 @@ def processResp(physiological_data, physiological_fs, csv_path): print(f"Warning: All values in the signal data for {label} are zero. 
Skipping feature extraction for this channel.") else: name = "" + name = "" #Bypass CPAP elif label.lower() in selectResp['Channel_Names'][33].lower(): # CEPAP name = "" From 4fe3992d51a3096e15e112488457a73674221052 Mon Sep 17 00:00:00 2001 From: rolopu1 <718694@unizar.es> Date: Fri, 27 Mar 2026 17:51:48 +0100 Subject: [PATCH 29/38] Correccion EEG --- src/eeg_processing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/eeg_processing.py b/src/eeg_processing.py index 04147c6..eca4b44 100644 --- a/src/eeg_processing.py +++ b/src/eeg_processing.py @@ -93,8 +93,9 @@ def processEEG(physiological_data, physiological_fs, csv_path): labels.append(col) # columns = Bipolar.columns.tolist() else: + labels2 = labels labels = [] - for l in labels: + for l in labels2: # print(f"Archivo: {file}, Canal: {sig.label}, Frecuencia de muestreo: {sig.sampling_frequency} Hz, Duración: {len(sig.data)/sig.sampling_frequency:.2f} segundos") fs = physiological_fs[l] fil = EEG_functions.butter_bandpass_filter(physiological_data[l], lowcut=0.3, highcut=35, fs=fs, order=4) From 7bf25001fa0ca8bdbd88348abcb00805800958d3 Mon Sep 17 00:00:00 2001 From: dcajal Date: Fri, 27 Mar 2026 22:41:30 +0100 Subject: [PATCH 30/38] Update smoke dataset configuration to use 10 subjects by default and enhance demographic filtering process --- docs/03_smoke_dataset.md | 5 +-- scripts/create_smoke.ps1 | 69 ++++++++++++++++++++++++++++++++++++---- scripts/create_smoke.sh | 66 ++++++++++++++++++++++++++++++++------ 3 files changed, 122 insertions(+), 18 deletions(-) diff --git a/docs/03_smoke_dataset.md b/docs/03_smoke_dataset.md index 5ba827b..7c0e3d7 100644 --- a/docs/03_smoke_dataset.md +++ b/docs/03_smoke_dataset.md @@ -2,7 +2,7 @@ Entrenar con el dataset completo tarda aproximadamente 30–40 minutos con el modelo de ejemplo. -Para desarrollo utilizamos un dataset reducido (5 sujetos). +Para desarrollo utilizamos un dataset reducido (10 sujetos por defecto). 
Este documento describe cuándo y por qué usar smoke. Los comandos de ejecución están centralizados en `docs/04_run_script.md`. @@ -11,9 +11,10 @@ Los comandos de ejecución están centralizados en `docs/04_run_script.md`. ## Qué incluye -- Muestra reducida del dataset (5 sujetos) +- Muestra reducida del dataset (10 sujetos por defecto) - Estructura compatible con el flujo oficial del proyecto - Directorio de salida en `data/training_smoke/` +- `demographics.csv` filtrado para que solo incluya los registros copiados al smoke ## Para qué se usa diff --git a/scripts/create_smoke.ps1 b/scripts/create_smoke.ps1 index bd39bcd..d66aaff 100644 --- a/scripts/create_smoke.ps1 +++ b/scripts/create_smoke.ps1 @@ -10,7 +10,7 @@ $FULL_DATA_PATH = "data/training_set" # <-- CHANGE THIS IF NEEDED $SMOKE_PATH = "data/training_smoke" -$N_RECORDS = 5 +$N_RECORDS = 10 Write-Host "Creating smoke dataset..." Write-Host "Source: $FULL_DATA_PATH" @@ -19,8 +19,7 @@ Write-Host "Destination: $SMOKE_PATH" Remove-Item -Recurse -Force $SMOKE_PATH -ErrorAction SilentlyContinue New-Item -ItemType Directory -Force -Path $SMOKE_PATH | Out-Null -# Copy demographics -Copy-Item "$FULL_DATA_PATH/demographics.csv" "$SMOKE_PATH/demographics.csv" +$selectedRecords = New-Object System.Collections.Generic.List[object] # Select first N EDF files $edfs = Get-ChildItem "$FULL_DATA_PATH/physiological_data" -Recurse -Filter *.edf | @@ -32,10 +31,68 @@ foreach ($f in $edfs) { $target = Join-Path $SMOKE_PATH $rel New-Item -ItemType Directory -Force -Path (Split-Path $target) | Out-Null Copy-Item $f.FullName $target + + $stem = [System.IO.Path]::GetFileNameWithoutExtension($f.Name) + $parts = $stem -split '_ses-' + $selectedRecords.Add([pscustomobject]@{ + SiteID = $f.Directory.Name + Patient = $parts[0] + Session = $parts[1] + }) | Out-Null } -# Copy full annotation folders (simpler and robust) -Copy-Item "$FULL_DATA_PATH/algorithmic_annotations" "$SMOKE_PATH/algorithmic_annotations" -Recurse -ErrorAction 
SilentlyContinue -Copy-Item "$FULL_DATA_PATH/human_annotations" "$SMOKE_PATH/human_annotations" -Recurse -ErrorAction SilentlyContinue +# Copy only annotation EDFs for the selected smoke records. +foreach ($record in $selectedRecords) { + $algoSource = Join-Path $FULL_DATA_PATH "algorithmic_annotations/$($record.SiteID)/$($record.Patient)_ses-$($record.Session)_caisr_annotations.edf" + $algoTarget = Join-Path $SMOKE_PATH "algorithmic_annotations/$($record.SiteID)/$($record.Patient)_ses-$($record.Session)_caisr_annotations.edf" + if (Test-Path $algoSource) { + New-Item -ItemType Directory -Force -Path (Split-Path $algoTarget) | Out-Null + Copy-Item $algoSource $algoTarget + } + + $humanSource = Join-Path $FULL_DATA_PATH "human_annotations/$($record.SiteID)/$($record.Patient)_ses-$($record.Session)_expert_annotations.edf" + $humanTarget = Join-Path $SMOKE_PATH "human_annotations/$($record.SiteID)/$($record.Patient)_ses-$($record.Session)_expert_annotations.edf" + if (Test-Path $humanSource) { + New-Item -ItemType Directory -Force -Path (Split-Path $humanTarget) | Out-Null + Copy-Item $humanSource $humanTarget + } +} + +# Filter demographics to the copied smoke records. 
+$env:SMOKE_FULL_DATA_PATH = (Resolve-Path $FULL_DATA_PATH).Path +$env:SMOKE_PATH = (Resolve-Path $SMOKE_PATH).Path +python -c @" +import csv +import os +from pathlib import Path + +full_data = Path(os.environ['SMOKE_FULL_DATA_PATH']) +smoke_path = Path(os.environ['SMOKE_PATH']) + +source_csv = full_data / 'demographics.csv' +target_csv = smoke_path / 'demographics.csv' +phys_root = smoke_path / 'physiological_data' + +selected_records = set() +for edf_path in phys_root.rglob('*.edf'): + site_id = edf_path.parent.name + patient_part, session_part = edf_path.stem.rsplit('_ses-', 1) + selected_records.add((site_id, patient_part, session_part)) + +with source_csv.open('r', newline='', encoding='utf-8') as source_file: + reader = csv.DictReader(source_file) + rows = [ + row for row in reader + if (row['SiteID'], row['BidsFolder'], str(row['SessionID'])) in selected_records + ] + fieldnames = reader.fieldnames + +with target_csv.open('w', newline='', encoding='utf-8') as target_file: + writer = csv.DictWriter(target_file, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) +"@ +Remove-Item Env:SMOKE_FULL_DATA_PATH -ErrorAction SilentlyContinue +Remove-Item Env:SMOKE_PATH -ErrorAction SilentlyContinue Write-Host "Smoke dataset created successfully." \ No newline at end of file diff --git a/scripts/create_smoke.sh b/scripts/create_smoke.sh index 34bd020..9562438 100644 --- a/scripts/create_smoke.sh +++ b/scripts/create_smoke.sh @@ -13,7 +13,7 @@ set -euo pipefail FULL_DATA_PATH="${FULL_DATA_PATH:-data/training_set}" # Override with env var if needed SMOKE_PATH="data/training_smoke" -N_RECORDS="${N_RECORDS:-5}" +N_RECORDS="${N_RECORDS:-10}" echo "Creating smoke dataset..." 
echo "Source: ${FULL_DATA_PATH}" @@ -22,8 +22,8 @@ echo "Destination: ${SMOKE_PATH}" rm -rf "${SMOKE_PATH}" mkdir -p "${SMOKE_PATH}" -# Copy demographics -cp "${FULL_DATA_PATH}/demographics.csv" "${SMOKE_PATH}/demographics.csv" +selected_records_file="$(mktemp)" +trap 'rm -f "${selected_records_file}"' EXIT # Select first N EDF files while IFS= read -r file_path; do @@ -31,17 +31,63 @@ while IFS= read -r file_path; do target_path="${SMOKE_PATH}/${rel_path}" mkdir -p "$(dirname "${target_path}")" cp "${file_path}" "${target_path}" + stem="$(basename "${file_path}" .edf)" + patient_part="${stem%_ses-*}" + session_part="${stem##*_ses-}" + site_id="$(basename "$(dirname "${file_path}")")" + printf '%s,%s,%s\n' "${site_id}" "${patient_part}" "${session_part}" >> "${selected_records_file}" done < <( find "${FULL_DATA_PATH}/physiological_data" -type f -name "*.edf" | sort | head -n "${N_RECORDS}" ) -# Copy full annotation folders (simpler and robust) -if [[ -d "${FULL_DATA_PATH}/algorithmic_annotations" ]]; then - cp -R "${FULL_DATA_PATH}/algorithmic_annotations" "${SMOKE_PATH}/algorithmic_annotations" -fi +# Copy only annotation EDFs for the selected smoke records. 
+while IFS=',' read -r site_id patient_part session_part; do + algo_source="${FULL_DATA_PATH}/algorithmic_annotations/${site_id}/${patient_part}_ses-${session_part}_caisr_annotations.edf" + algo_target="${SMOKE_PATH}/algorithmic_annotations/${site_id}/${patient_part}_ses-${session_part}_caisr_annotations.edf" + if [[ -f "${algo_source}" ]]; then + mkdir -p "$(dirname "${algo_target}")" + cp "${algo_source}" "${algo_target}" + fi -if [[ -d "${FULL_DATA_PATH}/human_annotations" ]]; then - cp -R "${FULL_DATA_PATH}/human_annotations" "${SMOKE_PATH}/human_annotations" -fi + human_source="${FULL_DATA_PATH}/human_annotations/${site_id}/${patient_part}_ses-${session_part}_expert_annotations.edf" + human_target="${SMOKE_PATH}/human_annotations/${site_id}/${patient_part}_ses-${session_part}_expert_annotations.edf" + if [[ -f "${human_source}" ]]; then + mkdir -p "$(dirname "${human_target}")" + cp "${human_source}" "${human_target}" + fi +done < "${selected_records_file}" + +# Filter demographics to the copied smoke records. 
+python - <<'PY' +import csv +from pathlib import Path + +full_data = Path("data/training_set") +smoke_path = Path("data/training_smoke") + +source_csv = full_data / "demographics.csv" +target_csv = smoke_path / "demographics.csv" +phys_root = smoke_path / "physiological_data" + +selected_records = set() +for edf_path in phys_root.rglob("*.edf"): + site_id = edf_path.parent.name + stem = edf_path.stem + patient_part, session_part = stem.rsplit("_ses-", 1) + selected_records.add((site_id, patient_part, session_part)) + +with source_csv.open("r", newline="", encoding="utf-8") as source_file: + reader = csv.DictReader(source_file) + rows = [ + row for row in reader + if (row["SiteID"], row["BidsFolder"], str(row["SessionID"])) in selected_records + ] + fieldnames = reader.fieldnames + +with target_csv.open("w", newline="", encoding="utf-8") as target_file: + writer = csv.DictWriter(target_file, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) +PY echo "Smoke dataset created successfully." From a550325c5fe6af6a786dda7766bbd153252cf2ed Mon Sep 17 00:00:00 2001 From: dcajal Date: Fri, 27 Mar 2026 22:52:10 +0100 Subject: [PATCH 31/38] Update smoke dataset configuration to use 5 subjects by default and enhance feature extraction process --- docs/03_smoke_dataset.md | 4 +-- scripts/create_smoke.ps1 | 2 +- team_code.py | 78 +++++++++++++++++++++++++++++----------- 3 files changed, 61 insertions(+), 23 deletions(-) diff --git a/docs/03_smoke_dataset.md b/docs/03_smoke_dataset.md index 7c0e3d7..ae4ee23 100644 --- a/docs/03_smoke_dataset.md +++ b/docs/03_smoke_dataset.md @@ -2,7 +2,7 @@ Entrenar con el dataset completo tarda aproximadamente 30–40 minutos con el modelo de ejemplo. -Para desarrollo utilizamos un dataset reducido (10 sujetos por defecto). +Para desarrollo utilizamos un dataset reducido (5 sujetos por defecto). Este documento describe cuándo y por qué usar smoke. Los comandos de ejecución están centralizados en `docs/04_run_script.md`. 
@@ -11,7 +11,7 @@ Los comandos de ejecución están centralizados en `docs/04_run_script.md`. ## Qué incluye -- Muestra reducida del dataset (10 sujetos por defecto) +- Muestra reducida del dataset (5 sujetos por defecto) - Estructura compatible con el flujo oficial del proyecto - Directorio de salida en `data/training_smoke/` - `demographics.csv` filtrado para que solo incluya los registros copiados al smoke diff --git a/scripts/create_smoke.ps1 b/scripts/create_smoke.ps1 index d66aaff..c8f45dd 100644 --- a/scripts/create_smoke.ps1 +++ b/scripts/create_smoke.ps1 @@ -10,7 +10,7 @@ $FULL_DATA_PATH = "data/training_set" # <-- CHANGE THIS IF NEEDED $SMOKE_PATH = "data/training_smoke" -$N_RECORDS = 10 +$N_RECORDS = 5 Write-Host "Creating smoke dataset..." Write-Host "Source: $FULL_DATA_PATH" diff --git a/team_code.py b/team_code.py index 7296cc1..c23df14 100644 --- a/team_code.py +++ b/team_code.py @@ -22,8 +22,8 @@ from tqdm import tqdm from helper_code import * -from src.resp_processing import processResp -from src.eeg_processing import processEEG +from src.resp_processing import RESP_FEATURE_LENGTH, processResp +from src.eeg_processing import EEG_FEATURE_LENGTH, processEEG ################################################################################ # Path & Constant Configuration (Added for Robustness) ################################################################################ @@ -67,6 +67,50 @@ def get_rename_rules(csv_path): return rename_rules +def _coerce_feature_vector(features): + vector = np.asarray(features, dtype=np.float32).reshape(-1) + return np.nan_to_num(vector, nan=0.0, posinf=0.0, neginf=0.0) + + +def _extract_optional_features(extractor, expected_length, *args, **kwargs): + vector = _coerce_feature_vector(extractor(*args, **kwargs)) + if vector.size != expected_length: + raise ValueError( + f"{extractor.__name__} returned {vector.size} features; expected {expected_length}." 
+ ) + return vector + + +def extract_extended_physiological_features(physiological_data, physiological_fs, csv_path=DEFAULT_CSV_PATH): + base_features = _coerce_feature_vector( + extract_physiological_features(physiological_data, physiological_fs, csv_path=csv_path) + ) + + try: + resp_features = _extract_optional_features( + processResp, + RESP_FEATURE_LENGTH, + physiological_data, + physiological_fs, + csv_path=csv_path, + ) + except Exception: + resp_features = np.zeros(RESP_FEATURE_LENGTH, dtype=np.float32) + + try: + eeg_features = _extract_optional_features( + processEEG, + EEG_FEATURE_LENGTH, + physiological_data, + physiological_fs, + csv_path=csv_path, + ) + except Exception: + eeg_features = np.zeros(EEG_FEATURE_LENGTH, dtype=np.float32) + + return np.hstack([base_features, resp_features, eeg_features]).astype(np.float32) + + def process_training_record(record, data_folder, demographics_cache, diagnosis_cache, csv_path): patient_id = record[HEADERS['bids_folder']] site_id = record[HEADERS['site_id']] @@ -86,30 +130,22 @@ def process_training_record(record, data_folder, demographics_cache, diagnosis_c return patient_id, None, None, f"Missing physiological data for {patient_id}. Skipping..." 
physiological_data, physiological_fs = load_signal_data(physiological_data_file) - physiological_features = extract_physiological_features( - physiological_data, - physiological_fs, - csv_path=csv_path - ) - resp_features = processResp( + physiological_features = extract_extended_physiological_features( physiological_data, physiological_fs, csv_path=csv_path ) - eeg_features = processEEG( - physiological_data, - physiological_fs, - csv_path=csv_path - ) - physiological_features = np.concatenate([physiological_features, resp_features, eeg_features], axis = 1) algorithmic_annotations_file = os.path.join( data_folder, ALGORITHMIC_ANNOTATIONS_SUBFOLDER, site_id, f"{patient_id}_ses-{session_id}_caisr_annotations.edf" ) - algorithmic_annotations, algorithmic_fs = load_signal_data(algorithmic_annotations_file) - algorithmic_features = extract_algorithmic_annotations_features(algorithmic_annotations) + if os.path.exists(algorithmic_annotations_file): + algorithmic_annotations, _ = load_signal_data(algorithmic_annotations_file) + algorithmic_features = extract_algorithmic_annotations_features(algorithmic_annotations) + else: + algorithmic_features = np.zeros(12, dtype=np.float32) label = diagnosis_cache.get(patient_id) @@ -215,6 +251,9 @@ def train_model(data_folder, model_folder, verbose, csv_path=DEFAULT_CSV_PATH): features = np.asarray(features, dtype=np.float32) labels = np.asarray(labels, dtype=bool) + if features.size == 0 or features.ndim != 2 or features.shape[0] == 0: + raise ValueError('No valid training samples were extracted. Review feature extraction logs for the skipped records.') + # Train the models on the features. 
if verbose: print('Training the model on the data...') @@ -294,10 +333,9 @@ def run_model(model, record, data_folder, verbose): if os.path.exists(phys_file): phys_data, phys_fs = load_signal_data(phys_file) # Ensure csv_path is accessible or defined - physiological_features = extract_physiological_features(phys_data, phys_fs) + physiological_features = extract_extended_physiological_features(phys_data, phys_fs) else: - # Fallback to zeros if file is missing (length 49) - physiological_features = np.zeros(49) + physiological_features = np.zeros(49 + RESP_FEATURE_LENGTH + EEG_FEATURE_LENGTH, dtype=np.float32) # Load Algorithmic Annotations algo_file = os.path.join(data_folder, ALGORITHMIC_ANNOTATIONS_SUBFOLDER, site_id, f"{patient_id}_ses-{session_id}_caisr_annotations.edf") @@ -306,7 +344,7 @@ def run_model(model, record, data_folder, verbose): algorithmic_features = extract_algorithmic_annotations_features(algo_data) else: # Fallback to zeros (length 12) - algorithmic_features = np.zeros(12) + algorithmic_features = np.zeros(12, dtype=np.float32) features = np.hstack([demographic_features, physiological_features, algorithmic_features]).reshape(1, -1) From ca1f9e6c42c2bda54e1284c0dc603099ee2dab53 Mon Sep 17 00:00:00 2001 From: dcajal Date: Fri, 27 Mar 2026 22:55:20 +0100 Subject: [PATCH 32/38] Update smoke dataset configuration to use 5 subjects by default --- scripts/create_smoke.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/create_smoke.sh b/scripts/create_smoke.sh index 9562438..88765ec 100644 --- a/scripts/create_smoke.sh +++ b/scripts/create_smoke.sh @@ -13,7 +13,7 @@ set -euo pipefail FULL_DATA_PATH="${FULL_DATA_PATH:-data/training_set}" # Override with env var if needed SMOKE_PATH="data/training_smoke" -N_RECORDS="${N_RECORDS:-10}" +N_RECORDS="${N_RECORDS:-5}" echo "Creating smoke dataset..." 
echo "Source: ${FULL_DATA_PATH}" From e1cc32e9f1e80b70681ec668913c9b9f9f1d3373 Mon Sep 17 00:00:00 2001 From: dcajal Date: Fri, 27 Mar 2026 22:58:24 +0100 Subject: [PATCH 33/38] Refactor EEG and respiratory signal processing functions to enhance feature extraction and improve data handling --- src/eeg_processing.py | 209 +++++++++++++++------------- src/lib/Resp_features.py | 2 +- src/lib/peakedness.py | 103 +++++++------- src/resp_processing.py | 291 ++++++++++++++++++++++----------------- 4 files changed, 327 insertions(+), 278 deletions(-) diff --git a/src/eeg_processing.py b/src/eeg_processing.py index 04147c6..cf95237 100644 --- a/src/eeg_processing.py +++ b/src/eeg_processing.py @@ -29,7 +29,7 @@ El módulo depende de `numpy`, `pandas`, `matplotlib`, `plotly` y de las utilidades definidas en `lib/helper_code` y `lib/EEG_functions`. """ -import sys +import sys import os import pandas as pd import numpy as np @@ -39,103 +39,114 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -def processEEG(physiological_data, physiological_fs, csv_path): +EEG_FEATURE_NAMES = [ + 'EEG_Channel_Count', + 'EEG_Rel_Delta', + 'EEG_Rel_Theta', + 'EEG_Rel_Alpha', + 'EEG_Rel_Sigma', + 'EEG_Rel_Beta', + 'EEG_Theta_Alpha_Ratio', + 'EEG_Hjorth_Complexity', +] +EEG_FEATURE_LENGTH = len(EEG_FEATURE_NAMES) + + +def _normalize_label(text): + normalized = ''.join(ch if ch.isalnum() else ' ' for ch in str(text).lower()) + return ' '.join(normalized.split()) + + +def _split_aliases(raw_aliases): + return {_normalize_label(alias) for alias in str(raw_aliases).split(';') if alias} + + +def _build_eeg_aliases(channels): + eeg_rows = channels[channels['Category'].eq('eeg')] + aliases = set() + for _, row in eeg_rows.iterrows(): + aliases.update(_split_aliases(row['Channel_Names'])) + return aliases + + +def _resample_signal(signal, fs, target_fs): + signal = np.asarray(signal, dtype=float) + if signal.size == 0: + return signal, target_fs + if fs == target_fs: + 
return signal, target_fs + + duration = signal.size / fs + target_samples = max(1, int(round(duration * target_fs))) + time_original = np.linspace(0, duration, signal.size) + time_target = np.linspace(0, duration, target_samples) + return np.interp(time_target, time_original, signal), target_fs + + +def _extract_channel_metrics(signal, fs): + signal = np.nan_to_num(np.asarray(signal, dtype=float), nan=0.0, posinf=0.0, neginf=0.0) + if signal.size < max(int(fs * 30), 2): + return None + + if fs != 200: + signal, fs = _resample_signal(signal, fs, 200) + + filtered = EEG_functions.butter_bandpass_filter(signal, lowcut=0.3, highcut=35, fs=fs, order=4) + signal_std = np.std(filtered) + if signal_std == 0 or not np.isfinite(signal_std): + return None + + normalized = (filtered - np.mean(filtered)) / signal_std + epochs = EEG_functions.create_epochs(normalized, fs, epoch_duration=30) + if epochs.size == 0: + return None + + band_powers, complexities = EEG_functions.extract_band_powers(epochs, fs, win_len=15) + if len(band_powers) > 60: + band_powers = band_powers.iloc[60:] + complexities = complexities.iloc[60:] + if band_powers.empty: + return None + + total_power = band_powers.sum(axis=1).replace(0, np.nan) + relative_powers = band_powers.div(total_power, axis=0).replace([np.inf, -np.inf], np.nan).fillna(0.0).mean() + alpha_power = float(relative_powers.get('Alpha', 0.0)) + theta_power = float(relative_powers.get('Theta', 0.0)) + theta_alpha_ratio = theta_power / alpha_power if alpha_power > 0 else 0.0 + + complexity_mean = float( + complexities['Hjorth_Complexity'].replace([np.inf, -np.inf], np.nan).fillna(0.0).mean() + ) if 'Hjorth_Complexity' in complexities else 0.0 + + return np.array([ + float(relative_powers.get('Delta', 0.0)), + theta_power, + alpha_power, + float(relative_powers.get('Sigma', 0.0)), + float(relative_powers.get('Beta', 0.0)), + float(theta_alpha_ratio), + complexity_mean, + ], dtype=np.float32) + +def processEEG(physiological_data, 
physiological_fs, csv_path): channels = pd.read_csv(csv_path) - selectEEG = channels[channels['Category'].isin(['eeg'])] - - for label in original_labels: - fs = physiological_fs[label] - - data = [] - original_labels = list(physiological_data.keys()) - - # Listar canales para identificar los de interés (ej: C3-M2, O1-M2) - HayEEG = False - for i, label in enumerate(original_labels): - for index in selectEEG.index: - if label.lower() in selectEEG['Channel_Names'][index].lower(): - print(f"Canal seleccionado: {label}") - labels.append(label) - HayEEG = True - break - - results = [] - labels2 = [] - if HayEEG: - Bipolar = pd.DataFrame() - if all(label in labels for label in ["F3", "F4", "M1", "M2"]): - Bipolar['F3-M2'] = physiological_data["F3"] - physiological_data["M2"] - Bipolar['F4-M1'] = physiological_data["F4"] - physiological_data["M1"] - labels2.append('F3-M2') - labels2.append('F4-M1') - if all(label in labels for label in ["C3", "C4", "M1", "M2"]): - Bipolar['C3-M2'] = physiological_data["C3"] - physiological_data["M2"] - Bipolar['C4-M1'] = physiological_data["C4"] - physiological_data["M1"] - labels2.append('C3-M2') - labels2.append('C4-M1') - if all(label in labels for label in ["O2", "O1", "M1", "M2"]): - Bipolar['O2-M2'] = physiological_data["O1"] - physiological_data["M2"] - Bipolar['O1-M1'] = physiological_data["O2"] - physiological_data["M1"] - labels2.append('O1-M1') - labels2.append('O2-M2') - # print(f"Archivo {file} tiene ECG, RESP y EEG. 
Se procesará con canales bipolares.") - - if not Bipolar.empty: - labels = [] - for col in Bipolar.columns: - # print(f"Archivo: {file}, Canal: {col}, Frecuencia de muestreo: {sig.sampling_frequency} Hz, Duración: {len(Bipolar[col])/sig.sampling_frequency:.2f} segundos") - fs = physiological_data["M2"].sampling_frequency # Asumimos que todos los canales tienen la misma frecuencia de muestreo - fil = EEG_functions.butter_bandpass_filter(Bipolar[col], lowcut=0.3, highcut=35, fs=fs, order=4) - norm = (fil-np.mean(fil))/np.std(fil) - - data.append(norm) # Restar la media para centrar la señal - labels.append(col) - # columns = Bipolar.columns.tolist() - else: - labels = [] - for l in labels: - # print(f"Archivo: {file}, Canal: {sig.label}, Frecuencia de muestreo: {sig.sampling_frequency} Hz, Duración: {len(sig.data)/sig.sampling_frequency:.2f} segundos") - fs = physiological_fs[l] - fil = EEG_functions.butter_bandpass_filter(physiological_data[l], lowcut=0.3, highcut=35, fs=fs, order=4) - norm = (fil-np.mean(fil))/np.std(fil) - labels.append(l) - data.append(norm) # Restar la media para centrar la señal - - # columns = [selEEG[i][1].label for i in range(len(selEEG))] - - - for i, elec in enumerate(labels): - epoch_length = 30 # Duración de cada época en segundos - if Bipolar.empty: - fs = physiological_fs[l] - else: - fs = physiological_fs['M1'] - - if fs != 200: - # print(f"Warning: Sampling frequency for channel {elec} in file {file} is {fs} Hz, expected 200 Hz. 
Check the data.") - duration = len(data[i]) / fs - time_original = np.linspace(0, duration, len(data[i])) - - num_samples_target = int(duration * 200 ) - time_target = np.linspace(0, duration, num_samples_target) - data[i] = np.interp(time_target, time_original, data[i]) - fs = 200 # Update fs to the target sampling frequency after resampling - - epochs = EEG_functions.create_epochs(data[i], fs, epoch_duration=epoch_length) - - band_powers, complexities = EEG_functions.extract_band_powers(epochs, fs, win_len=15) - band_powers = band_powers.iloc[60:] # Eliminar las primeras 60 épocas (30 min) para evitar el tiempo despierto al inicio de la grabación - - - # Ejecución - patient_summar = EEG_functions.get_patient_profile(band_powers) - - d = complexities.iloc[:].std().to_dict() - results.append({ - 'Channel': elec, - **d, - **patient_summar - }) - df_results = np.array(results) - return df_results \ No newline at end of file + eeg_aliases = _build_eeg_aliases(channels) + channel_metrics = [] + + for label, signal in physiological_data.items(): + if label not in physiological_fs: + continue + if _normalize_label(label) not in eeg_aliases: + continue + + metrics = _extract_channel_metrics(signal, physiological_fs[label]) + if metrics is not None: + channel_metrics.append(metrics) + + if not channel_metrics: + return np.zeros(EEG_FEATURE_LENGTH, dtype=np.float32) + + stacked = np.vstack(channel_metrics) + aggregated = np.mean(stacked, axis=0) + return np.hstack([np.array([len(channel_metrics)], dtype=np.float32), aggregated]).astype(np.float32) \ No newline at end of file diff --git a/src/lib/Resp_features.py b/src/lib/Resp_features.py index 216b3ab..60ff3bf 100644 --- a/src/lib/Resp_features.py +++ b/src/lib/Resp_features.py @@ -176,7 +176,7 @@ def Metrics_per_segment(Data): section = sel_sujeto[secc].values section = section[~np.isnan(section)] - hat_Br, Sk_Br, t_aver = peakedness_application(section, stage=secc, plotflag = False, subjet= subjet) + hat_Br, Sk_Br, 
t_aver, _ = peakedness_application(section, stage=secc, plotflag = False, subjet= subjet) # print(f"Subjet: {subjet}, section: {secc} hat_Br: {hat_Br}, Sk_Br: {Sk_Br}") # Ajuste lineal diff --git a/src/lib/peakedness.py b/src/lib/peakedness.py index 9c47f96..1bda09c 100644 --- a/src/lib/peakedness.py +++ b/src/lib/peakedness.py @@ -6,17 +6,13 @@ from time import time import os -try: - import matplotlib.pyplot as plt -except ModuleNotFoundError: - plt = None - -try: - import plotly.graph_objs as go - from plotly import subplots -except ModuleNotFoundError: - go = None - subplots = None +def _safe_ratio(numerator, denominator, default=0.0): + if denominator is None or not np.isfinite(denominator) or denominator == 0: + return default + value = numerator / denominator + if np.isfinite(value): + return value + return default def setParamFr(Setup): if 'DT' not in Setup.keys(): @@ -207,7 +203,9 @@ def init_module(kk,vars,param, plotflag): # Peakedness # print(S[Omega]) - Pkl = 100*sum(S[Omega_p])/sum(S[Omega]) + band_power = np.sum(S[Omega]) + peaky_power = np.sum(S[Omega_p]) + Pkl = 100*_safe_ratio(peaky_power, band_power) if Pkl >= ksi_p: Xkl[k,l] = 1 @@ -258,7 +256,8 @@ def init_module(kk,vars,param, plotflag): j_pk = j_pk[~j_del] # Cost function for deviation from previous fr and maximum power - C_a = 1-np.transpose(pk)/np.max(S) + max_s = np.max(S) + C_a = 1-_safe_ratio(np.transpose(pk), max_s, default=np.zeros_like(pk, dtype=float)) fr_prev = vars["bar_fr"][np.max(kk,0)] C_f = abs(f[j_pk[:]]-fr_prev)/(2*d) # C_f = abs(f(i_pk(:))-fr_prev)/(Omega_r(2)-Omega(1)); @@ -273,13 +272,13 @@ def init_module(kk,vars,param, plotflag): # Save in vars # vars["bar_fr"][kk] = f[fj] - if plotflag: - if plt is None: - raise ModuleNotFoundError("matplotlib is required when plotflag=True") - plt.plot(f, averS) - plt.plot(f[fj], averS[fj], '-') - plt.title('Initialization - Averaged Spectrum') - plt.show() + # if plotflag: + # if plt is None: + # raise ModuleNotFoundError("matplotlib 
is required when plotflag=True") + # plt.plot(f, averS) + # plt.plot(f[fj], averS[fj], '-') + # plt.title('Initialization - Averaged Spectrum') + # plt.show() return vars # # No spectra fulfill the initialization @@ -323,11 +322,15 @@ def compute_Xkl( Skl, f, bar_fr, O, ksi_p, ksi_a, d): S = Skl[:, O[k], l] # % Define peakedness based on the power concentration - Pkl = 100*sum(S[Omega_p])/sum(S[Omega]) + band_power = np.sum(S[Omega]) + peaky_power = np.sum(S[Omega_p]) + Pkl = 100*_safe_ratio(peaky_power, band_power) # % Define peakedness based on the absolute maximum # print(max(S)) - Akl = 100*max(S[Omega])/max(S) + max_s = np.max(S) + max_band = np.max(S[Omega]) if np.any(Omega) else 0.0 + Akl = 100*_safe_ratio(max_band, max_s) # % If the spectrum is concidered peaky by both conditions, mark as # % peaky if np.bitwise_and(Pkl >= ksi_p, Akl >= ksi_a): @@ -474,7 +477,9 @@ def peakednessCost(signals, ts, fs, Setup = {}, title = "", storeGraph = False, vars["t_aver"] = vars["t_orig"][N:-N] if vars["t_aver"].shape[0] == 0: print("No hay tiempo para promediar") - return np.nan, np.nan, np.nan + empty_spectra = np.empty((vars["f"].shape[0], 0)) + empty_used = np.empty((0, vars["L"])) + return np.array([]), empty_spectra, np.array([]), empty_used vars["Sk"] = np.empty((vars["f"].shape[0], vars["t_aver"].shape[0])) vars["Sk"][:] = np.nan vars["bar_fr"] = np.empty(( vars["t_aver"].shape[0])) @@ -615,33 +620,33 @@ def peakednessCost(signals, ts, fs, Setup = {}, title = "", storeGraph = False, vars["t_orig"] = vars["t_orig"] + ts1 t_fin = time() - if plotflag: - if go is None or subplots is None: - raise ModuleNotFoundError("plotly is required when plotflag=True") + # if plotflag: + # if go is None or subplots is None: + # raise ModuleNotFoundError("plotly is required when plotflag=True") - fig = subplots.make_subplots(rows=2,shared_xaxes=True, subplot_titles=('Peak-condition averaged EDR Spectra in '+title,"EDR/RESP signals"), row_heights=[0.7, 0.3]) + # fig = 
subplots.make_subplots(rows=2,shared_xaxes=True, subplot_titles=('Peak-condition averaged EDR Spectra in '+title,"EDR/RESP signals"), row_heights=[0.7, 0.3]) - fig.add_heatmap(x=vars["t_aver"], y=vars["f"], z=vars["Sk"]/np.max(vars["Sk"]),colorscale='jet',colorbar=dict(orientation='h')) - fig.update_layout(coloraxis_showscale=False) - fig.add_trace(go.Line(x=vars["t_aver"], y=vars["hat_fr"],name = 'f\u0302_r(k)'), row = 1, col=1) - fig.add_trace(go.Line(x=vars["t_aver"],y=vars["bar_fr"],name= 'f\u0304_r(k)'), row = 1, col=1) + # fig.add_heatmap(x=vars["t_aver"], y=vars["f"], z=vars["Sk"]/np.max(vars["Sk"]),colorscale='jet',colorbar=dict(orientation='h')) + # fig.update_layout(coloraxis_showscale=False) + # fig.add_trace(go.Line(x=vars["t_aver"], y=vars["hat_fr"],name = 'f\u0302_r(k)'), row = 1, col=1) + # fig.add_trace(go.Line(x=vars["t_aver"],y=vars["bar_fr"],name= 'f\u0304_r(k)'), row = 1, col=1) - fig.add_trace(go.Line(x=vars["t_aver"],y=vars["used"]), row = 1, col=1) - # fig.axis([vars.t_aver(1), vars.t_aver(end), vars.f(1), vars.f(end)]) - for i in range(signals.shape[1]): - fig.add_trace(go.Line(x=ts+ts1,y=signals[:,i],name = 'Signal '+str(i)), row = 2, col=1) - - fig.update_layout(coloraxis_showscale=False) - fig.update_yaxes(title_text="f (Hz)", row=1, col=1) - fig.update_yaxes(title_text="(n.u.)", row=2, col=1) - fig.update_xaxes(title_text="time (s)", row=2, col=1) - if storeGraph: - os.makedirs("Graphs/Peakedness/"+str(subjet), exist_ok=True) - # fig.write_image(os.path.join("Graphs", "Peakedness",str(subjet),title+".png")) - fig.write_html(os.path.join("Graphs", "Peakedness",str(subjet),title+".html")) - # fig.write_image() - else: - fig.show() - - return vars["hat_fr"], vars["Sk"], vars["t_aver"] + # fig.add_trace(go.Line(x=vars["t_aver"],y=vars["used"]), row = 1, col=1) + # # fig.axis([vars.t_aver(1), vars.t_aver(end), vars.f(1), vars.f(end)]) + # for i in range(signals.shape[1]): + # fig.add_trace(go.Line(x=ts+ts1,y=signals[:,i],name = 'Signal 
'+str(i)), row = 2, col=1) + + # fig.update_layout(coloraxis_showscale=False) + # fig.update_yaxes(title_text="f (Hz)", row=1, col=1) + # fig.update_yaxes(title_text="(n.u.)", row=2, col=1) + # fig.update_xaxes(title_text="time (s)", row=2, col=1) + # if storeGraph: + # os.makedirs("Graphs/Peakedness/"+str(subjet), exist_ok=True) + # # fig.write_image(os.path.join("Graphs", "Peakedness",str(subjet),title+".png")) + # fig.write_html(os.path.join("Graphs", "Peakedness",str(subjet),title+".html")) + # # fig.write_image() + # else: + # fig.show() + + return vars["hat_fr"], vars["Sk"], vars["t_aver"], vars["used"] # return vars["hat_fr"], vars["Sk"], vars["bar_fr"],vars["t_aver"], vars["f"], vars["used"] \ No newline at end of file diff --git a/src/resp_processing.py b/src/resp_processing.py index c158d2b..c1e7845 100644 --- a/src/resp_processing.py +++ b/src/resp_processing.py @@ -1,138 +1,171 @@ from .lib import Resp_features -import sys +import sys import os import pandas as pd import numpy as np sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -def processResp(physiological_data, physiological_fs, csv_path): +RESP_CHANNEL_GROUPS = ("Abdomen", "Chest", "Nasal", "Flow") +RESP_FEATURE_NAMES = [ + f"{group}_Peakedness_{metric}" + for group in RESP_CHANNEL_GROUPS + for metric in ("Max", "Min", "Mean", "Median", "Std") +] + [ + "SpO2_Max", + "SpO2_Min", + "SpO2_Mean", + "SpO2_Std", + "CET90", + "ODI_Mean", + "ODI_deepness", +] +RESP_FEATURE_LENGTH = len(RESP_FEATURE_NAMES) + + +def _normalize_label(text): + normalized = ''.join(ch if ch.isalnum() else ' ' for ch in str(text).lower()) + return ' '.join(normalized.split()) + + +def _split_aliases(raw_aliases): + return {_normalize_label(alias) for alias in str(raw_aliases).split(';') if alias} + + +def _build_resp_alias_groups(channels): + resp_rows = channels[channels['Category'].eq('resp')].reset_index(drop=True) + if len(resp_rows) < 7: + return {} + return { + 'Abdomen': 
_split_aliases(resp_rows.iloc[0]['Channel_Names']), + 'Chest': _split_aliases(resp_rows.iloc[1]['Channel_Names']), + 'Nasal': _split_aliases(resp_rows.iloc[2]['Channel_Names']), + 'Flow': _split_aliases(resp_rows.iloc[3]['Channel_Names']), + 'SpO2': _split_aliases(resp_rows.iloc[6]['Channel_Names']), + } + + +def _find_resp_group(label, alias_groups): + normalized = _normalize_label(label) + for group_name, aliases in alias_groups.items(): + if normalized in aliases: + return group_name + return None + + +def _resample_signal(signal, fs, target_fs): + signal = np.asarray(signal, dtype=float) + if signal.size == 0: + return signal, target_fs + if fs == target_fs: + return signal, target_fs + + duration = signal.size / fs + target_samples = max(1, int(round(duration * target_fs))) + time_original = np.linspace(0, duration, signal.size) + time_target = np.linspace(0, duration, target_samples) + return np.interp(time_target, time_original, signal), target_fs + + +def _compute_resp_quality(used, hat_br): + used_array = np.asarray(used, dtype=float) + if used_array.size: + quality = float(np.nanmean(used_array)) + if np.isfinite(quality): + return quality + hat_br = np.asarray(hat_br, dtype=float) + if hat_br.size == 0: + return 0.0 + return float(np.mean(np.isfinite(hat_br))) + + +def _summarize_peakedness(hat_br): + finite_values = np.asarray(hat_br, dtype=float) + finite_values = finite_values[np.isfinite(finite_values)] + if finite_values.size == 0: + return None + return { + 'Max': float(np.max(finite_values)), + 'Min': float(np.min(finite_values)), + 'Mean': float(np.mean(finite_values)), + 'Median': float(np.median(finite_values)), + 'Std': float(np.std(finite_values)), + } + +def _summarize_spo2(data, fs): + if data.size == 0: + return {} + working = np.asarray(data, dtype=float).copy() + if np.nanmax(working) < 2: + working = np.round((working / 1.055) * 100) + + desaturation_mask = working.copy() + threshold = 0.7 + for index, value in enumerate(working): + if 
value < threshold: + start = int(max(0, index - fs * 2)) + end = int(min(working.size, index + fs * 2)) + desaturation_mask[start:end] = np.nan + + cet90 = float(np.count_nonzero(desaturation_mask < 90) / max(working.size, 1)) + valid = desaturation_mask[np.isfinite(desaturation_mask)] + if valid.size == 0: + return {'CET90': cet90} + + odi_mean, odi_deepness = Resp_features.ODI_application(desaturation_mask, fs, plotflag=False, subjet=1) + return { + 'SpO2_Max': float(np.max(valid)), + 'SpO2_Min': float(np.min(valid)), + 'SpO2_Mean': float(np.mean(valid)), + 'SpO2_Std': float(np.std(valid)), + 'CET90': cet90, + 'ODI_Mean': float(odi_mean), + 'ODI_deepness': float(odi_deepness), + } + + +def processResp(physiological_data, physiological_fs, csv_path): channels = pd.read_csv(csv_path) - selectResp = channels[channels['Category'].isin(['resp'])] - - resultados = {} - UsedFlow = 0 - UsedChest = 0 - UsedAbdomen = 0 - UsedSpO2 = 0 - UsedNasal = 0 - UsedCepap = 0 - - data = [] - original_labels = list(physiological_data.keys()) - - for label in original_labels: - fs = physiological_fs[label] - sig = physiological_data[label] - if fs != 25: - duration = len(sig) / fs - time_original = np.linspace(0, duration, len(sig)) - num_samples_target = int(duration * 25 ) - time_target = np.linspace(0, duration, num_samples_target) - data = np.interp(time_target, time_original, sig) - fs = 25 # Update fs to the target sampling frequency after resampling - else: - data = sig - - # Check nan in sig.data - if np.isnan(sig).any(): - print(f"Warning: NaN values found in signal data for {label}. 
Filling NaNs with zeros.") - data = np.nan_to_num(data) - - name = "" - if label.lower() not in selectResp['Channel_Names'][34].lower(): - d = Resp_features.peakedness_application(data, stage=label, plotflag = False, subjet =label) - if label.lower() in selectResp['Channel_Names'][28].lower(): - name = "Chest" - # EFFORT RESPIRATORY Chest - elif label.lower() in selectResp['Channel_Names'][29].lower(): - # EFFORT RESPIRATORY Abdomen - name = "Abdomen" - elif label.lower() in selectResp['Channel_Names'][30].lower(): - # RESPIRATORY NASAL - name = "Nasal" - elif label.lower() in selectResp['Channel_Names'][31].lower(): - # RESPIRATORY FLOW - name = "Flow" - elif label.lower() in selectResp['Channel_Names'][32].lower(): - # CEPAP - if np.all(data == 0) or np.std(data) < 5: - print(f"Warning: All values in the signal data for {label} are zero. Skipping feature extraction for this channel.") - else: - name = "" - elif label.lower() in selectResp['Channel_Names'][33].lower(): - # CEPAP - name = "" - - if name != "": - DSinNan = d[0][~np.isnan(d[0])] # Eliminar NaN antes de calcular min y max - if len(DSinNan) != 0: - maximo = DSinNan.max() - minimo = DSinNan.min() - media = np.mean(DSinNan) - mediana = np.median(DSinNan) - std = DSinNan.std() - write = False - if name == "Nasal" and UsedNasal< d[-1]: - UsedNasal = d[-1] - write = True - elif name == "Chest" and UsedChest< d[-1]: - UsedChest = d[-1] - write = True - elif name == "Abdomen" and UsedAbdomen< d[-1]: - UsedAbdomen = d[-1] - write = True - elif name == "Flow" and UsedFlow< d[-1]: - UsedFlow = d[-1] - write = True - elif name == "SpO2" and UsedSpO2< d[-1]: - UsedSpO2 = d[-1] - write = True - elif name == "CEPAP" and UsedCepap < d[-1]: - UsedCepap = d[-1] - write = True - if write: - resultados.update({ - name+"_Peakedness_Max": maximo, - name+"_Peakedness_Min": minimo, - name+"_Peakedness_Mean": media, - name+"_Peakedness_Median": mediana, - name+"_Peakedness_Std": std - }) - - elif label.lower() in 
selectResp['Channel_Names'][34].lower(): - #O2 SATURATION - if np.max(data) < 2: - data = np.round((data/1.055)*100) - - lim = 0.7 - # Quitar los valores por debajo de lim y sus 10 valores anteriores y posteriores para quedarnos solo con los eventos de desaturación - dataReal = data.copy() - for i in range(len(data)): - if data[i] < lim: - start = int(max(0, i-fs*2)) - end = int(min(len(data), i+fs*2)) - dataReal[start:end] = np.nan # Marcar los valores por debajo del límite y sus alrededores como NaN - - CET90 = dataReal[dataReal < 90] - # CET90SinNan = CET90[~np.isnan(CET90)] # Eliminar NaN antes de calcular min y max - CET90 = len(CET90)/len(data) - dataRealSinNan = dataReal[~np.isnan(dataReal)] # Eliminar NaN antes de calcular min y max - if len(dataRealSinNan)>0: - maximo = dataRealSinNan.max() - minimo = dataRealSinNan.min() - std = dataRealSinNan.std() - media = dataRealSinNan.mean() - ODI_mean, ODI_deepness = Resp_features.ODI_application(dataReal, fs, plotflag=False, subjet=1) - - resultados.update({"SpO2_Max": maximo, - "SpO2_Min": minimo, - "SpO2_Mean": media, - "SpO2_Std": std, - "CET90": CET90, - "ODI_Mean": ODI_mean, - "ODI_deepness": ODI_deepness, - }) - - return np.array(resultados) + alias_groups = _build_resp_alias_groups(channels) + results = {feature_name: 0.0 for feature_name in RESP_FEATURE_NAMES} + best_quality = {group_name: -np.inf for group_name in RESP_CHANNEL_GROUPS} + + for label, signal in physiological_data.items(): + if label not in physiological_fs: + continue + + group_name = _find_resp_group(label, alias_groups) + if group_name is None: + continue + + resampled, fs = _resample_signal(signal, physiological_fs[label], 25) + resampled = np.nan_to_num(resampled, nan=0.0, posinf=0.0, neginf=0.0) + + if group_name == 'SpO2': + results.update(_summarize_spo2(resampled, fs)) + continue + + try: + hat_br, _, _, used = Resp_features.peakedness_application( + resampled, + stage=label, + plotflag=False, + subjet=label, + ) + except Exception: 
+ continue + + summary = _summarize_peakedness(hat_br) + if summary is None: + continue + + quality = _compute_resp_quality(used, hat_br) + if quality <= best_quality[group_name]: + continue + + best_quality[group_name] = quality + for metric_name, metric_value in summary.items(): + results[f'{group_name}_Peakedness_{metric_name}'] = metric_value + + return np.array([results[name] for name in RESP_FEATURE_NAMES], dtype=np.float32) From 616a1b80cce6999860afdf0585f6e81b8bef3261 Mon Sep 17 00:00:00 2001 From: sromagnoli-10 Date: Sat, 28 Mar 2026 09:34:18 +0100 Subject: [PATCH 34/38] Update openECGfunction.py --- src/lib/openECGfunction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib/openECGfunction.py b/src/lib/openECGfunction.py index 745b1ac..2710531 100644 --- a/src/lib/openECGfunction.py +++ b/src/lib/openECGfunction.py @@ -17,7 +17,7 @@ def openECG(physiological_data_file, patient_id): # Check if any ECG keyword is inside the label if any(keyword in label_clean for keyword in ecg_keywords): idx = i - break # ✅ first ECG channel only + break #first ECG channel only if idx is None: raise ValueError("No ECG channel found") @@ -36,4 +36,4 @@ def openECG(physiological_data_file, patient_id): [all_patients_ECGresults, all_results], ignore_index=True ) - return all_patients_ECGresults \ No newline at end of file + return all_patients_ECGresults From 30e2369dbde4dac196bcc8fe1bc8c35873175c7e Mon Sep 17 00:00:00 2001 From: sromagnoli-10 Date: Sat, 28 Mar 2026 09:40:13 +0100 Subject: [PATCH 35/38] Update team_code.py --- team_code.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/team_code.py b/team_code.py index 7296cc1..9f9efb7 100644 --- a/team_code.py +++ b/team_code.py @@ -24,6 +24,7 @@ from helper_code import * from src.resp_processing import processResp from src.eeg_processing import processEEG +from lib.openECGfunction import ################################################################################ # Path 
& Constant Configuration (Added for Robustness) ################################################################################ @@ -668,4 +669,4 @@ def count_discrete_events(key): def save_model(model_folder, model): d = {'model': model} filename = os.path.join(model_folder, 'model.sav') - joblib.dump(d, filename, protocol=0) \ No newline at end of file + joblib.dump(d, filename, protocol=0) From fe8e28bfdb3d134686fc0e332a8b3ed5f456cb37 Mon Sep 17 00:00:00 2001 From: sromagnoli-10 Date: Sat, 28 Mar 2026 09:52:33 +0100 Subject: [PATCH 36/38] Update team_code.py --- team_code.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/team_code.py b/team_code.py index 9f9efb7..740983f 100644 --- a/team_code.py +++ b/team_code.py @@ -24,7 +24,7 @@ from helper_code import * from src.resp_processing import processResp from src.eeg_processing import processEEG -from lib.openECGfunction import +from lib.openECGfunction import openECG ################################################################################ # Path & Constant Configuration (Added for Robustness) ################################################################################ From 112aae078308f9304c7cfa993aaa94433fa1f709 Mon Sep 17 00:00:00 2001 From: dcajal Date: Sat, 28 Mar 2026 10:04:52 +0100 Subject: [PATCH 37/38] Fix invalid value encounteres in sacalar divide --- src/lib/EEG_functions.py | 140 +++++++++++++++++++++------------------ src/lib/peakedness.py | 24 +++---- 2 files changed, 87 insertions(+), 77 deletions(-) diff --git a/src/lib/EEG_functions.py b/src/lib/EEG_functions.py index 83e8ef6..8c7137c 100644 --- a/src/lib/EEG_functions.py +++ b/src/lib/EEG_functions.py @@ -5,17 +5,27 @@ from scipy import signal from scipy.stats import kurtosis, entropy -try: - import plotly.graph_objects as go - from plotly.subplots import make_subplots -except ModuleNotFoundError: - go = None - make_subplots = None +# try: +# import plotly.graph_objects as go +# from plotly.subplots import 
make_subplots +# except ModuleNotFoundError: +# go = None +# make_subplots = None -try: - import matplotlib.pyplot as plt -except ModuleNotFoundError: - plt = None +# try: +# import matplotlib.pyplot as plt +# except ModuleNotFoundError: +# plt = None + +def _safe_sqrt_variance_ratio(numerator_signal, denominator_signal): + numerator_var = np.var(numerator_signal) + denominator_var = np.var(denominator_signal) + if denominator_var <= 0 or not np.isfinite(denominator_var): + return 0.0 + ratio = numerator_var / denominator_var + if ratio <= 0 or not np.isfinite(ratio): + return 0.0 + return float(np.sqrt(ratio)) def butter_bandpass_filter(data, lowcut, highcut, fs, order=4): nyq = 0.5 * fs # Frecuencia de Nyquist @@ -40,66 +50,66 @@ def butter_bandpass_filter(data, lowcut, highcut, fs, order=4): y = filtfilt(b, a, data) return y -def plot_EEG(df, columns, fs = 200): - if go is None or make_subplots is None: - raise ModuleNotFoundError("plotly is required for plot_EEG") +# def plot_EEG(df, columns, fs = 200): +# if go is None or make_subplots is None: +# raise ModuleNotFoundError("plotly is required for plot_EEG") - fig = make_subplots(rows=len(columns), cols=1, - shared_xaxes=True, - vertical_spacing=0.02, - subplot_titles=columns) - limit = int(3000 * fs) - x = np.arange(df[0].shape[0]) / fs # Asumiendo fs=100Hz, ajusta si es diferente - downsample = 10 # Factor de downsampling para mejorar rendimiento (ajusta según necesidad) - for i, col in enumerate(columns): - fig.add_trace( - go.Scattergl(x=x[:limit:downsample], y=df[i][:limit:downsample], name=col, mode='lines'), - row=i+1, col=1 - ) - fig.update_layout( - height=900, - title_text="Polisomnografía - Canales EEG", - showlegend=False, - template="plotly_white" - ) - fig.update_xaxes(title_text="Tiempo (segundos)", row=len(columns), col=1) - fig.show() +# fig = make_subplots(rows=len(columns), cols=1, +# shared_xaxes=True, +# vertical_spacing=0.02, +# subplot_titles=columns) +# limit = int(3000 * fs) +# x = 
np.arange(df[0].shape[0]) / fs # Asumiendo fs=100Hz, ajusta si es diferente +# downsample = 10 # Factor de downsampling para mejorar rendimiento (ajusta según necesidad) +# for i, col in enumerate(columns): +# fig.add_trace( +# go.Scattergl(x=x[:limit:downsample], y=df[i][:limit:downsample], name=col, mode='lines'), +# row=i+1, col=1 +# ) +# fig.update_layout( +# height=900, +# title_text="Polisomnografía - Canales EEG", +# showlegend=False, +# template="plotly_white" +# ) +# fig.update_xaxes(title_text="Tiempo (segundos)", row=len(columns), col=1) +# fig.show() -def plot_EEG_sel(sel, name = "EEG_plot_raw.html"): - if go is None or make_subplots is None: - raise ModuleNotFoundError("plotly is required for plot_EEG_sel") +# def plot_EEG_sel(sel, name = "EEG_plot_raw.html"): +# if go is None or make_subplots is None: +# raise ModuleNotFoundError("plotly is required for plot_EEG_sel") - fig = make_subplots(rows=len(sel), cols=1, - shared_xaxes=True, - vertical_spacing=0.02, - subplot_titles=[ch[1].label for ch in sel]) +# fig = make_subplots(rows=len(sel), cols=1, +# shared_xaxes=True, +# vertical_spacing=0.02, +# subplot_titles=[ch[1].label for ch in sel]) - for i, (idx, sig) in enumerate(sel): - # Crear eje de tiempo en segundos - fs = sig.sampling_frequency - time = np.linspace(0, len(sig.data) / fs, len(sig.data)) +# for i, (idx, sig) in enumerate(sel): +# # Crear eje de tiempo en segundos +# fs = sig.sampling_frequency +# time = np.linspace(0, len(sig.data) / fs, len(sig.data)) - # Añadir traza (solo mostramos los primeros 30s por defecto para no saturar el navegador) - # Puedes quitar el slice [:int(30*fs)] para ver todo, pero cuidado con el rendimiento - limit = int(3000 * fs) - # limit = len(sig.data) if limit > len(sig.data) else limit - # limit = len(sig.data) - # downsample = 10 # Factor de downsampling para mejorar rendimiento (ajusta según necesidad) - fig.add_trace( - go.Scattergl(x=time[:limit], y=sig.data[:limit], name=sig.label, mode='lines'), - 
row=i+1, col=1 - ) +# # Añadir traza (solo mostramos los primeros 30s por defecto para no saturar el navegador) +# # Puedes quitar el slice [:int(30*fs)] para ver todo, pero cuidado con el rendimiento +# limit = int(3000 * fs) +# # limit = len(sig.data) if limit > len(sig.data) else limit +# # limit = len(sig.data) +# # downsample = 10 # Factor de downsampling para mejorar rendimiento (ajusta según necesidad) +# fig.add_trace( +# go.Scattergl(x=time[:limit], y=sig.data[:limit], name=sig.label, mode='lines'), +# row=i+1, col=1 +# ) - fig.update_layout( - height=900, - title_text="Polisomnografía - Canales EEG", - showlegend=False, - template="plotly_white" - ) +# fig.update_layout( +# height=900, +# title_text="Polisomnografía - Canales EEG", +# showlegend=False, +# template="plotly_white" +# ) - fig.update_xaxes(title_text="Tiempo (segundos)", row=len(sel), col=1) - fig.write_html(f"graphs/{name}.html") # Guardar como HTML para visualización interactiva - # fig.show() +# fig.update_xaxes(title_text="Tiempo (segundos)", row=len(sel), col=1) +# fig.write_html(f"graphs/{name}.html") # Guardar como HTML para visualización interactiva +# # fig.show() def filtering_and_normalization(sig, sig_fs): b, a = signal.butter(4, 0.3, btype='highpass', fs=sig_fs) @@ -249,10 +259,10 @@ def extract_band_powers(epochs, fs, win_len = 2): features.append(epoch_features) diff = np.diff(epoch) - mobility = np.sqrt(np.var(diff) / np.var(epoch)) + mobility = _safe_sqrt_variance_ratio(diff, epoch) # 2. 
Complejidad de Hjorth: Qué tan similar es la señal a una onda senoidal diff2 = np.diff(diff) - mobility_diff = np.sqrt(np.var(diff2) / np.var(diff)) + mobility_diff = _safe_sqrt_variance_ratio(diff2, diff) complexity = mobility_diff / mobility if mobility > 0 else 0 complexities.append({'Hjorth_Mobility': mobility, 'Hjorth_Complexity': complexity}) diff --git a/src/lib/peakedness.py b/src/lib/peakedness.py index 1bda09c..eaf224a 100644 --- a/src/lib/peakedness.py +++ b/src/lib/peakedness.py @@ -116,7 +116,7 @@ def extract_interval( x, t, int_ini, int_end ): return [ x_int, t_int ] -def normalizar_PSD( PSD, f = 'default', rango = 'default'): +def normalizar_PSD( PSD, f = None, rango = None): # NORMALIZAR_PSD Normaliza una densidad espectral de potencia en el rango # de frecuencias requerido. # @@ -131,17 +131,17 @@ def normalizar_PSD( PSD, f = 'default', rango = 'default'): # f_PSD_norm = Vector de frecuencias para PSD_norm # factor_norm = Factor de normalizaci�n utilizado - if f == 'default': + if f is None: f = np.arange(0,PSD.shape[0]) / PSD.shape[0] - 1/2 - if rango == 'default': + if rango is None: rango = [f[0], f[-1]] # Seleccionar rango de inter�s: f_PSD_norm = f[(f>=rango[0]) & (f<=rango[1])] PSD = PSD[(f>=rango[0]) & (f<=rango[1])] - if ~f_PSD_norm.any(): # El vector de frecuencias no estaba ordenado + if not np.any(f_PSD_norm): # El vector de frecuencias no estaba ordenado print('El vector de frecuencias debe estar ordenado de forma ascendente'); @@ -257,17 +257,20 @@ def init_module(kk,vars,param, plotflag): # Cost function for deviation from previous fr and maximum power max_s = np.max(S) - C_a = 1-_safe_ratio(np.transpose(pk), max_s, default=np.zeros_like(pk, dtype=float)) + if np.isfinite(max_s) and max_s != 0: + C_a = 1 - (np.transpose(pk) / max_s) + else: + C_a = np.ones_like(pk, dtype=float) fr_prev = vars["bar_fr"][np.max(kk,0)] C_f = abs(f[j_pk[:]]-fr_prev)/(2*d) # C_f = abs(f(i_pk(:))-fr_prev)/(Omega_r(2)-Omega(1)); C = C_a +C_f - try: + if 
C.size > 0: j_min = C.argmin() fj = j_pk[j_min] vars["bar_fr"][kk] = f[fj] - except: + else: vars["bar_fr"][kk] = 0 # Save in vars # vars["bar_fr"][kk] = f[fj] @@ -592,11 +595,8 @@ def peakednessCost(signals, ts, fs, Setup = {}, title = "", storeGraph = False, if np.isnan(vars["hat_fr"][0]) and int_e.shape[0]>1: int_e = int_e[1:] - try: - if (int_e[0]-int_b[0])[0] < 0: - int_e = int_e[1:] - except: - print("int vacio") + if int_b.size > 0 and int_e.size > 0 and (int_e[0]-int_b[0])[0] < 0: + int_e = int_e[1:] int_small = (int_e-int_b)<=(N_k-1) From 866d466416b21ff2cf10731f59e8980f1041e275 Mon Sep 17 00:00:00 2001 From: sromagnoli-10 Date: Sat, 28 Mar 2026 10:54:18 +0100 Subject: [PATCH 38/38] creating ECG processing branch, changing folder of two functions and rename files --- src/{main_ECG_ver2.py => lib/ECG_processing.py} | 0 src/{lib => }/openECGfunction.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename src/{main_ECG_ver2.py => lib/ECG_processing.py} (100%) rename src/{lib => }/openECGfunction.py (95%) diff --git a/src/main_ECG_ver2.py b/src/lib/ECG_processing.py similarity index 100% rename from src/main_ECG_ver2.py rename to src/lib/ECG_processing.py diff --git a/src/lib/openECGfunction.py b/src/openECGfunction.py similarity index 95% rename from src/lib/openECGfunction.py rename to src/openECGfunction.py index 2710531..64c72e7 100644 --- a/src/lib/openECGfunction.py +++ b/src/openECGfunction.py @@ -1,5 +1,5 @@ import pyedflib -from ..main_ECG_ver2 import ECGprocessing +from .lib.ECG_processing import ECGprocessing import pandas as pd def openECG(physiological_data_file, patient_id):