diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6552369..1cb9d57 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -84,6 +84,21 @@ Language support is split between two layers: 4. Add a test case in `tests/test_pipeline.c` for integration-level fixes 5. Verify with a real open-source repo +### Infrastructure Languages (Infra-Pass Pattern) + +Languages like **Dockerfile**, **docker-compose**, **Kubernetes manifests**, and **Kustomize** do not require a new tree-sitter grammar. Instead they follow an *infra-pass* pattern, reusing the existing tree-sitter YAML grammar where applicable: + +1. **Detection helpers** in `src/pipeline/pass_infrascan.c` — functions like `cbm_is_dockerfile()`, `cbm_is_k8s_manifest()`, `cbm_is_kustomize_file()` identify files by name and/or content heuristics (e.g., presence of `apiVersion:`). +2. **Custom extractors** in `internal/cbm/extract_k8s.c` — tree-sitter-based parsers that walk the YAML AST (using the tree-sitter YAML grammar) and populate `CBMFileResult` with imports and definitions. +3. **Pipeline pass** (`pass_k8s.c`, `pass_infrascan.c`) — calls the extractor and emits graph nodes/edges. K8s manifests emit `Resource` nodes; Kustomize files emit `Module` nodes with `IMPORTS` edges to referenced resource files. + +**When adding a new infrastructure language:** +- Add a detection helper (`cbm_is_<lang>_file()`) in `pass_infrascan.c` or a new `pass_<lang>.c`. +- Add the `CBM_LANG_<LANG>` enum value in `internal/cbm/cbm.h` and a row in the language table in `lang_specs.c`. +- Write a custom extractor that returns `CBMFileResult*` — do not add a tree-sitter grammar. +- Register the pass in `pipeline.c`. +- Add tests in `tests/test_pipeline.c` following the `TEST(infra_is_dockerfile)` and `TEST(k8s_extract_manifest)` patterns. + ## Pull Request Guidelines - **C code only** — this project was rewritten from Go to pure C in v0.5.0. Go PRs will be acknowledged and potentially ported, but cannot be merged directly. 
diff --git a/Makefile.cbm b/Makefile.cbm index 82821b8..ecccd3a 100644 --- a/Makefile.cbm +++ b/Makefile.cbm @@ -115,6 +115,7 @@ EXTRACTION_SRCS = \ $(CBM_DIR)/extract_type_refs.c \ $(CBM_DIR)/extract_type_assigns.c \ $(CBM_DIR)/extract_env_accesses.c \ + $(CBM_DIR)/extract_k8s.c \ $(CBM_DIR)/helpers.c \ $(CBM_DIR)/lang_specs.c @@ -176,6 +177,7 @@ PIPELINE_SRCS = \ src/pipeline/pass_envscan.c \ src/pipeline/pass_compile_commands.c \ src/pipeline/pass_infrascan.c \ + src/pipeline/pass_k8s.c \ src/pipeline/httplink.c # Traces module (new) diff --git a/README.md b/README.md index 4b28c57..2aff6fa 100644 --- a/README.md +++ b/README.md @@ -269,7 +269,7 @@ codebase-memory-mcp cli --raw search_graph '{"label": "Function"}' | jq '.result ### Node Labels -`Project`, `Package`, `Folder`, `File`, `Module`, `Class`, `Function`, `Method`, `Interface`, `Enum`, `Type`, `Route` +`Project`, `Package`, `Folder`, `File`, `Module`, `Class`, `Function`, `Method`, `Interface`, `Enum`, `Type`, `Route`, `Resource` ### Edge Types diff --git a/internal/cbm/cbm.c b/internal/cbm/cbm.c index 6162c7a..5b70d18 100644 --- a/internal/cbm/cbm.c +++ b/internal/cbm/cbm.c @@ -316,6 +316,11 @@ CBMFileResult *cbm_extract_file(const char *source, int source_len, CBMLanguage cbm_extract_imports(&ctx); cbm_extract_unified(&ctx); + // K8s / Kustomize semantic pass (additional structured extraction for YAML-based infra files). 
+ if (ctx.language == CBM_LANG_KUSTOMIZE || ctx.language == CBM_LANG_K8S) { + cbm_extract_k8s(&ctx); + } + // LSP type-aware call resolution uint64_t lsp_start = now_ns(); if (language == CBM_LANG_GO) { diff --git a/internal/cbm/cbm.h b/internal/cbm/cbm.h index 16b9dd0..6b49ae7 100644 --- a/internal/cbm/cbm.h +++ b/internal/cbm/cbm.h @@ -75,6 +75,8 @@ typedef enum { CBM_LANG_FORM, CBM_LANG_MAGMA, CBM_LANG_WOLFRAM, + CBM_LANG_KUSTOMIZE, // kustomization.yaml — Kubernetes overlay tool + CBM_LANG_K8S, // Generic Kubernetes manifest (apiVersion: detected) CBM_LANG_COUNT } CBMLanguage; @@ -361,4 +363,7 @@ void cbm_extract_type_assigns(CBMExtractCtx *ctx); // Single-pass unified extraction (replaces the 7 calls above except defs+imports). void cbm_extract_unified(CBMExtractCtx *ctx); +// K8s / Kustomize semantic extractor (called when language is CBM_LANG_K8S or CBM_LANG_KUSTOMIZE). +void cbm_extract_k8s(CBMExtractCtx *ctx); + #endif // CBM_H diff --git a/internal/cbm/extract_k8s.c b/internal/cbm/extract_k8s.c new file mode 100644 index 0000000..26bbb69 --- /dev/null +++ b/internal/cbm/extract_k8s.c @@ -0,0 +1,290 @@ +// extract_k8s.c — K8s manifest and Kustomize file extractor. +// +// For CBM_LANG_KUSTOMIZE: walks top-level block_mapping_pair nodes whose key +// matches "resources", "bases", "patches", "components", or +// "patchesStrategicMerge", then emits one CBMImport per block_sequence item. +// +// For CBM_LANG_K8S: finds apiVersion, kind, and metadata.name scalars in the +// first document's block_mapping and emits one CBMDefinition with label +// "Resource" and name "Kind/metadata-name". 
+ +#include "cbm.h" +#include "arena.h" +#include "helpers.h" +#include "tree_sitter/api.h" +#include +#include +#include + +// --------------------------------------------------------------------------- +// Internal helpers +// --------------------------------------------------------------------------- + +// Return the raw source text for a scalar node (plain, single-quoted, or +// double-quoted). Surrounding quote characters are stripped for quoted forms. +// Handles flow_node wrappers transparently by descending into the first named +// child (the tree-sitter YAML grammar often wraps scalars in flow_node). +// Returns NULL for non-scalar node types. +static const char *get_scalar_text(CBMArena *a, TSNode node, const char *source) { + const char *type = ts_node_type(node); + // Unwrap flow_node: the actual scalar is the first named child + if (strcmp(type, "flow_node") == 0) { + TSNode inner = ts_node_named_child(node, 0); + if (ts_node_is_null(inner)) { + return NULL; + } + return get_scalar_text(a, inner, source); + } + if (strcmp(type, "plain_scalar") == 0) { + return cbm_node_text(a, node, source); + } + if (strcmp(type, "double_quote_scalar") == 0 || strcmp(type, "single_quote_scalar") == 0) { + const char *raw = cbm_node_text(a, node, source); + if (!raw) { + return NULL; + } + size_t len = strlen(raw); + if (len >= 2) { + return cbm_arena_strndup(a, raw + 1, len - 2); + } + return raw; + } + return NULL; +} + +// Return true if the key text of a block_mapping_pair matches one of the +// Kustomize resource-list field names. 
+static int is_kustomize_list_key(const char *key) { + return (strcmp(key, "resources") == 0 || strcmp(key, "bases") == 0 || + strcmp(key, "patches") == 0 || strcmp(key, "components") == 0 || + strcmp(key, "patchesStrategicMerge") == 0 || strcmp(key, "crds") == 0); +} + +// --------------------------------------------------------------------------- +// Kustomize extraction +// --------------------------------------------------------------------------- + +// Walk a block_sequence node and emit one CBMImport per block_sequence_item +// scalar child, using key_name as the local_name. +static void emit_kustomize_sequence(CBMExtractCtx *ctx, TSNode seq_node, const char *key_name) { + CBMArena *a = ctx->arena; + uint32_t n = ts_node_child_count(seq_node); + for (uint32_t i = 0; i < n; i++) { + TSNode item = ts_node_child(seq_node, i); + if (strcmp(ts_node_type(item), "block_sequence_item") != 0) { + continue; + } + // block_sequence_item has one named child: the value + uint32_t ic = ts_node_child_count(item); + for (uint32_t j = 0; j < ic; j++) { + TSNode val = ts_node_child(item, j); + const char *scalar = get_scalar_text(a, val, ctx->source); + if (!scalar) { + continue; + } + CBMImport imp = { + .local_name = cbm_arena_strdup(a, key_name), + .module_path = cbm_arena_strdup(a, scalar), + }; + cbm_imports_push(&ctx->result->imports, a, imp); + } + } +} + +static void extract_kustomize(CBMExtractCtx *ctx) { + CBMArena *a = ctx->arena; + + // Traverse: stream -> document -> block_node -> block_mapping -> block_mapping_pair + TSNode root = ctx->root; + uint32_t root_n = ts_node_child_count(root); + for (uint32_t si = 0; si < root_n; si++) { + TSNode stream_child = ts_node_child(root, si); + if (strcmp(ts_node_type(stream_child), "document") != 0) { + continue; + } + // Find block_mapping inside the document (may be wrapped in block_node) + TSNode mapping = ts_node_named_child(stream_child, 0); + if (ts_node_is_null(mapping)) { + continue; + } + // Some grammars wrap in 
block_node + if (strcmp(ts_node_type(mapping), "block_node") == 0) { + mapping = ts_node_named_child(mapping, 0); + } + if (ts_node_is_null(mapping) || strcmp(ts_node_type(mapping), "block_mapping") != 0) { + continue; + } + + uint32_t pair_n = ts_node_child_count(mapping); + for (uint32_t pi = 0; pi < pair_n; pi++) { + TSNode pair = ts_node_child(mapping, pi); + if (strcmp(ts_node_type(pair), "block_mapping_pair") != 0) { + continue; + } + + // First named child = key + TSNode key_node = ts_node_named_child(pair, 0); + if (ts_node_is_null(key_node)) { + continue; + } + const char *key_text = get_scalar_text(a, key_node, ctx->source); + if (!key_text || !is_kustomize_list_key(key_text)) { + continue; + } + + // Second named child = value (should be a block_sequence or block_node wrapping one) + TSNode val_node = ts_node_named_child(pair, 1); + if (ts_node_is_null(val_node)) { + continue; + } + if (strcmp(ts_node_type(val_node), "block_node") == 0) { + val_node = ts_node_named_child(val_node, 0); + } + if (ts_node_is_null(val_node) || + strcmp(ts_node_type(val_node), "block_sequence") != 0) { + continue; + } + + emit_kustomize_sequence(ctx, val_node, key_text); + } + } +} + +// --------------------------------------------------------------------------- +// K8s manifest extraction +// --------------------------------------------------------------------------- + +// Descend into the first block_mapping of a document and extract apiVersion, +// kind, and metadata.name. Returns void; fills kind_buf and meta_name_buf. 
+static void extract_k8s_scalars(CBMExtractCtx *ctx, TSNode mapping, char *kind_buf, size_t kind_sz, + char *meta_name_buf, size_t meta_sz) { + CBMArena *a = ctx->arena; + kind_buf[0] = '\0'; + meta_name_buf[0] = '\0'; + + uint32_t n = ts_node_child_count(mapping); + for (uint32_t i = 0; i < n; i++) { + TSNode pair = ts_node_child(mapping, i); + if (strcmp(ts_node_type(pair), "block_mapping_pair") != 0) { + continue; + } + TSNode key_node = ts_node_named_child(pair, 0); + if (ts_node_is_null(key_node)) { + continue; + } + const char *key = get_scalar_text(a, key_node, ctx->source); + if (!key) { + continue; + } + + TSNode val_node = ts_node_named_child(pair, 1); + if (ts_node_is_null(val_node)) { + continue; + } + // Unwrap block_node if present + if (strcmp(ts_node_type(val_node), "block_node") == 0) { + val_node = ts_node_named_child(val_node, 0); + } + if (ts_node_is_null(val_node)) { + continue; + } + + if (strcmp(key, "kind") == 0) { + const char *v = get_scalar_text(a, val_node, ctx->source); + if (v) { + snprintf(kind_buf, kind_sz, "%s", v); + } + } else if (strcmp(key, "metadata") == 0) { + // Descend into metadata block_mapping to find "name" + // val_node is already unwrapped from block_node above. 
+ TSNode meta_mapping = val_node; + if (ts_node_is_null(meta_mapping) || + strcmp(ts_node_type(meta_mapping), "block_mapping") != 0) { + continue; + } + uint32_t mn = ts_node_child_count(meta_mapping); + for (uint32_t mi = 0; mi < mn; mi++) { + TSNode mpair = ts_node_child(meta_mapping, mi); + if (strcmp(ts_node_type(mpair), "block_mapping_pair") != 0) { + continue; + } + TSNode mkey = ts_node_named_child(mpair, 0); + if (ts_node_is_null(mkey)) { + continue; + } + const char *mkey_text = get_scalar_text(a, mkey, ctx->source); + if (!mkey_text || strcmp(mkey_text, "name") != 0) { + continue; + } + TSNode mval = ts_node_named_child(mpair, 1); + if (ts_node_is_null(mval)) { + continue; + } + const char *meta_name = get_scalar_text(a, mval, ctx->source); + if (meta_name) { + snprintf(meta_name_buf, meta_sz, "%s", meta_name); + } + } + } + } +} + +static void extract_k8s_manifest(CBMExtractCtx *ctx) { + CBMArena *a = ctx->arena; + + TSNode root = ctx->root; + uint32_t root_n = ts_node_child_count(root); + for (uint32_t si = 0; si < root_n; si++) { + TSNode stream_child = ts_node_child(root, si); + if (strcmp(ts_node_type(stream_child), "document") != 0) { + continue; + } + + TSNode mapping = ts_node_named_child(stream_child, 0); + if (ts_node_is_null(mapping)) { + continue; + } + if (strcmp(ts_node_type(mapping), "block_node") == 0) { + mapping = ts_node_named_child(mapping, 0); + } + if (ts_node_is_null(mapping) || strcmp(ts_node_type(mapping), "block_mapping") != 0) { + continue; + } + + char kind_buf[256] = {0}; + char meta_name_buf[256] = {0}; + extract_k8s_scalars(ctx, mapping, kind_buf, sizeof(kind_buf), meta_name_buf, + sizeof(meta_name_buf)); + + // Skip malformed manifests (no kind or no metadata.name) + if (kind_buf[0] == '\0' || meta_name_buf[0] == '\0') { + continue; + } + + char def_name[512]; + snprintf(def_name, sizeof(def_name), "%s/%s", kind_buf, meta_name_buf); + + CBMDefinition def = {0}; + def.name = cbm_arena_strdup(a, def_name); + 
def.qualified_name = cbm_arena_sprintf(a, "%s.%s", ctx->module_qn, def_name); + def.label = cbm_arena_strdup(a, "Resource"); + def.file_path = ctx->rel_path; + def.start_line = ts_node_start_point(mapping).row + 1; + def.end_line = ts_node_end_point(mapping).row + 1; + cbm_defs_push(&ctx->result->defs, a, def); + + break; // Only the first document per file + } +} + +// --------------------------------------------------------------------------- +// Public entry point +// --------------------------------------------------------------------------- + +void cbm_extract_k8s(CBMExtractCtx *ctx) { + if (ctx->language == CBM_LANG_KUSTOMIZE) { + extract_kustomize(ctx); + } else if (ctx->language == CBM_LANG_K8S) { + extract_k8s_manifest(ctx); + } +} diff --git a/internal/cbm/lang_specs.c b/internal/cbm/lang_specs.c index a4fb809..426db94 100644 --- a/internal/cbm/lang_specs.c +++ b/internal/cbm/lang_specs.c @@ -1041,6 +1041,16 @@ static const CBMLangSpec lang_specs[CBM_LANG_COUNT] = { {CBM_LANG_WOLFRAM, wolfram_func_types, empty_types, empty_types, wolfram_module_types, wolfram_call_types, wolfram_import_types, empty_types, empty_types, empty_types, empty_types, empty_types, NULL, empty_types, NULL, NULL}, + + // CBM_LANG_KUSTOMIZE — reuses YAML grammar; semantic extraction via cbm_extract_k8s() + {CBM_LANG_KUSTOMIZE, empty_types, empty_types, empty_types, yaml_module_types, empty_types, + empty_types, empty_types, empty_types, empty_types, empty_types, empty_types, NULL, + empty_types, NULL, NULL}, + + // CBM_LANG_K8S — reuses YAML grammar; semantic extraction via cbm_extract_k8s() + {CBM_LANG_K8S, empty_types, empty_types, empty_types, yaml_module_types, empty_types, + empty_types, empty_types, empty_types, empty_types, empty_types, empty_types, NULL, + empty_types, NULL, NULL}, }; const CBMLangSpec *cbm_lang_spec(CBMLanguage lang) { @@ -1180,6 +1190,9 @@ const TSLanguage *cbm_ts_language(CBMLanguage lang) { return tree_sitter_magma(); case CBM_LANG_WOLFRAM: return 
tree_sitter_wolfram(); + case CBM_LANG_KUSTOMIZE: + case CBM_LANG_K8S: + return tree_sitter_yaml(); default: return NULL; } diff --git a/src/discover/language.c b/src/discover/language.c index b7eb7e4..ca91770 100644 --- a/src/discover/language.c +++ b/src/discover/language.c @@ -269,10 +269,21 @@ typedef struct { } filename_entry_t; static const filename_entry_t FILENAME_TABLE[] = { - {"CMakeLists.txt", CBM_LANG_CMAKE}, {"Dockerfile", CBM_LANG_DOCKERFILE}, - {"GNUmakefile", CBM_LANG_MAKEFILE}, {"Makefile", CBM_LANG_MAKEFILE}, - {"makefile", CBM_LANG_MAKEFILE}, {"meson.build", CBM_LANG_MESON}, - {"meson.options", CBM_LANG_MESON}, {"meson_options.txt", CBM_LANG_MESON}, + {"CMakeLists.txt", CBM_LANG_CMAKE}, + {"Dockerfile", CBM_LANG_DOCKERFILE}, + {"GNUmakefile", CBM_LANG_MAKEFILE}, + {"Makefile", CBM_LANG_MAKEFILE}, + {"makefile", CBM_LANG_MAKEFILE}, + {"meson.build", CBM_LANG_MESON}, + {"meson.options", CBM_LANG_MESON}, + {"meson_options.txt", CBM_LANG_MESON}, + {"kustomization.yaml", CBM_LANG_KUSTOMIZE}, + {"kustomization.yml", CBM_LANG_KUSTOMIZE}, + /* Note: FILENAME_TABLE uses case-sensitive strcmp, so mixed-case variants + * (e.g. "Kustomization.yaml") are not matched here. They fall through to + * CBM_LANG_YAML and are re-classified by cbm_is_kustomize_file() in + * pass_k8s.c, which performs a case-insensitive comparison. This is the + * intended behaviour — no additional entries are needed. 
*/ {".vimrc", CBM_LANG_VIMSCRIPT}, }; @@ -345,6 +356,8 @@ static const char *LANG_NAMES[CBM_LANG_COUNT] = { [CBM_LANG_FORM] = "FORM", [CBM_LANG_MAGMA] = "Magma", [CBM_LANG_WOLFRAM] = "Wolfram", + [CBM_LANG_KUSTOMIZE] = "Kustomize", + [CBM_LANG_K8S] = "Kubernetes", }; /* ── Public API ──────────────────────────────────────────────────── */ diff --git a/src/pipeline/pass_infrascan.c b/src/pipeline/pass_infrascan.c index 30c47d9..1e30b71 100644 --- a/src/pipeline/pass_infrascan.c +++ b/src/pipeline/pass_infrascan.c @@ -192,6 +192,27 @@ bool cbm_is_env_file(const char *name) { return false; } +bool cbm_is_kustomize_file(const char *name) { + if (!name) { + return false; + } + char lower[256]; + to_lower(name, lower, sizeof(lower)); + return (strcmp(lower, "kustomization.yaml") == 0 || strcmp(lower, "kustomization.yml") == 0); +} + +// NOLINTNEXTLINE(bugprone-easily-swappable-parameters) +bool cbm_is_k8s_manifest(const char *name, const char *content) { + if (!name || !content || cbm_is_kustomize_file(name)) { + return false; + } + char buf[4097]; + size_t n = strnlen(content, 4096); + memcpy(buf, content, n); + buf[n] = '\0'; + return ci_strstr(buf, "apiVersion:") != NULL; +} + // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) bool cbm_is_shell_script(const char *name, const char *ext) { (void)name; diff --git a/src/pipeline/pass_k8s.c b/src/pipeline/pass_k8s.c new file mode 100644 index 0000000..03cc825 --- /dev/null +++ b/src/pipeline/pass_k8s.c @@ -0,0 +1,239 @@ +/* + * pass_k8s.c — Pipeline pass for Kubernetes manifest and Kustomize overlay processing. + * + * For each discovered YAML file: + * 1. Check if it is a kustomize overlay (kustomization.yaml / kustomization.yml) + * → emit a Module node and IMPORTS edges for each resources/bases/patches entry + * 2. 
Else if it is a generic k8s manifest (apiVersion: detected) + * → emit one Resource node per file (first document only — multi-document YAML is not yet supported) + * + * Depends on: pass_infrascan.c (cbm_is_kustomize_file, cbm_is_k8s_manifest, cbm_infra_qn), + * extraction layer (cbm.h), graph_buffer, pipeline internals. + */ +#include "pipeline/pipeline.h" +#include "pipeline/pipeline_internal.h" +#include "graph_buffer/graph_buffer.h" +#include "discover/discover.h" +#include "foundation/log.h" +#include "foundation/compat.h" +#include "cbm.h" + +#include +#include +#include + +/* ── Internal helpers ────────────────────────────────────────────── */ + +/* Read entire file into heap-allocated buffer. Returns NULL on error. + * Caller must free(). Sets *out_len to byte count. */ +static char *k8s_read_file(const char *path, int *out_len) { + FILE *f = fopen(path, "rb"); + if (!f) { + return NULL; + } + + (void)fseek(f, 0, SEEK_END); + long size = ftell(f); + (void)fseek(f, 0, SEEK_SET); + + if (size <= 0 || size > (long)100 * 1024 * 1024) { + (void)fclose(f); + return NULL; + } + + char *buf = malloc(size + 1); + if (!buf) { + (void)fclose(f); + return NULL; + } + + size_t nread = fread(buf, 1, size, f); + (void)fclose(f); + // NOLINTNEXTLINE(clang-analyzer-security.ArrayBound) + buf[nread] = '\0'; + *out_len = (int)nread; + return buf; +} + +/* Format int to string for logging. Thread-safe via TLS. */ +static const char *itoa_k8s(int val) { + static CBM_TLS char bufs[4][32]; + static CBM_TLS int idx = 0; + int i = idx; + idx = (idx + 1) & 3; + snprintf(bufs[i], sizeof(bufs[i]), "%d", val); + return bufs[i]; +} + +/* Extract the basename of a path (pointer into the string; no allocation). */ +static const char *k8s_basename(const char *path) { + const char *p = strrchr(path, '/'); + return p ? 
p + 1 : path; +} + +/* ── Kustomize handler ───────────────────────────────────────────── */ + +static void handle_kustomize(cbm_pipeline_ctx_t *ctx, const char *path, const char *rel_path, + CBMFileResult *result) { + /* Emit Module node for this kustomize overlay file */ + char *mod_qn = cbm_infra_qn(ctx->project_name, rel_path, "kustomize", NULL); + if (!mod_qn) { + return; + } + + // NOLINTNEXTLINE(misc-include-cleaner) + int64_t mod_id = cbm_gbuf_upsert_node(ctx->gbuf, "Module", k8s_basename(rel_path), mod_qn, + rel_path, 1, 0, "{\"source\":\"kustomize\"}"); + free(mod_qn); + + if (mod_id <= 0) { + return; + } + + /* If we have a cached extraction result, emit IMPORTS edges for + * resources/bases/patches/components entries */ + int import_count = 0; + CBMFileResult *res = result; + bool allocated = false; + + if (!res) { + /* Fall back to re-extraction */ + int src_len = 0; + char *source = k8s_read_file(path, &src_len); + if (source) { + res = cbm_extract_file(source, src_len, CBM_LANG_KUSTOMIZE, ctx->project_name, rel_path, + CBM_EXTRACT_BUDGET, NULL, NULL); + free(source); + allocated = true; + } + } + + if (res) { + for (int j = 0; j < res->imports.count; j++) { + CBMImport *imp = &res->imports.items[j]; + if (!imp->module_path) { + continue; + } + + /* Compute target file QN */ + char *target_qn = + cbm_pipeline_fqn_compute(ctx->project_name, imp->module_path, "__file__"); + if (!target_qn) { + continue; + } + + const cbm_gbuf_node_t *target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + free(target_qn); + + if (target) { + cbm_gbuf_insert_edge(ctx->gbuf, mod_id, target->id, "IMPORTS", + "{\"via\":\"kustomize\"}"); + import_count++; + } + } + + if (allocated) { + cbm_free_result(res); + } + } + + cbm_log_info("pass.k8s.kustomize", "file", rel_path, "imports", itoa_k8s(import_count)); +} + +/* ── K8s manifest handler ────────────────────────────────────────── */ + +/* source/src_len are the already-read file bytes (caller retains ownership and + * must 
free after this call returns). */ +static void handle_k8s_manifest(cbm_pipeline_ctx_t *ctx, const char *path, const char *rel_path, + const char *source, int src_len) { + (void)path; /* retained for symmetry; source is always provided now */ + int resource_count = 0; + + CBMFileResult *res = cbm_extract_file(source, src_len, CBM_LANG_K8S, ctx->project_name, + rel_path, CBM_EXTRACT_BUDGET, NULL, NULL); + if (!res) { + return; + } + + /* Compute file node QN for DEFINES edges */ + char *file_qn = cbm_pipeline_fqn_compute(ctx->project_name, rel_path, "__file__"); + const cbm_gbuf_node_t *file_node = file_qn ? cbm_gbuf_find_by_qn(ctx->gbuf, file_qn) : NULL; + free(file_qn); + + for (int d = 0; d < res->defs.count; d++) { + CBMDefinition *def = &res->defs.items[d]; + if (!def->label || strcmp(def->label, "Resource") != 0) { + continue; + } + if (!def->name || !def->qualified_name) { + continue; + } + + // NOLINTNEXTLINE(misc-include-cleaner) + int64_t node_id = + cbm_gbuf_upsert_node(ctx->gbuf, "Resource", def->name, def->qualified_name, rel_path, + (int)def->start_line, (int)def->end_line, "{\"source\":\"k8s\"}"); + + /* DEFINES edge: File → Resource */ + if (file_node && node_id > 0) { + cbm_gbuf_insert_edge(ctx->gbuf, file_node->id, node_id, "DEFINES", "{}"); + } + + resource_count++; + } + + cbm_free_result(res); + + cbm_log_info("pass.k8s.manifest", "file", rel_path, "resources", itoa_k8s(resource_count)); +} + +/* ── Pass entry point ────────────────────────────────────────────── */ + +// NOLINTNEXTLINE(misc-include-cleaner) — cbm_file_info_t provided by standard header +int cbm_pipeline_pass_k8s(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, int file_count) { + cbm_log_info("pass.start", "pass", "k8s", "files", itoa_k8s(file_count)); + + cbm_init(); + + int kustomize_count = 0; + int manifest_count = 0; + + for (int i = 0; i < file_count; i++) { + if (cbm_pipeline_check_cancel(ctx)) { + return -1; + } + + const char *path = files[i].path; + const char 
*rel = files[i].rel_path; + CBMLanguage lang = files[i].language; + const char *base = k8s_basename(rel); + + CBMFileResult *cached = + (ctx->result_cache && ctx->result_cache[i]) ? ctx->result_cache[i] : NULL; + + if (cbm_is_kustomize_file(base)) { + handle_kustomize(ctx, path, rel, cached); + kustomize_count++; + } else if (lang == CBM_LANG_YAML || lang == CBM_LANG_K8S) { + /* Read source once to classify (and reuse for uncached extraction). */ + int src_len = 0; + char *source = k8s_read_file(path, &src_len); + if (source) { + if (cbm_is_k8s_manifest(base, source)) { + /* Always re-extract with CBM_LANG_K8S regardless of any cached + * result: cached results were produced during the parallel YAML + * pass and contain no "Resource" definitions. Pass the already- + * read source buffer so handle_k8s_manifest does not re-read. */ + (void)cached; /* cached YAML result intentionally discarded */ + handle_k8s_manifest(ctx, path, rel, source, src_len); + manifest_count++; + } + free(source); + } + } + } + + cbm_log_info("pass.done", "pass", "k8s", "kustomize", itoa_k8s(kustomize_count), "manifests", + itoa_k8s(manifest_count)); + return 0; +} diff --git a/src/pipeline/pipeline.c b/src/pipeline/pipeline.c index f5b7510..303ed08 100644 --- a/src/pipeline/pipeline.c +++ b/src/pipeline/pipeline.c @@ -494,6 +494,16 @@ int cbm_pipeline_run(cbm_pipeline_t *p) { rc = -1; goto cleanup; } + + cbm_clock_gettime(CLOCK_MONOTONIC, &t); + rc = cbm_pipeline_pass_k8s(&ctx, files, file_count); + if (rc != 0) { /* log warning, continue */ + } + cbm_log_info("pass.timing", "pass", "k8s", "elapsed_ms", itoa_buf((int)elapsed_ms(t))); + if (check_cancel(p)) { + rc = -1; + goto cleanup; + } } else { cbm_log_info("pipeline.mode", "mode", "sequential", "files", itoa_buf(file_count)); @@ -518,6 +528,16 @@ int cbm_pipeline_run(cbm_pipeline_t *p) { goto seq_cleanup; } + cbm_clock_gettime(CLOCK_MONOTONIC, &t); + rc = cbm_pipeline_pass_k8s(&ctx, files, file_count); + if (rc != 0) { /* log warning, 
continue */ + } + cbm_log_info("pass.timing", "pass", "k8s", "elapsed_ms", itoa_buf((int)elapsed_ms(t))); + if (check_cancel(p)) { + rc = -1; + goto seq_cleanup; + } + cbm_clock_gettime(CLOCK_MONOTONIC, &t); rc = cbm_pipeline_pass_calls(&ctx, files, file_count); if (rc != 0) { diff --git a/src/pipeline/pipeline_incremental.c b/src/pipeline/pipeline_incremental.c index 78e2ffd..1799f83 100644 --- a/src/pipeline/pipeline_incremental.c +++ b/src/pipeline/pipeline_incremental.c @@ -265,6 +265,18 @@ int cbm_pipeline_run_incremental(cbm_pipeline_t *p, const char *db_path, cbm_fil cbm_log_info("pass.timing", "pass", "incr_semantic", "elapsed_ms", itoa_buf((int)elapsed_ms(t))); + /* k8s pass runs after semantic (vs. after definitions in the full pipeline) because + * incremental has no parallel extraction phase to position it alongside. + * Note: File→Resource DEFINES edges and cross-file kustomize IMPORTS edges are not + * emitted here — File nodes (from pass_structure) are absent in the incremental gbuf, + * and gbuf_find_by_qn only resolves nodes from changed files. This is a known + * structural limitation of the incremental architecture. 
*/ + cbm_clock_gettime(CLOCK_MONOTONIC, &t); + if (cbm_pipeline_pass_k8s(&ctx, changed_files, ci) != 0) { + cbm_log_info("incremental.warn", "msg", "k8s_pass_failed"); + } + cbm_log_info("pass.timing", "pass", "incr_k8s", "elapsed_ms", itoa_buf((int)elapsed_ms(t))); + /* Merge new nodes/edges from gbuf into disk DB */ int new_nodes = cbm_gbuf_node_count(gbuf); int new_edges = cbm_gbuf_edge_count(gbuf); diff --git a/src/pipeline/pipeline_internal.h b/src/pipeline/pipeline_internal.h index c1d45f1..88862ff 100644 --- a/src/pipeline/pipeline_internal.h +++ b/src/pipeline/pipeline_internal.h @@ -218,6 +218,8 @@ bool cbm_is_compose_file(const char *name); bool cbm_is_cloudbuild_file(const char *name); bool cbm_is_env_file(const char *name); bool cbm_is_shell_script(const char *name, const char *ext); +bool cbm_is_kustomize_file(const char *name); +bool cbm_is_k8s_manifest(const char *name, const char *content); /* Secret detection */ bool cbm_is_secret_binding(const char *key, const char *value); @@ -353,6 +355,8 @@ int cbm_parallel_resolve(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, int cbm_pipeline_pass_definitions(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, int file_count); +int cbm_pipeline_pass_k8s(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, int file_count); + int cbm_pipeline_pass_calls(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, int file_count); /* Sub-passes called from pass_calls: pattern-based edge extraction */ diff --git a/tests/test_pipeline.c b/tests/test_pipeline.c index c2302a2..140a267 100644 --- a/tests/test_pipeline.c +++ b/tests/test_pipeline.c @@ -3331,6 +3331,30 @@ TEST(infra_is_dockerfile) { PASS(); } +TEST(infra_is_kustomize_file) { + ASSERT(cbm_is_kustomize_file("kustomization.yaml")); + ASSERT(cbm_is_kustomize_file("kustomization.yml")); + ASSERT(cbm_is_kustomize_file("KUSTOMIZATION.YAML")); /* case-insensitive */ + ASSERT(!cbm_is_kustomize_file("deployment.yaml")); + 
ASSERT(!cbm_is_kustomize_file("kustomize.yaml")); + ASSERT(!cbm_is_kustomize_file(NULL)); + PASS(); +} + +TEST(infra_is_k8s_manifest) { /* detection depends on file content; kustomize files and NULL arguments are rejected */ + const char *deploy = "apiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: my-app\n"; + const char *plain = "name: foo\nvalue: bar\n"; + const char *kust = "apiVersion: kustomize.config.k8s.io/v1beta1\nkind: Kustomization\n"; + + ASSERT(cbm_is_k8s_manifest("deployment.yaml", deploy)); + ASSERT(!cbm_is_k8s_manifest("deployment.yaml", plain)); + /* kustomize file should return false even if it has apiVersion */ + ASSERT(!cbm_is_k8s_manifest("kustomization.yaml", kust)); + ASSERT(!cbm_is_k8s_manifest(NULL, deploy)); + ASSERT(!cbm_is_k8s_manifest("deployment.yaml", NULL)); + PASS(); +} + TEST(infra_is_env_file) { ASSERT(cbm_is_env_file(".env")); ASSERT(cbm_is_env_file(".env.local")); @@ -4139,6 +4163,114 @@ TEST(infra_pipeline_idempotent) { PASS(); } +/* ── K8s / Kustomize extraction tests ──────────────────────────── */ + +TEST(k8s_extract_kustomize) { /* each resources: entry must show up as an import module_path */ + const char *src = + "apiVersion: kustomize.config.k8s.io/v1beta1\n" + "kind: Kustomization\n" + "resources:\n" + " - deployment.yaml\n" + " - service.yaml\n"; + CBMFileResult *r = cbm_extract_file(src, (int)strlen(src), CBM_LANG_KUSTOMIZE, + "myproj", "base/kustomization.yaml", + 0, NULL, NULL); + ASSERT(r != NULL); + ASSERT_GTE(r->imports.count, 2); + + bool found_deploy = false, found_svc = false; + for (int i = 0; i < r->imports.count; i++) { + if (r->imports.items[i].module_path && + strcmp(r->imports.items[i].module_path, "deployment.yaml") == 0) + found_deploy = true; + if (r->imports.items[i].module_path && + strcmp(r->imports.items[i].module_path, "service.yaml") == 0) + found_svc = true; + } + ASSERT_TRUE(found_deploy); + ASSERT_TRUE(found_svc); + + cbm_free_result(r); + PASS(); +} + +TEST(k8s_extract_manifest) { /* expects a Resource def whose name contains Deployment */ + const char *src = + "apiVersion: apps/v1\n" + "kind: Deployment\n" + "metadata:\n" + " name: my-app\n" + " namespace: production\n"; + CBMFileResult *r = 
cbm_extract_file(src, (int)strlen(src), CBM_LANG_K8S, + "myproj", "k8s/deployment.yaml", + 0, NULL, NULL); + ASSERT(r != NULL); + ASSERT_GTE(r->defs.count, 1); + + bool found_resource = false; + for (int d = 0; d < r->defs.count; d++) { + if (r->defs.items[d].label && + strcmp(r->defs.items[d].label, "Resource") == 0 && + r->defs.items[d].name && + strstr(r->defs.items[d].name, "Deployment") != NULL) + found_resource = true; + } + ASSERT_TRUE(found_resource); + + cbm_free_result(r); + PASS(); +} + +TEST(k8s_extract_manifest_no_name) { + const char *src = "apiVersion: apps/v1\nkind: Deployment\n"; + CBMFileResult *r = cbm_extract_file(src, (int)strlen(src), CBM_LANG_K8S, + "myproj", "k8s/deploy.yaml", 0, NULL, NULL); + ASSERT(r != NULL); + /* No crash — defs count may be 0 because metadata.name is absent */ + ASSERT(!r->has_error); + cbm_free_result(r); + PASS(); +} + +TEST(k8s_extract_manifest_multidoc) { + /* Two-document YAML separated by "---". + * extract_k8s_manifest contains a "break" after the first successful push, + * so it processes only the first document that has both kind and + * metadata.name. This test pins that behaviour: the first document's + * resource must be present and no crash must occur. + * + * Note: with some tree-sitter YAML grammar versions the root stream may + * expose both documents as siblings; the break still fires after the first + * successful def push, so defs.count is expected to be 1 (only >= 1 is asserted below). 
*/ + const char *src = + "apiVersion: apps/v1\n" + "kind: Deployment\n" + "metadata:\n" + " name: my-app\n" + "---\n" + "apiVersion: v1\n" + "kind: Service\n" + "metadata:\n" + " name: my-svc\n"; + CBMFileResult *r = cbm_extract_file(src, (int)strlen(src), CBM_LANG_K8S, + "myproj", "k8s/multi.yaml", 0, NULL, NULL); + ASSERT(r != NULL); + ASSERT(!r->has_error); + /* First document's resource must be present */ + int found = 0; + for (int i = 0; i < r->defs.count; i++) { + if (r->defs.items[i].label && strcmp(r->defs.items[i].label, "Resource") == 0 && + r->defs.items[i].name && strcmp(r->defs.items[i].name, "Deployment/my-app") == 0) { + found = 1; + } + } + ASSERT(found); + /* At least one def; NOTE(review): the single-def upper bound is not enforced by this assertion */ + ASSERT(r->defs.count >= 1); + cbm_free_result(r); + PASS(); +} + /* ── Envscan tests (port of envscan_test.go) ───────────────────── */ /* Helper: write a file inside a temp dir */ @@ -4947,6 +5079,140 @@ TEST(incremental_new_file_added) { PASS(); } +TEST(incremental_k8s_manifest_indexed) { + /* Full index with a k8s manifest, then add a new manifest via incremental. + * Verifies that cbm_pipeline_pass_k8s() runs during incremental re-index. 
*/ + char tmpdir[256]; + snprintf(tmpdir, sizeof(tmpdir), "/tmp/cbm_k8s_incr_XXXXXX"); + if (!cbm_mkdtemp(tmpdir)) { + SKIP("tmpdir"); + } + char dbpath[512]; + snprintf(dbpath, sizeof(dbpath), "%s/test.db", tmpdir); + char path[512]; + FILE *f; + + /* Initial manifest */ + snprintf(path, sizeof(path), "%s/deploy.yaml", tmpdir); + f = fopen(path, "w"); + ASSERT_NOT_NULL(f); + fprintf(f, "apiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: my-app\n"); + fclose(f); + + /* Full index */ + cbm_pipeline_t *p = cbm_pipeline_new(tmpdir, dbpath, CBM_MODE_FULL); + ASSERT_NOT_NULL(p); + ASSERT_EQ(cbm_pipeline_run(p), 0); + char *project = strdup(cbm_pipeline_project_name(p)); + cbm_pipeline_free(p); + + /* Verify Resource node created by full index */ + cbm_store_t *s = cbm_store_open_path(dbpath); + ASSERT_NOT_NULL(s); + cbm_node_t *nodes = NULL; + int count = 0; + cbm_store_find_nodes_by_label(s, project, "Resource", &nodes, &count); + ASSERT_GT(count, 0); + cbm_store_free_nodes(nodes, count); + cbm_store_close(s); + + /* Add a second manifest — incremental should pick it up */ + snprintf(path, sizeof(path), "%s/svc.yaml", tmpdir); + f = fopen(path, "w"); + ASSERT_NOT_NULL(f); + fprintf(f, "apiVersion: v1\nkind: Service\nmetadata:\n name: my-svc\n"); + fclose(f); + + /* Incremental re-index. NOTE(review): still constructed with CBM_MODE_FULL; presumably the existing db is what makes this run incremental — confirm */ + p = cbm_pipeline_new(tmpdir, dbpath, CBM_MODE_FULL); + ASSERT_NOT_NULL(p); + ASSERT_EQ(cbm_pipeline_run(p), 0); + cbm_pipeline_free(p); + + /* Verify both Resource nodes now present */ + s = cbm_store_open_path(dbpath); + ASSERT_NOT_NULL(s); + nodes = NULL; + count = 0; + cbm_store_find_nodes_by_label(s, project, "Resource", &nodes, &count); + ASSERT_GTE(count, 2); + cbm_store_free_nodes(nodes, count); + cbm_store_close(s); + + free(project); + char cmd[512]; + snprintf(cmd, sizeof(cmd), "rm -rf '%s'", tmpdir); + (void)system(cmd); + PASS(); +} + +TEST(incremental_kustomize_module_indexed) { + /* Verifies that a kustomization.yaml added after the initial full index + * gets a 
Module node via the incremental k8s pass. */ + char tmpdir[256]; + snprintf(tmpdir, sizeof(tmpdir), "/tmp/cbm_kust_incr_XXXXXX"); + if (!cbm_mkdtemp(tmpdir)) { + SKIP("tmpdir"); + } + char dbpath[512]; + snprintf(dbpath, sizeof(dbpath), "%s/test.db", tmpdir); + char path[512]; + FILE *f; + + /* Initial resource manifest (gives full index something to find) */ + snprintf(path, sizeof(path), "%s/deploy.yaml", tmpdir); + f = fopen(path, "w"); + ASSERT_NOT_NULL(f); + fprintf(f, "apiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: my-app\n"); + fclose(f); + + /* Full index */ + cbm_pipeline_t *p = cbm_pipeline_new(tmpdir, dbpath, CBM_MODE_FULL); + ASSERT_NOT_NULL(p); + ASSERT_EQ(cbm_pipeline_run(p), 0); + char *project = strdup(cbm_pipeline_project_name(p)); + cbm_pipeline_free(p); + + /* Add kustomization.yaml */ + snprintf(path, sizeof(path), "%s/kustomization.yaml", tmpdir); + f = fopen(path, "w"); + ASSERT_NOT_NULL(f); + fprintf(f, "apiVersion: kustomize.config.k8s.io/v1beta1\n" + "kind: Kustomization\n" + "resources:\n" + " - deploy.yaml\n"); + fclose(f); + + /* Incremental re-index. NOTE(review): still constructed with CBM_MODE_FULL; presumably the existing db is what makes this run incremental — confirm */ + p = cbm_pipeline_new(tmpdir, dbpath, CBM_MODE_FULL); + ASSERT_NOT_NULL(p); + ASSERT_EQ(cbm_pipeline_run(p), 0); + cbm_pipeline_free(p); + + /* Verify a Module node exists whose properties_json mentions kustomize */ + cbm_store_t *s = cbm_store_open_path(dbpath); + ASSERT_NOT_NULL(s); + cbm_node_t *nodes = NULL; + int count = 0; + cbm_store_find_nodes_by_label(s, project, "Module", &nodes, &count); + bool found_kust = false; + for (int i = 0; i < count; i++) { + if (nodes[i].properties_json && strstr(nodes[i].properties_json, "kustomize")) { + found_kust = true; + break; + } + } + cbm_store_free_nodes(nodes, count); + cbm_store_close(s); + ASSERT_TRUE(found_kust); + + free(project); + char cmd[512]; + snprintf(cmd, sizeof(cmd), "rm -rf '%s'", tmpdir); + (void)system(cmd); + PASS(); +} + SUITE(pipeline) { /* Lifecycle */ RUN_TEST(pipeline_create_free); @@ -5055,6 +5321,8 @@ SUITE(pipeline) 
{ RUN_TEST(infra_is_cloudbuild_file); RUN_TEST(infra_is_shell_script); RUN_TEST(infra_is_dockerfile); + RUN_TEST(infra_is_kustomize_file); + RUN_TEST(infra_is_k8s_manifest); RUN_TEST(infra_is_env_file); RUN_TEST(infra_clean_json_brackets); RUN_TEST(infra_secret_detection); @@ -5083,6 +5351,11 @@ SUITE(pipeline) { /* Infrascan: pipeline integration */ RUN_TEST(infra_pipeline_integration); RUN_TEST(infra_pipeline_idempotent); + /* K8s / Kustomize extraction */ + RUN_TEST(k8s_extract_kustomize); + RUN_TEST(k8s_extract_manifest); + RUN_TEST(k8s_extract_manifest_no_name); + RUN_TEST(k8s_extract_manifest_multidoc); /* Env URL scanning */ RUN_TEST(envscan_dockerfile_env_urls); RUN_TEST(envscan_shell_env_urls); @@ -5130,4 +5403,6 @@ SUITE(pipeline) { RUN_TEST(incremental_detects_changed_file); RUN_TEST(incremental_detects_deleted_file); RUN_TEST(incremental_new_file_added); + RUN_TEST(incremental_k8s_manifest_indexed); + RUN_TEST(incremental_kustomize_module_indexed); }