Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

- Experimental: `GraphSchemaExtractionOutput`, `ExtractedNodeType`, `ExtractedRelationshipType`, and `ExtractedPropertyType` in `neo4j_graphrag.experimental.components.graph_schema_extraction` for schema-from-text LLM structured output; `Neo4jPropertyTypeName` type alias on `PropertyType`; `GraphSchema.from_extraction_output` and `validate_extraction_dict_to_graph_schema`; `make_strict_json_schema_for_structured_output` in `neo4j_graphrag.utils.json_schema_structured_output`.
- Experimental KG schemas: `GraphConstraintType` (`UNIQUENESS`, `EXISTENCE`) and extended `ConstraintType` so `EXISTENCE` can target a node property or a relationship property; graph pruning and schema visualization respect `EXISTENCE` constraints.
- Experimental: `GraphConstraintType.KEY` (Neo4j NODE KEY / RELATIONSHIP KEY, single property) on `GraphSchema.constraints`; pruning treats KEY like EXISTENCE for mandatory (non-null) properties. UNIQUENESS and KEY cannot target the same node property. Helpers: `key_property_names_for_node`, `key_property_names_for_relationship`, `uniqueness_property_names_for_node`, `mandatory_property_names_for_node`, `mandatory_property_names_for_relationship`.
- Experimental: `SchemaFromExistingGraphExtractor` maps Neo4j `NODE_KEY` / `RELATIONSHIP_KEY` metadata to `GraphConstraintType.KEY` (existence-only constraints still map to `EXISTENCE`).
- `LLMBase`: new abstract base class (`neo4j_graphrag.llm.LLMBase`) that combines `LLMInterface` and `LLMInterfaceV2`. Concrete LLM subclasses can extend `LLMBase` instead of both interfaces to avoid repeating overload boilerplate and to suppress mypy `[no-overload-impl]` / `[no-redef]` errors.
- MarkdownLoader (experimental): added a Markdown loader to support `.md` and `.markdown` files.
- Added Amazon Bedrock support: `BedrockLLM` (generation/tool calling) via the boto3 Converse API, and `BedrockEmbeddings` (embeddings) via the boto3 InvokeModel API.
Expand All @@ -16,6 +18,8 @@

### Changed

- **Breaking (experimental):** `ParquetWriter` success metadata `files[].columns` entries now include `is_unique` (bool) alongside `is_primary_key`. UNIQUENESS constraints set `is_unique: true` only; KEY constraints set `is_primary_key: true` (and `is_unique: false`). Synthetic `__id__` / relationship `from`/`to` columns keep `is_primary_key: true` where applicable. Consumers that assumed uniqueness constraints mapped to `is_primary_key` must use `is_unique` instead.
- Experimental Parquet helpers: `get_unique_properties_for_node_type` is deprecated (emits `DeprecationWarning`). It now mirrors `get_primary_key_column_names_for_node_type` (KEY / `__id__`), not UNIQUENESS-only property lists; use `get_uniqueness_property_names_for_node_type` or `get_primary_key_column_names_for_node_type` instead.
- Schema-from-text structured output (experimental): `SchemaFromTextExtractor` with `use_structured_output=True` now uses `GraphSchemaExtractionOutput` as `response_format` instead of `GraphSchema`, then converts to `GraphSchema` via `GraphSchema.from_extraction_output`. This keeps provider JSON schemas smaller while preserving the same runtime `GraphSchema` behavior.
- Experimental `GraphSchema`: `PropertyType.required` is deprecated in favor of `EXISTENCE` constraints on `GraphSchema.constraints`; legacy `required` flags are migrated on load. Uniqueness constraints no longer imply property existence—model mandatory properties with `EXISTENCE` explicitly (aligned with Neo4j-style constraint semantics).
- SimpleKG pipeline (experimental): the `from_pdf` parameter is deprecated in favor of `from_file` (PDF and Markdown inputs). `from_pdf` still works but emits a deprecation warning and will be removed in a future version.
Expand Down
8 changes: 4 additions & 4 deletions src/neo4j_graphrag/experimental/components/graph_pruning.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ def _enforce_properties(
Enforce properties:
- Ensure property type: for now, just prevent having invalid property types (e.g. map)
- Filter out those that are not in schema (i.e., valid properties) if allowed properties is False.
- Check that all EXISTENCE-constrained properties are present and not null.
- Check that all EXISTENCE- or KEY-constrained properties are present and not null.
"""
type_safe_properties = self._ensure_property_types(
item.properties,
Expand Down Expand Up @@ -474,11 +474,11 @@ def _check_required_properties(
schema_item: Union[NodeType, RelationshipType],
item: Union[Neo4jNode, Neo4jRelationship],
) -> list[str]:
"""Returns properties missing per EXISTENCE constraints (must be present and not null)."""
"""Returns properties missing per mandatory constraints (EXISTENCE or KEY; non-null)."""
if isinstance(item, Neo4jNode):
required_prop_names = schema.existence_property_names_for_node(item.label)
required_prop_names = schema.mandatory_property_names_for_node(item.label)
else:
required_prop_names = schema.existence_property_names_for_relationship(
required_prop_names = schema.mandatory_property_names_for_relationship(
item.type
)
declared_names = {p.name for p in schema_item.properties}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class ExtractedConstraintType(BaseModel):
when building :class:`~neo4j_graphrag.experimental.components.schema.GraphSchema`, not here.
"""

type: Literal["UNIQUENESS", "EXISTENCE"]
type: Literal["UNIQUENESS", "EXISTENCE", "KEY"]
property_name: str
node_type: str = ""
relationship_type: str = ""
Expand Down
43 changes: 33 additions & 10 deletions src/neo4j_graphrag/experimental/components/kg_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,22 @@


def _build_columns_from_schema(
    schema: Any,
    primary_key_names: list[str],
    uniqueness_names: list[str],
) -> list[dict[str, Any]]:
    """Describe each PyArrow schema field as a column-metadata dict.

    Args:
        schema: A PyArrow ``Schema`` (typed ``Any`` to avoid a hard pyarrow import here).
        primary_key_names: Column names to flag with ``is_primary_key: True``.
        uniqueness_names: Column names to flag with ``is_unique: True``.

    Returns:
        One dict per field, in schema order, with keys ``name``, ``type``
        (source type from the formatter's type mapping), ``is_primary_key``,
        and ``is_unique``.
    """
    described: list[dict[str, Any]] = []
    for index in range(len(schema)):
        current = schema.field(index)
        column_name = current.name
        # Map the PyArrow type back to the formatter's source type label.
        type_info = Neo4jGraphParquetFormatter.pyarrow_type_to_type_info(current.type)
        described.append(
            {
                "name": column_name,
                "type": type_info.source_type,
                "is_primary_key": column_name in primary_key_names,
                "is_unique": column_name in uniqueness_names,
            }
        )
    return described
Expand Down Expand Up @@ -121,6 +125,9 @@ class KGWriterModel(DataModel):
- "statistics": dict with node_count, relationship_count, nodes_per_label,
rel_per_type, input_files_count, input_files_total_size_bytes.
- "files": list of file descriptors with file_path, etc. (ParquetWriter).
Each file entry includes ``columns``: a list of dicts with ``name``, ``type``,
``is_primary_key``, and ``is_unique`` (KEY / synthetic ``__id__`` / ``from``/``to``
vs UNIQUENESS constraints per :class:`~neo4j_graphrag.experimental.components.schema.GraphSchema`).
"""

status: Literal["SUCCESS", "FAILURE"]
Expand Down Expand Up @@ -355,7 +362,9 @@ async def run(
lexical_graph_config (LexicalGraphConfig): Used by the formatter for
lexical graph labels (e.g. __Entity__) and key properties.
schema (Optional[dict[str, Any]]): Optional GraphSchema as a dictionary for
uniqueness constraints and key properties. If not provided, ``__id__`` is used.
UNIQUENESS, KEY, and EXISTENCE constraints. Drives Parquet column metadata
(``is_unique`` vs ``is_primary_key``). If not provided, node files use ``__id__``
as the only primary-key column.
"""
try:
formatter = Neo4jGraphParquetFormatter(schema=schema)
Expand Down Expand Up @@ -384,9 +393,16 @@ async def run(
if meta.node_label is not None:
node_label_to_source_name[meta.node_label] = resolved_stem

pk_names = (
meta.primary_key_property_names
if meta.primary_key_property_names
else ["__id__"]
)
uq_names = meta.uniqueness_property_names or []
columns = _build_columns_from_schema(
meta.schema,
meta.key_properties or [],
pk_names,
uq_names,
)
name = meta.node_label or (
meta.labels[0] if meta.labels else resolved_stem
Expand Down Expand Up @@ -419,6 +435,7 @@ async def run(
columns = _build_columns_from_schema(
meta.schema,
["from", "to"],
[],
)
rel_name = (
f"{meta.relationship_head}_{meta.relationship_type}_{meta.relationship_tail}"
Expand All @@ -437,11 +454,17 @@ async def run(
"is_node": False,
"relationship_type": meta.relationship_type,
"start_node_source": start_node_source,
"start_node_primary_keys": meta.head_node_key_properties
or ["__id__"],
"start_node_primary_keys": (
meta.head_primary_key_property_names
if meta.head_primary_key_property_names
else ["__id__"]
),
"end_node_source": end_node_source,
"end_node_primary_keys": meta.tail_node_key_properties
or ["__id__"],
"end_node_primary_keys": (
meta.tail_primary_key_property_names
if meta.tail_primary_key_property_names
else ["__id__"]
),
}
)

Expand Down
145 changes: 97 additions & 48 deletions src/neo4j_graphrag/experimental/components/parquet_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

import logging
import unicodedata
import warnings
from collections import defaultdict
from dataclasses import dataclass, field
from io import BytesIO
Expand Down Expand Up @@ -75,38 +76,79 @@ def sanitize_parquet_filestem(name: str) -> str:
return result


def get_unique_properties_for_node_type(
schema: Optional[dict[str, Any]], node_label: str
) -> list[str]:
"""Extract unique property names from schema constraints for a given node type.

1. If the schema has constraints, use uniqueness constraints
2. Otherwise, fall back to "__id__"
def _constraint_relationship_type_unset(constraint: dict[str, Any]) -> bool:
rt = constraint.get("relationship_type")
return rt is None or (isinstance(rt, str) and rt.strip() == "")

Args:
schema: The GraphSchema as a dictionary (may contain 'constraints' key)
node_label: The label for the node type

Returns:
List of property names that have uniqueness constraints
"""
default = ["__id__"]
def get_uniqueness_property_names_for_node_type(
    schema: Optional[dict[str, Any]], node_label: str
) -> list[str]:
    """Return property names under a UNIQUENESS constraint for *node_label*.

    Order follows the schema's ``constraints`` list; constraints with an empty
    ``property_name`` are ignored. Returns an empty list for a missing schema.
    """
    if not schema:
        return []
    constraints = schema.get("constraints", ()) or ()
    return [
        entry.get("property_name", "")
        for entry in constraints
        if entry.get("type") == "UNIQUENESS"
        and entry.get("node_type", "") == node_label
        and entry.get("property_name", "")
    ]


def get_key_property_names_for_node_type(
    schema: Optional[dict[str, Any]], node_label: str
) -> list[str]:
    """Return property names under a node-scoped KEY constraint for *node_label*.

    A constraint counts only when its ``relationship_type`` is unset (missing,
    None, or blank), i.e. it targets a node rather than a relationship. Entries
    with an empty ``property_name`` are skipped; schema order is preserved.
    """
    if not schema:
        return []
    names: list[str] = []
    for entry in schema.get("constraints", ()) or ():
        if entry.get("type") != "KEY" or entry.get("node_type", "") != node_label:
            continue
        # Node scope only: skip constraints that name a relationship type.
        rel_type = entry.get("relationship_type")
        if not (rel_type is None or (isinstance(rel_type, str) and rel_type.strip() == "")):
            continue
        prop = entry.get("property_name", "")
        if prop:
            names.append(prop)
    return names


def get_primary_key_column_names_for_node_type(
    schema: Optional[dict[str, Any]], node_label: str
) -> list[str]:
    """Column names the KG writer flags as primary key for *node_label*.

    KEY-constrained property names win when present; otherwise the synthetic
    ``__id__`` column is the sole primary key.
    """
    return get_key_property_names_for_node_type(schema, node_label) or ["__id__"]

constraints = schema.get("constraints", ())
unique_properties: list[str] = []

for constraint in constraints:
# Check if this constraint applies to the node's label
if constraint.get("type") == "UNIQUENESS":
constraint_node_type = constraint.get("node_type", "")
if constraint_node_type == node_label:
property_name = constraint.get("property_name", "")
if property_name:
unique_properties.append(property_name)
def get_unique_properties_for_node_type(
    schema: Optional[dict[str, Any]], node_label: str
) -> list[str]:
    """Deprecated alias for :func:`get_primary_key_column_names_for_node_type`.

    Earlier releases returned UNIQUENESS-backed property names (falling back to
    ``__id__``). The function now follows **primary-key** semantics — KEY
    constraints, else ``__id__``. Migrate to
    :func:`get_uniqueness_property_names_for_node_type` or
    :func:`get_primary_key_column_names_for_node_type`.
    """
    deprecation_message = (
        "get_unique_properties_for_node_type is deprecated and its meaning has "
        "changed: it now mirrors get_primary_key_column_names_for_node_type (KEY / "
        "__id__), not UNIQUENESS-only lists. Use "
        "get_uniqueness_property_names_for_node_type or "
        "get_primary_key_column_names_for_node_type."
    )
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return get_primary_key_column_names_for_node_type(schema, node_label)
Comment thread
adamnsch marked this conversation as resolved.


@dataclass
Expand All @@ -123,10 +165,13 @@ class FileMetadata:
relationship_type: Optional[str] = None
relationship_head: Optional[str] = None
relationship_tail: Optional[str] = None
# Key property info - computed once by the formatter
key_properties: Optional[list[str]] = None # For nodes
head_node_key_properties: Optional[list[str]] = None # For relationships
tail_node_key_properties: Optional[list[str]] = None # For relationships
# Schema-driven column roles for KGWriter metadata (see ParquetWriter)
primary_key_property_names: Optional[list[str]] = None
uniqueness_property_names: Optional[list[str]] = None
head_primary_key_property_names: Optional[list[str]] = None
head_uniqueness_property_names: Optional[list[str]] = None
tail_primary_key_property_names: Optional[list[str]] = None
tail_uniqueness_property_names: Optional[list[str]] = None


@dataclass
Expand Down Expand Up @@ -311,20 +356,15 @@ def _nodes_to_rows(

return label_to_rows

def _get_key_property_name_for_label(self, node_label: str) -> Optional[str]:
"""Get the primary key property name for a node label from schema constraints.

Args:
node_label: The label of the node type

Returns:
The property name that is the primary key, or None if using default "__id__"
"""
unique_props = get_unique_properties_for_node_type(self.schema, node_label)
# If the only property is "__id__" (the default), return None to use node.id
if unique_props == ["__id__"]:
return None
return unique_props[0]
def _get_identity_property_name_for_label(self, node_label: str) -> Optional[str]:
    """Pick the natural-identity property for a node label, if any.

    Preference order: first KEY-constrained property, then first
    UNIQUENESS-constrained property. Returns None when neither exists,
    signalling callers to fall back to the synthetic ``__id__``.
    """
    key_names = get_key_property_names_for_node_type(self.schema, node_label)
    if key_names:
        return key_names[0]
    unique_names = get_uniqueness_property_names_for_node_type(self.schema, node_label)
    return unique_names[0] if unique_names else None

def _get_node_key_property_value(self, node: Neo4jNode) -> Any:
"""Get the primary key property value for a node.
Expand All @@ -341,7 +381,7 @@ def _get_node_key_property_value(self, node: Neo4jNode) -> Any:
Raises:
ValueError: If the node is missing the key property or if the property value is null
"""
key_prop = self._get_key_property_name_for_label(node.label)
key_prop = self._get_identity_property_name_for_label(node.label)
if not key_prop:
# there is no key property, we use the node ID
return node.id
Expand Down Expand Up @@ -549,7 +589,10 @@ def format_graph(
is_node=True,
labels=labels_list,
node_label=label,
key_properties=get_unique_properties_for_node_type(
primary_key_property_names=get_key_property_names_for_node_type(
self.schema, label
),
uniqueness_property_names=get_uniqueness_property_names_for_node_type(
self.schema, label
),
)
Expand Down Expand Up @@ -577,10 +620,16 @@ def format_graph(
relationship_type=rtype,
relationship_head=head_label,
relationship_tail=tail_label,
head_node_key_properties=get_unique_properties_for_node_type(
head_primary_key_property_names=get_key_property_names_for_node_type(
self.schema, head_label
),
tail_node_key_properties=get_unique_properties_for_node_type(
head_uniqueness_property_names=get_uniqueness_property_names_for_node_type(
self.schema, head_label
),
tail_primary_key_property_names=get_key_property_names_for_node_type(
self.schema, tail_label
),
tail_uniqueness_property_names=get_uniqueness_property_names_for_node_type(
self.schema, tail_label
),
)
Expand Down
Loading
Loading