diff --git a/.github/workflows/ci_energyml_utils_pull_request.yml b/.github/workflows/ci_energyml_utils_pull_request.yml index 50380a7..4015294 100644 --- a/.github/workflows/ci_energyml_utils_pull_request.yml +++ b/.github/workflows/ci_energyml_utils_pull_request.yml @@ -32,9 +32,9 @@ jobs: with: python-version: "3.10" - - name: Install dependencies + - name: Install dependencies (all extras) run: | - poetry install + poetry install --all-extras - name: Run pytest run: | diff --git a/.github/workflows/ci_energyml_utils_release.yml b/.github/workflows/ci_energyml_utils_release.yml index 21b882c..28854bd 100644 --- a/.github/workflows/ci_energyml_utils_release.yml +++ b/.github/workflows/ci_energyml_utils_release.yml @@ -3,7 +3,6 @@ ## SPDX-License-Identifier: Apache-2.0 ## --- - name: Publish release defaults: @@ -19,7 +18,6 @@ jobs: name: Build distribution runs-on: ubuntu-latest steps: - - name: Checkout code uses: actions/checkout@v4 with: @@ -28,7 +26,7 @@ jobs: - name: Install poetry uses: ./.github/actions/prepare-poetry with: - python-version: '3.10' + python-version: "3.10" - name: Build run: | @@ -56,7 +54,6 @@ jobs: needs: [build] runs-on: ubuntu-latest steps: - # Retrieve the code and GIT history so that poetry-dynamic-versioning knows which version to upload - name: Checkout code uses: actions/checkout@v4 @@ -72,7 +69,7 @@ jobs: - name: Install poetry uses: ./.github/actions/prepare-poetry with: - python-version: '3.10' + python-version: "3.10" - name: Upload to PyPI run: | diff --git a/energyml-utils/.gitignore b/energyml-utils/.gitignore index f672e3c..b0e48a8 100644 --- a/energyml-utils/.gitignore +++ b/energyml-utils/.gitignore @@ -39,32 +39,45 @@ src/energyml/utils/converter/ # Other files requirements.txt +.github/ + + #doc/ sample/ gen*/ manip* *.epc *.h5 -*.off -*.obj *.log -*.geojson *.json *.csv *.zip + *.xml *.json docs/*.md # DATA *.obj +*.off +*.mtl *.geojson *.vtk +*.vtp +*.vtu *.stl +rc/specs +rc/**/*.epc +rc/**/*.h5 +rc/**/*.hdf5 # WIP 
src/energyml/utils/wip* scripts -rc/camunda \ No newline at end of file +rc/camunda + + +# code profiling +*.prof \ No newline at end of file diff --git a/energyml-utils/README.md b/energyml-utils/README.md index b29c45c..b292c01 100644 --- a/energyml-utils/README.md +++ b/energyml-utils/README.md @@ -86,27 +86,32 @@ The **EpcStreamReader** provides memory-efficient handling of large EPC files th - **Smart Caching**: LRU (Least Recently Used) cache with configurable size - **Automatic EPC Version Detection**: Supports both CLASSIC and EXPANDED EPC formats - **Add/Remove/Update Operations**: Full CRUD operations with automatic file structure maintenance +- **Relationship Management**: Automatic or manual .rels file updates with parallel processing support +- **External Data Arrays**: Read/write HDF5, Parquet, CSV arrays with intelligent file caching - **Context Management**: Automatic resource cleanup with `with` statements - **Memory Monitoring**: Track cache efficiency and memory usage statistics ### Basic Usage ```python -from energyml.utils.epc_stream import EpcStreamReader +from energyml.utils.epc_stream import EpcStreamReader, RelsUpdateMode # Open EPC file with context manager (recommended) -with EpcStreamReader('large_file.epc', cache_size=50) as reader: +with EpcStreamReader('large_file.epc', + cache_size=50, + rels_update_mode=RelsUpdateMode.UPDATE_ON_CLOSE) as reader: # List all objects without loading them - print(f"Total objects: {reader.stats.total_objects}") + print(f"Total objects: {len(reader)}") # Get object by identifier - obj: Any = reader.get_object_by_identifier("uuid.version") + obj = reader.get_object("uuid.version") - # Get objects by type - features: List[Any] = reader.get_objects_by_type("BoundaryFeature") + # List objects by type (returns metadata, not full objects) + features = reader.list_objects(object_type="BoundaryFeature") + print(f"Found {len(features)} features") # Get all objects with same UUID - versions: List[Any] = 
reader.get_object_by_uuid("12345678-1234-1234-1234-123456789abc") + versions = reader.get_object_by_uuid("12345678-1234-1234-1234-123456789abc") ``` ### Adding Objects @@ -135,31 +140,31 @@ with EpcStreamReader('my_file.epc') as reader: ```python with EpcStreamReader('my_file.epc') as reader: - # Remove specific version by full identifier - success = reader.remove_object("uuid.version") + # Remove by full identifier + success = reader.delete_object("uuid.version") - # Remove ALL versions by UUID only - success = reader.remove_object("12345678-1234-1234-1234-123456789abc") + # Or use the alias + success = reader.remove_object("uuid.version") if success: - print("Object(s) removed successfully") + print("Object removed successfully") ``` ### Updating Objects ```python -... +from energyml.utils.epc_stream import EpcStreamReader from energyml.utils.introspection import set_attribute_from_path with EpcStreamReader('my_file.epc') as reader: # Get existing object - obj = reader.get_object_by_identifier("uuid.version") + obj = reader.get_object("uuid.version") # Modify the object set_attribute_from_path(obj, "citation.title", "Updated Title") # Update in EPC file - new_identifier = reader.update_object(obj) + new_identifier = reader.put_object(obj) print(f"Updated object: {new_identifier}") ``` @@ -190,23 +195,71 @@ with EpcStreamReader('my_file.epc') as reader: # Objects added will use the same format as the existing EPC file ``` +### Relationship Management + +```python +from energyml.utils.epc_stream import EpcStreamReader, RelsUpdateMode + +# Choose relationship update strategy +with EpcStreamReader('my_file.epc', + rels_update_mode=RelsUpdateMode.UPDATE_ON_CLOSE, + enable_parallel_rels=True) as reader: + + # Add/modify objects - rels updated automatically based on mode + reader.add_object(my_object) + + # Manual rebuild of all relationships (e.g., after bulk operations) + stats = reader.rebuild_all_rels(clean_first=True) + print(f"Rebuilt {stats['rels_files_created']} 
.rels files") +``` + +### External Data Arrays + +```python +import numpy as np + +with EpcStreamReader('my_file.epc') as reader: + # Read array from HDF5/Parquet/CSV + data = reader.read_array( + proxy=my_representation, + path_in_external="/geometry/points" + ) + + # Write array to external file + new_data = np.array([[1, 2, 3], [4, 5, 6]]) + success = reader.write_array( + proxy=my_representation, + path_in_external="/geometry/points", + array=new_data + ) + + # Get metadata without loading full array + metadata = reader.get_array_metadata(my_representation) + print(f"Array shape: {metadata.dimensions}, dtype: {metadata.array_type}") +``` + ### Advanced Usage ```python -# Initialize without preloading metadata for faster startup -reader = EpcStreamReader('huge_file.epc', preload_metadata=False, cache_size=200) +# Initialize with persistent ZIP connection for better performance +reader = EpcStreamReader('huge_file.epc', + keep_open=True, + cache_size=200, + enable_parallel_rels=True, + parallel_worker_ratio=10) try: - # Manual metadata loading when needed - reader._load_metadata() - # Get object dependencies deps = reader.get_object_dependencies("uuid.version") # Batch processing with memory monitoring for obj_type in ["BoundaryFeature", "PropertyKind"]: - objects = reader.get_objects_by_type(obj_type) - print(f"Processing {len(objects)} {obj_type} objects") + obj_list = reader.list_objects(object_type=obj_type) + print(f"Processing {len(obj_list)} {obj_type} objects") + + for metadata in obj_list: + obj = reader.get_object(metadata.identifier) + # Process object... 
finally: reader.close() # Manual cleanup if not using context manager @@ -240,25 +293,159 @@ $env:PYTHONPATH="src" ``` -## Validation examples : -An epc file: +## Poetry Script Examples : + +### Validation + +Validate an EPC file: ```bash poetry run validate --file "path/to/your/energyml/object.epc" *> output_logs.json ``` -An xml file: +Validate an XML file: ```bash poetry run validate --file "path/to/your/energyml/object.xml" *> output_logs.json ``` -A json file: +Validate a JSON file: ```bash poetry run validate --file "path/to/your/energyml/object.json" *> output_logs.json ``` -A folder containing Epc/xml/json files: +Validate a folder containing EPC/XML/JSON files: ```bash poetry run validate --file "path/to/your/folder" *> output_logs.json ``` +Ignore specific error types (e.g., INFO): +```bash +poetry run validate --file "path/to/file.epc" --ignore-err-type INFO *> output_logs.json +``` + +Group errors by their class for better organization: +```bash +poetry run validate --file "path/to/file.epc" --group-by-err-class *> output_logs.json +``` + +Include PRODML version errors in validation (by default they are ignored): +```bash +poetry run validate --file "path/to/file.epc" --ignore-prodml-version-errs *> output_logs.json +``` + +Combined example with multiple options: +```bash +poetry run validate --file "path/to/file.epc" -i INFO WARNING --group-by-err-class *> output_logs.json +``` + +### Extract 3D Representations + +Extract all representations from an EPC to OBJ files: +```bash +poetry run extract_3d --epc "path/to/file.epc" --output "output_folder" +``` + +Extract specific representations by UUID: +```bash +poetry run extract_3d --epc "path/to/file.epc" --output "output_folder" --uuid "uuid1" "uuid2" +``` + +Extract to OFF format without CRS displacement: +```bash +poetry run extract_3d --epc "path/to/file.epc" --output "output_folder" --file-format OFF --no-crs +``` + +### CSV to Dataset + +Convert CSV to HDF5: +```bash +poetry run csv_to_dataset --csv 
"data.csv" --output "output.h5" +``` + +Convert CSV to Parquet with custom delimiter: +```bash +poetry run csv_to_dataset --csv "data.csv" --output "output.parquet" --csv-delimiter ";" +``` + +With dataset name prefix: +```bash +poetry run csv_to_dataset --csv "data.csv" --output "output.h5" --prefix "/my/path/" +``` + +With column mapping (JSON file): +```bash +poetry run csv_to_dataset --csv "data.csv" --output "output.h5" --mapping "mapping.json" +``` + +With inline column mapping: +```bash +poetry run csv_to_dataset --csv "data.csv" --output "output.h5" --mapping-line '{"DATASET_A": ["COL1", "COL2"], "DATASET_B": ["COL3"]}' +``` + +### Generate Random Data + +Generate a random RESQML object in JSON: +```bash +poetry run generate_data --type "energyml.resqml.v2_2.resqmlv2.TriangulatedSetRepresentation" --file-format json +``` + +Generate a random object in XML: +```bash +poetry run generate_data --type "energyml.resqml.v2_0_1.resqmlv2.Grid2dRepresentation" --file-format xml +``` + +Using qualified type: +```bash +poetry run generate_data --type "resqml22.WellboreFeature" --file-format json +``` + +### XML to JSON Conversion + +Convert an XML file to JSON: +```bash +poetry run xml_to_json --file "path/to/object.xml" +``` + +Convert with custom output path: +```bash +poetry run xml_to_json --file "path/to/object.xml" --out "output.json" +``` + +Convert entire EPC to JSON array: +```bash +poetry run xml_to_json --file "path/to/file.epc" --out "output.json" +``` + +### JSON to XML Conversion + +Convert a JSON file to XML: +```bash +poetry run json_to_xml --file "path/to/object.json" +``` + +Convert with custom output directory: +```bash +poetry run json_to_xml --file "path/to/object.json" --out "output_folder/" +``` + +### Describe as CSV + +Generate a CSV description of all objects in a folder: +```bash +poetry run describe_as_csv --folder "path/to/folder" +``` + +With custom columns: +```bash +poetry run describe_as_csv --folder "path/to/folder" \ + --columnsNames 
"Title" "Type" "UUID" \ + --columnsValues "citation.title" "$qualifiedType" "Uuid" +``` + +Available special values for columnsValues: +- `$type`: Object Python type +- `$qualifiedType`: EnergyML qualified type +- `$contentType`: EnergyML content type +- `$path`: File path +- `$dor`: UUIDs of referenced objects + diff --git a/energyml-utils/docs/src/energyml/index.html b/energyml-utils/docs/src/energyml/index.html deleted file mode 100644 index c188265..0000000 --- a/energyml-utils/docs/src/energyml/index.html +++ /dev/null @@ -1,73 +0,0 @@ - - - - - - -src.energyml API documentation - - - - - - - - - - - -
-
-
-

Module src.energyml

-
-
-
- -Expand source code - -
# Copyright (c) 2023-2024 Geosiris.
-# SPDX-License-Identifier: Apache-2.0
-
-
-
-

Sub-modules

-
-
src.energyml.utils
-
-

The energyml.utils module. -It contains tools for energyml management …

-
-
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/energyml-utils/docs/src/energyml/utils/data/hdf.html b/energyml-utils/docs/src/energyml/utils/data/hdf.html deleted file mode 100644 index 08b2205..0000000 --- a/energyml-utils/docs/src/energyml/utils/data/hdf.html +++ /dev/null @@ -1,621 +0,0 @@ - - - - - - -src.energyml.utils.data.hdf API documentation - - - - - - - - - - - -
-
-
-

Module src.energyml.utils.data.hdf

-
-
-
- -Expand source code - -
# Copyright (c) 2023-2024 Geosiris.
-# SPDX-License-Identifier: Apache-2.0
-from dataclasses import dataclass
-from io import BytesIO
-from typing import Optional, List, Tuple, Any, Union
-
-import h5py
-
-from ..epc import Epc, get_obj_identifier, ObjectNotFoundNotException, \
-    EPCRelsRelationshipType
-from ..introspection import search_attribute_matching_name_with_path, search_attribute_matching_name, \
-    get_obj_uuid, get_object_attribute
-
-
-@dataclass
-class DatasetReader:
-    def read_array(self, source: str, path_in_external_file: str) -> Optional[List[Any]]:
-        return None
-
-    def get_array_dimension(self, source: str, path_in_external_file: str) -> Optional[List[Any]]:
-        return None
-
-
-@dataclass
-class ETPReader(DatasetReader):
-    def read_array(self, obj_uri: str, path_in_external_file: str) -> Optional[List[Any]]:
-        return None
-
-    def get_array_dimension(self, source: str, path_in_external_file: str) -> Optional[List[Any]]:
-        return None
-
-
-@dataclass
-class HDF5FileReader(DatasetReader):
-    def read_array(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[List[Any]]:
-        with h5py.File(source, "r") as f:
-            d_group = f[path_in_external_file]
-            return d_group[()].tolist()
-
-    def get_array_dimension(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[List[Any]]:
-        with h5py.File(source, "r") as f:
-            return list(f[path_in_external_file].shape)
-
-    def extract_h5_datasets(
-            self, input_h5: Union[BytesIO, str], output_h5: Union[BytesIO, str], h5_datasets_paths: List[str]
-    ) -> None:
-        """
-        Copy all dataset from :param input_h5 matching with paths in :param h5_datasets_paths into the :param output
-        :param input_h5:
-        :param output_h5:
-        :param h5_datasets_paths:
-        :return:
-        """
-        if len(h5_datasets_paths) > 0:
-            with h5py.File(output_h5, "w") as f_dest:
-                with h5py.File(input_h5, "r") as f_src:
-                    for dataset in h5_datasets_paths:
-                        f_dest.create_dataset(dataset, data=f_src[dataset])
-
-
-def get_hdf_reference(obj) -> List[Any]:
-    """
-    See :func:`get_hdf_reference_with_path`. Only the value is returned, not the dot path into the object
-    :param obj:
-    :return:
-    """
-    return [
-        val
-        for path, val in get_hdf_reference_with_path(obj=obj)
-    ]
-
-
-def get_hdf_reference_with_path(obj: any) -> List[Tuple[str, Any]]:
-    """
-    See :func:`search_attribute_matching_name_with_path`. Search an attribute with type matching regex
-    "(PathInHdfFile|PathInExternalFile)".
-
-    :param obj:
-    :return: [ (Dot_Path_In_Obj, value), ...]
-    """
-    return search_attribute_matching_name_with_path(
-        obj,
-        "(PathInHdfFile|PathInExternalFile)"
-    )
-
-
-def get_crs_obj(
-        context_obj: Any,
-        path_in_root: Optional[str] = None,
-        root_obj: Optional[Any] = None,
-        epc: Optional[Epc] = None
-) -> Optional[Any]:
-    """
-    Search for the CRS object related to :param:`context_obj` into the :param:`epc`
-    :param context_obj:
-    :param path_in_root:
-    :param root_obj:
-    :param epc:
-    :return:
-    """
-    crs_list = search_attribute_matching_name(context_obj, r"\.*Crs", search_in_sub_obj=True, deep_search=False)
-    if crs_list is not None and len(crs_list) > 0:
-        crs = epc.get_object_by_identifier(get_obj_identifier(crs_list[0]))
-        if crs is None:
-            crs = epc.get_object_by_uuid(get_obj_uuid(crs_list[0]))
-        if crs is None:
-            raise ObjectNotFoundNotException(get_obj_identifier(crs_list[0]))
-        if crs is not None:
-            return crs
-
-    if context_obj != root_obj:
-        upper_path = path_in_root[:path_in_root.rindex(".")]
-        if len(upper_path) > 0:
-            return get_crs_obj(
-                context_obj=get_object_attribute(root_obj, upper_path),
-                path_in_root=upper_path,
-                root_obj=root_obj,
-                epc=epc,
-            )
-
-    return None
-
-
-def get_hdf5_path_from_external_path(
-        external_path_obj: Any,
-        path_in_root: Optional[str] = None,
-        root_obj: Optional[Any] = None,
-        epc: Optional[Epc] = None
-) -> Optional[str]:
-    """
-    Return the hdf5 file path (Searches for "uri" attribute or in :param:`epc` rels files).
-    :param external_path_obj: can be an attribute of an ExternalDataArrayPart
-    :param path_in_root:
-    :param root_obj:
-    :param epc:
-    :return:
-    """
-    if isinstance(external_path_obj, str):
-        # external_path_obj is maybe an attribute of an ExternalDataArrayPart, now search upper in the object
-        upper_path = path_in_root[:path_in_root.rindex(".")]
-        return get_hdf5_path_from_external_path(
-            external_path_obj=get_object_attribute(root_obj, upper_path),
-            path_in_root=upper_path,
-            root_obj=root_obj,
-            epc=epc,
-        )
-    elif type(external_path_obj).__name__ == "ExternalDataArrayPart":
-        epc_folder = epc.get_epc_file_folder()
-        h5_uri = search_attribute_matching_name(external_path_obj, "uri")
-        if h5_uri is not None and len(h5_uri) > 0:
-            return f"{epc_folder}/{h5_uri[0]}"
-    else:
-        epc_folder = epc.get_epc_file_folder()
-        hdf_proxy = search_attribute_matching_name(external_path_obj, "HdfProxy")[0]
-        if hdf_proxy is not None:
-            hdf_proxy_obj = epc.get_object_by_identifier(get_obj_identifier(hdf_proxy))
-            if hdf_proxy_obj is not None:
-                for rel in epc.additional_rels.get(get_obj_identifier(hdf_proxy_obj), []):
-                    # print(f"\trel : {rel}")
-                    if rel.type_value == EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type():
-                        return f"{epc_folder}/{rel.target}"
-    return None
-
-
-
-
-
-
-
-

Functions

-
-
-def get_crs_obj(context_obj: Any, path_in_root: Optional[str] = None, root_obj: Optional[Any] = None, epc: Optional[Epc] = None) ‑> Optional[Any] -
-
-

Search for the CRS object related to :param:context_obj into the :param:epc -:param context_obj: -:param path_in_root: -:param root_obj: -:param epc: -:return:

-
- -Expand source code - -
def get_crs_obj(
-        context_obj: Any,
-        path_in_root: Optional[str] = None,
-        root_obj: Optional[Any] = None,
-        epc: Optional[Epc] = None
-) -> Optional[Any]:
-    """
-    Search for the CRS object related to :param:`context_obj` into the :param:`epc`
-    :param context_obj:
-    :param path_in_root:
-    :param root_obj:
-    :param epc:
-    :return:
-    """
-    crs_list = search_attribute_matching_name(context_obj, r"\.*Crs", search_in_sub_obj=True, deep_search=False)
-    if crs_list is not None and len(crs_list) > 0:
-        crs = epc.get_object_by_identifier(get_obj_identifier(crs_list[0]))
-        if crs is None:
-            crs = epc.get_object_by_uuid(get_obj_uuid(crs_list[0]))
-        if crs is None:
-            raise ObjectNotFoundNotException(get_obj_identifier(crs_list[0]))
-        if crs is not None:
-            return crs
-
-    if context_obj != root_obj:
-        upper_path = path_in_root[:path_in_root.rindex(".")]
-        if len(upper_path) > 0:
-            return get_crs_obj(
-                context_obj=get_object_attribute(root_obj, upper_path),
-                path_in_root=upper_path,
-                root_obj=root_obj,
-                epc=epc,
-            )
-
-    return None
-
-
-
-def get_hdf5_path_from_external_path(external_path_obj: Any, path_in_root: Optional[str] = None, root_obj: Optional[Any] = None, epc: Optional[Epc] = None) ‑> Optional[str] -
-
-

Return the hdf5 file path (Searches for "uri" attribute or in :param:epc rels files). -:param external_path_obj: can be an attribute of an ExternalDataArrayPart -:param path_in_root: -:param root_obj: -:param epc: -:return:

-
- -Expand source code - -
def get_hdf5_path_from_external_path(
-        external_path_obj: Any,
-        path_in_root: Optional[str] = None,
-        root_obj: Optional[Any] = None,
-        epc: Optional[Epc] = None
-) -> Optional[str]:
-    """
-    Return the hdf5 file path (Searches for "uri" attribute or in :param:`epc` rels files).
-    :param external_path_obj: can be an attribute of an ExternalDataArrayPart
-    :param path_in_root:
-    :param root_obj:
-    :param epc:
-    :return:
-    """
-    if isinstance(external_path_obj, str):
-        # external_path_obj is maybe an attribute of an ExternalDataArrayPart, now search upper in the object
-        upper_path = path_in_root[:path_in_root.rindex(".")]
-        return get_hdf5_path_from_external_path(
-            external_path_obj=get_object_attribute(root_obj, upper_path),
-            path_in_root=upper_path,
-            root_obj=root_obj,
-            epc=epc,
-        )
-    elif type(external_path_obj).__name__ == "ExternalDataArrayPart":
-        epc_folder = epc.get_epc_file_folder()
-        h5_uri = search_attribute_matching_name(external_path_obj, "uri")
-        if h5_uri is not None and len(h5_uri) > 0:
-            return f"{epc_folder}/{h5_uri[0]}"
-    else:
-        epc_folder = epc.get_epc_file_folder()
-        hdf_proxy = search_attribute_matching_name(external_path_obj, "HdfProxy")[0]
-        if hdf_proxy is not None:
-            hdf_proxy_obj = epc.get_object_by_identifier(get_obj_identifier(hdf_proxy))
-            if hdf_proxy_obj is not None:
-                for rel in epc.additional_rels.get(get_obj_identifier(hdf_proxy_obj), []):
-                    # print(f"\trel : {rel}")
-                    if rel.type_value == EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type():
-                        return f"{epc_folder}/{rel.target}"
-    return None
-
-
-
-def get_hdf_reference(obj) ‑> List[Any] -
-
-

See :func:get_hdf_reference_with_path(). Only the value is returned, not the dot path into the object -:param obj: -:return:

-
- -Expand source code - -
def get_hdf_reference(obj) -> List[Any]:
-    """
-    See :func:`get_hdf_reference_with_path`. Only the value is returned, not the dot path into the object
-    :param obj:
-    :return:
-    """
-    return [
-        val
-        for path, val in get_hdf_reference_with_path(obj=obj)
-    ]
-
-
-
-def get_hdf_reference_with_path(obj: ) ‑> List[Tuple[str, Any]] -
-
-

See :func:search_attribute_matching_name_with_path. Search an attribute with type matching regex -"(PathInHdfFile|PathInExternalFile)".

-

:param obj: -:return: [ (Dot_Path_In_Obj, value), …]

-
- -Expand source code - -
def get_hdf_reference_with_path(obj: any) -> List[Tuple[str, Any]]:
-    """
-    See :func:`search_attribute_matching_name_with_path`. Search an attribute with type matching regex
-    "(PathInHdfFile|PathInExternalFile)".
-
-    :param obj:
-    :return: [ (Dot_Path_In_Obj, value), ...]
-    """
-    return search_attribute_matching_name_with_path(
-        obj,
-        "(PathInHdfFile|PathInExternalFile)"
-    )
-
-
-
-
-
-

Classes

-
-
-class DatasetReader -
-
-

DatasetReader()

-
- -Expand source code - -
@dataclass
-class DatasetReader:
-    def read_array(self, source: str, path_in_external_file: str) -> Optional[List[Any]]:
-        return None
-
-    def get_array_dimension(self, source: str, path_in_external_file: str) -> Optional[List[Any]]:
-        return None
-
-

Subclasses

- -

Methods

-
-
-def get_array_dimension(self, source: str, path_in_external_file: str) ‑> Optional[List[Any]] -
-
-
-
- -Expand source code - -
def get_array_dimension(self, source: str, path_in_external_file: str) -> Optional[List[Any]]:
-    return None
-
-
-
-def read_array(self, source: str, path_in_external_file: str) ‑> Optional[List[Any]] -
-
-
-
- -Expand source code - -
def read_array(self, source: str, path_in_external_file: str) -> Optional[List[Any]]:
-    return None
-
-
-
-
-
-class ETPReader -
-
-

ETPReader()

-
- -Expand source code - -
@dataclass
-class ETPReader(DatasetReader):
-    def read_array(self, obj_uri: str, path_in_external_file: str) -> Optional[List[Any]]:
-        return None
-
-    def get_array_dimension(self, source: str, path_in_external_file: str) -> Optional[List[Any]]:
-        return None
-
-

Ancestors

- -

Methods

-
-
-def get_array_dimension(self, source: str, path_in_external_file: str) ‑> Optional[List[Any]] -
-
-
-
- -Expand source code - -
def get_array_dimension(self, source: str, path_in_external_file: str) -> Optional[List[Any]]:
-    return None
-
-
-
-def read_array(self, obj_uri: str, path_in_external_file: str) ‑> Optional[List[Any]] -
-
-
-
- -Expand source code - -
def read_array(self, obj_uri: str, path_in_external_file: str) -> Optional[List[Any]]:
-    return None
-
-
-
-
-
-class HDF5FileReader -
-
-

HDF5FileReader()

-
- -Expand source code - -
@dataclass
-class HDF5FileReader(DatasetReader):
-    def read_array(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[List[Any]]:
-        with h5py.File(source, "r") as f:
-            d_group = f[path_in_external_file]
-            return d_group[()].tolist()
-
-    def get_array_dimension(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[List[Any]]:
-        with h5py.File(source, "r") as f:
-            return list(f[path_in_external_file].shape)
-
-    def extract_h5_datasets(
-            self, input_h5: Union[BytesIO, str], output_h5: Union[BytesIO, str], h5_datasets_paths: List[str]
-    ) -> None:
-        """
-        Copy all dataset from :param input_h5 matching with paths in :param h5_datasets_paths into the :param output
-        :param input_h5:
-        :param output_h5:
-        :param h5_datasets_paths:
-        :return:
-        """
-        if len(h5_datasets_paths) > 0:
-            with h5py.File(output_h5, "w") as f_dest:
-                with h5py.File(input_h5, "r") as f_src:
-                    for dataset in h5_datasets_paths:
-                        f_dest.create_dataset(dataset, data=f_src[dataset])
-
-

Ancestors

- -

Methods

-
-
-def extract_h5_datasets(self, input_h5: Union[_io.BytesIO, str], output_h5: Union[_io.BytesIO, str], h5_datasets_paths: List[str]) ‑> None -
-
-

Copy all dataset from :param input_h5 matching with paths in :param h5_datasets_paths into the :param output -:param input_h5: -:param output_h5: -:param h5_datasets_paths: -:return:

-
- -Expand source code - -
def extract_h5_datasets(
-        self, input_h5: Union[BytesIO, str], output_h5: Union[BytesIO, str], h5_datasets_paths: List[str]
-) -> None:
-    """
-    Copy all dataset from :param input_h5 matching with paths in :param h5_datasets_paths into the :param output
-    :param input_h5:
-    :param output_h5:
-    :param h5_datasets_paths:
-    :return:
-    """
-    if len(h5_datasets_paths) > 0:
-        with h5py.File(output_h5, "w") as f_dest:
-            with h5py.File(input_h5, "r") as f_src:
-                for dataset in h5_datasets_paths:
-                    f_dest.create_dataset(dataset, data=f_src[dataset])
-
-
-
-def get_array_dimension(self, source: Union[_io.BytesIO, str], path_in_external_file: str) ‑> Optional[List[Any]] -
-
-
-
- -Expand source code - -
def get_array_dimension(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[List[Any]]:
-    with h5py.File(source, "r") as f:
-        return list(f[path_in_external_file].shape)
-
-
-
-def read_array(self, source: Union[_io.BytesIO, str], path_in_external_file: str) ‑> Optional[List[Any]] -
-
-
-
- -Expand source code - -
def read_array(self, source: Union[BytesIO, str], path_in_external_file: str) -> Optional[List[Any]]:
-    with h5py.File(source, "r") as f:
-        d_group = f[path_in_external_file]
-        return d_group[()].tolist()
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/energyml-utils/docs/src/energyml/utils/data/helper.html b/energyml-utils/docs/src/energyml/utils/data/helper.html deleted file mode 100644 index 94a6a65..0000000 --- a/energyml-utils/docs/src/energyml/utils/data/helper.html +++ /dev/null @@ -1,1286 +0,0 @@ - - - - - - -src.energyml.utils.data.helper API documentation - - - - - - - - - - - -
-
-
-

Module src.energyml.utils.data.helper

-
-
-
- -Expand source code - -
# Copyright (c) 2023-2024 Geosiris.
-# SPDX-License-Identifier: Apache-2.0
-import inspect
-import sys
-from typing import Any, Optional, Callable, Literal, List, Union, Tuple
-
-from .hdf import get_hdf5_path_from_external_path, HDF5FileReader, get_hdf_reference, get_crs_obj
-from ..epc import Epc, get_obj_identifier
-from ..introspection import snake_case, get_object_attribute_no_verif, \
-    search_attribute_matching_name_with_path, search_attribute_matching_name, flatten_concatenation, \
-    search_attribute_in_upper_matching_name
-
-_ARRAY_NAMES_ = [
-    "BooleanArrayFromDiscretePropertyArray",
-    "BooleanArrayFromIndexArray",
-    "BooleanConstantArray",
-    "BooleanExternalArray",
-    "BooleanHdf5Array",
-    "BooleanXmlArray",
-    "CompoundExternalArray",
-    "DasTimeArray",
-    "DoubleConstantArray",
-    "DoubleHdf5Array",
-    "DoubleLatticeArray",
-    "ExternalDataArray",
-    "FloatingPointConstantArray",
-    "FloatingPointExternalArray",
-    "FloatingPointLatticeArray",
-    "FloatingPointXmlArray",
-    "IntegerArrayFromBooleanMaskArray",
-    "IntegerConstantArray",
-    "IntegerExternalArray",
-    "IntegerHdf5Array",
-    "IntegerLatticeArray",
-    "IntegerRangeArray",
-    "IntegerXmlArray",
-    "JaggedArray",
-    "ParametricLineArray",
-    "ParametricLineFromRepresentationLatticeArray",
-    "Point2DHdf5Array",
-    "Point3DFromRepresentationLatticeArray",
-    "Point3DHdf5Array",
-    "Point3DLatticeArray",
-    "Point3DParametricArray",
-    "Point3DZvalueArray",
-    "ResqmlJaggedArray",
-    "StringConstantArray",
-    "StringExternalArray",
-    "StringHdf5Array",
-    "StringXmlArray"
-]
-
-
-def get_array_reader_function(array_type_name: str) -> Optional[Callable]:
-    """
-    Returns the name of the potential appropriate function to read an object with type is named :param array_type_name
-    :param array_type_name: the initial type name
-    :return:
-    """
-    for name, obj in inspect.getmembers(sys.modules[__name__]):
-        if name == f"read_{snake_case(array_type_name)}":
-            return obj
-    return None
-
-
-def _array_name_mapping(array_type_name: str) -> str:
-    """
-    Transform the type name to match existing reader function
-    :param array_type_name:
-    :return:
-    """
-    array_type_name = array_type_name.replace("3D", "3d").replace("2D", "2d")
-    if array_type_name.endswith("ConstantArray"):
-        return "ConstantArray"
-    elif "External" in array_type_name or "Hdf5" in array_type_name:
-        return "ExternalArray"
-    elif array_type_name.endswith("XmlArray"):
-        return "XmlArray"
-    elif "Jagged" in array_type_name:
-        return "JaggedArray"
-    elif "Lattice" in array_type_name:
-        if "Integer" in array_type_name or "Double" in array_type_name:
-            return "int_double_lattice_array"
-    return array_type_name
-
-
-def read_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-) -> List[Any]:
-    """
-    Read an array and return a list. The array is read depending on its type. see. :py:func:`energyml.utils.data.helper.get_supported_array`
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    if isinstance(energyml_array, list):
-        return energyml_array
-    array_type_name = _array_name_mapping(type(energyml_array).__name__)
-
-    reader_func = get_array_reader_function(array_type_name)
-    if reader_func is not None:
-        return reader_func(
-            energyml_array=energyml_array,
-            root_obj=root_obj,
-            path_in_root=path_in_root,
-            epc=epc,
-        )
-    else:
-        print(f"Type {array_type_name} is not supported: function read_{snake_case(array_type_name)} not found")
-        raise Exception(f"Type {array_type_name} is not supported\n\t{energyml_array}: \n\tfunction read_{snake_case(array_type_name)} not found")
-
-
-def get_supported_array() -> List[str]:
-    """
-    Return a list of the supported arrays for the use of :py:func:`energyml.utils.data.helper.read_array` function.
-    :return:
-    """
-    return [x for x in _ARRAY_NAMES_ if get_array_reader_function(_array_name_mapping(x)) is not None]
-
-
-def get_not_supported_array():
-    """
-    Return a list of the NOT supported arrays for the use of :py:func:`energyml.utils.data.helper.read_array` function.
-    :return:
-    """
-    return [x for x in _ARRAY_NAMES_ if get_array_reader_function(_array_name_mapping(x)) is None]
-
-
-def read_constant_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-) -> List[Any]:
-    """
-    Read a constant array ( BooleanConstantArray, DoubleConstantArray, FloatingPointConstantArray, IntegerConstantArray ...)
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    # print(f"Reading constant array\n\t{energyml_array}")
-
-    value = get_object_attribute_no_verif(energyml_array, "value")
-    count = get_object_attribute_no_verif(energyml_array, "count")
-
-    # print(f"\tValue : {[value for i in range(0, count)]}")
-
-    return [value for i in range(0, count)]
-
-
-def read_xml_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-) -> List[Any]:
-    """
-    Read a xml array ( BooleanXmlArray, FloatingPointXmlArray, IntegerXmlArray, StringXmlArray ...)
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    values = get_object_attribute_no_verif(energyml_array, "values")
-    # count = get_object_attribute_no_verif(energyml_array, "count_per_value")
-    return values
-
-
-def read_jagged_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-) -> List[Any]:
-    """
-    Read a jagged array
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    elements = read_array(
-        energyml_array=get_object_attribute_no_verif(energyml_array, "elements"),
-        root_obj=root_obj,
-        path_in_root=path_in_root + ".elements",
-        epc=epc,
-    )
-    cumulative_length = read_array(
-        energyml_array=read_array(get_object_attribute_no_verif(energyml_array, "cumulative_length")),
-        root_obj=root_obj,
-        path_in_root=path_in_root + ".cumulative_length",
-        epc=epc,
-    )
-
-    res = []
-    previous = 0
-    for cl in cumulative_length:
-        res.append(elements[previous: cl])
-        previous = cl
-    return res
-
-
-def read_external_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-) -> List[Any]:
-    """
-    Read an external array (BooleanExternalArray, BooleanHdf5Array, DoubleHdf5Array, IntegerHdf5Array, StringExternalArray ...)
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    hdf5_path = get_hdf5_path_from_external_path(
-                external_path_obj=energyml_array,
-                path_in_root=path_in_root,
-                root_obj=root_obj,
-                epc=epc,
-    )
-    h5_reader = HDF5FileReader()
-    path_in_external = get_hdf_reference(energyml_array)[0]
-
-    result_array = h5_reader.read_array(hdf5_path, path_in_external)
-
-    if path_in_root.lower().endswith("points") and len(result_array) > 0 and len(result_array[0]) == 3:
-        crs = get_crs_obj(
-            context_obj=energyml_array,
-            path_in_root=path_in_root,
-            root_obj=root_obj,
-            epc=epc,
-        )
-        zincreasing_downward = is_z_reversed(crs)
-
-        if zincreasing_downward:
-            result_array = list(map(lambda p: [p[0], p[1], -p[2]], result_array))
-
-    return result_array
-
-
-def read_int_double_lattice_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-):
-    """
-    Read DoubleLatticeArray or IntegerLatticeArray.
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    start_value = get_object_attribute_no_verif(energyml_array, "start_value")
-    offset = get_object_attribute_no_verif(energyml_array, "offset")
-
-    result = []
-
-    # if len(offset) == 1:
-    #     pass
-    # elif len(offset) == 2:
-    #     pass
-    # else:
-    raise Exception(f"{type(energyml_array)} read with an offset of length {len(offset)} is not supported")
-
-    # return result
-
-
-def _point_as_array(point: Any) -> List:
-    """
-    Transform a point that has "coordinate1", "coordinate2", "coordinate3" as attributes into a list.
-    :param point:
-    :return:
-    """
-    return [
-        get_object_attribute_no_verif(point, "coordinate1"),
-        get_object_attribute_no_verif(point, "coordinate2"),
-        get_object_attribute_no_verif(point, "coordinate3"),
-    ]
-
-
-def prod_n_tab(val: Union[float, int, str], tab: List[Union[float, int, str]]):
-    """
-    Multiply every value of the list 'tab' by the constant 'val'
-    :param val:
-    :param tab:
-    :return:
-    """
-    return list(map(lambda x: x*val, tab))
-
-
-def sum_lists(l1: List, l2: List):
-    """
-    Sums 2 lists values.
-
-    Example:
-        [1,1,1] and [2,2,3,6] gives : [3,3,4,6]
-
-    :param l1:
-    :param l2:
-    :return:
-    """
-    return [l1[i] + l2[i] for i in range(min(len(l1), len(l2)))]+max(l1, l2, key=len)[min(len(l1), len(l2)):]
-
-
-def read_point3d_zvalue_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-):
-    """
-    Read a Point3D2ValueArray
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    supporting_geometry = get_object_attribute_no_verif(energyml_array, "supporting_geometry")
-    sup_geom_array = read_array(
-        energyml_array=supporting_geometry,
-        root_obj=root_obj,
-        path_in_root=path_in_root + ".SupportingGeometry",
-        epc=epc,
-    )
-
-    zvalues = get_object_attribute_no_verif(energyml_array, "zvalues")
-    zvalues_array = flatten_concatenation(read_array(
-        energyml_array=zvalues,
-        root_obj=root_obj,
-        path_in_root=path_in_root + ".ZValues",
-        epc=epc,
-    ))
-
-    count = 0
-
-    for i in range(len(sup_geom_array)):
-        try:
-            sup_geom_array[i][2] = zvalues_array[i]
-        except Exception as e:
-            if count == 0:
-                print(e, f": {i} is out of bound of {len(zvalues_array)}")
-                count = count + 1
-
-    return sup_geom_array
-
-
-def read_point3d_from_representation_lattice_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-):
-    """
-    Read a Point3DFromRepresentationLatticeArray.
-
-    Note: Only works for Grid2DRepresentation.
-
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    supporting_rep_identifier = get_obj_identifier(get_object_attribute_no_verif(energyml_array, "supporting_representation"))
-    print(f"energyml_array : {energyml_array}\n\t{supporting_rep_identifier}")
-    supporting_rep = epc.get_object_by_identifier(supporting_rep_identifier)
-
-    # TODO chercher un pattern \.*patch\.*.[d]+ pour trouver le numero du patch dans le path_in_root puis lire le patch
-    # print(f"path_in_root {path_in_root}")
-
-    result = []
-    if "grid2d" in str(type(supporting_rep)).lower():
-        patch_path, patch = search_attribute_matching_name_with_path(supporting_rep, "Grid2dPatch")[0]
-        points = read_grid2d_patch(
-            patch=patch,
-            grid2d=supporting_rep,
-            path_in_root=patch_path,
-            epc=epc,
-        )
-        # TODO: take the points by there indices from the NodeIndicesOnSupportingRepresentation
-        result = points
-
-    else:
-        raise Exception(f"Not supported type {type(energyml_array)} for object {type(root_obj)}")
-    # pour trouver les infos qu'il faut
-    return result
-
-
-def read_grid2d_patch(
-        patch: Any,
-        grid2d: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-) -> List:
-    points_path, points_obj = search_attribute_matching_name_with_path(patch, "Geometry.Points")[0]
-
-    return read_array(
-        energyml_array=points_obj,
-        root_obj=grid2d,
-        path_in_root=path_in_root + points_path,
-        epc=epc,
-    )
-
-
-def read_point3d_lattice_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-) -> List:
-    """
-    Read a Point3DLatticeArray.
-
-    Note: If a CRS is found and its 'ZIncreasingDownward' is set to true or its
-
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    result = []
-    origin = _point_as_array(get_object_attribute_no_verif(energyml_array, "origin"))
-    offset = get_object_attribute_no_verif(energyml_array, "offset")
-
-    if len(offset) == 2:
-        slowest = offset[0]
-        fastest = offset[1]
-
-        crs_sa_count = search_attribute_in_upper_matching_name(
-            obj=energyml_array,
-            name_rgx="SlowestAxisCount",
-            root_obj=root_obj,
-            current_path=path_in_root,
-        )
-
-        crs_fa_count = search_attribute_in_upper_matching_name(
-            obj=energyml_array,
-            name_rgx="FastestAxisCount",
-            root_obj=root_obj,
-            current_path=path_in_root,
-        )
-
-        crs = get_crs_obj(
-            context_obj=energyml_array,
-            path_in_root=path_in_root,
-            root_obj=root_obj,
-            epc=epc,
-        )
-        zincreasing_downward = is_z_reversed(crs)
-
-        slowest_vec = _point_as_array(get_object_attribute_no_verif(slowest, "offset"))
-        slowest_spacing = read_array(get_object_attribute_no_verif(slowest, "spacing"))
-        slowest_table = list(map(lambda x: prod_n_tab(x, slowest_vec), slowest_spacing))
-
-        fastest_vec = _point_as_array(get_object_attribute_no_verif(fastest, "offset"))
-        fastest_spacing = read_array(get_object_attribute_no_verif(fastest, "spacing"))
-        fastest_table = list(map(lambda x: prod_n_tab(x, fastest_vec), fastest_spacing))
-
-        slowest_size = len(slowest_table)
-        fastest_size = len(fastest_table)
-
-        if len(crs_sa_count) > 0 and len(crs_fa_count) and crs_sa_count[0] == fastest_size:
-            print("reversing order")
-            # if offset were given in the wrong order
-            tmp_table = slowest_table
-            slowest_table = fastest_table
-            fastest_table = tmp_table
-
-            tmp_size = slowest_size
-            slowest_size = fastest_size
-            fastest_size = tmp_size
-
-        for i in range(slowest_size):
-            for j in range(fastest_size):
-                previous_value = origin
-                # to avoid a sum of the parts of the array at each iteration, I take the previous value in the same line
-                # number i and add the fastest_table[j] value
-
-                if j > 0:
-                    if i > 0:
-                        line_idx = i * fastest_size  # numero de ligne
-                        previous_value = result[line_idx + j - 1]
-                    else:
-                        previous_value = result[j - 1]
-                    if zincreasing_downward:
-                        result.append(sum_lists(previous_value, slowest_table[i - 1]))
-                    else:
-                        result.append(sum_lists(previous_value, fastest_table[j - 1]))
-                else:
-                    if i > 0:
-                        prev_line_idx = (i - 1) * fastest_size  # numero de ligne precedent
-                        previous_value = result[prev_line_idx]
-                        if zincreasing_downward:
-                            result.append(sum_lists(previous_value, fastest_table[j - 1]))
-                        else:
-                            result.append(sum_lists(previous_value, slowest_table[i - 1]))
-                    else:
-                        result.append(previous_value)
-    else:
-        raise Exception(f"{type(energyml_array)} read with an offset of length {len(offset)} is not supported")
-
-    return result
-
-
-def is_z_reversed(crs: Optional[Any]) -> bool:
-    """
-    Returns True if the Z axe is reverse (ZIncreasingDownward=='True' or VerticalAxis.Direction=='down')
-    :param crs:
-    :return: By default, False is returned (if 'crs' is None)
-    """
-    reverse_z_values = False
-    if crs is not None:
-        # resqml 201
-        zincreasing_downward = search_attribute_matching_name(crs, "ZIncreasingDownward")
-        if len(zincreasing_downward) > 0:
-            reverse_z_values = zincreasing_downward[0]
-
-        # resqml >= 22
-        vert_axis = search_attribute_matching_name(crs, "VerticalAxis.Direction")
-        if len(vert_axis) > 0:
-            reverse_z_values = vert_axis[0].lower() == "down"
-
-    return reverse_z_values
-
-
-# def read_boolean_constant_array(
-#         energyml_array: Any,
-#         root_obj: Optional[Any] = None,
-#         path_in_root: Optional[str] = None,
-#         epc: Optional[Epc] = None
-# ):
-#     print(energyml_array)
-
-
-
-
-
-
-
-

Functions

-
-
-def get_array_reader_function(array_type_name: str) ‑> Optional[Callable] -
-
-

Returns the name of the potential appropriate function to read an object with type is named :param array_type_name -:param array_type_name: the initial type name -:return:

-
- -Expand source code - -
def get_array_reader_function(array_type_name: str) -> Optional[Callable]:
-    """
-    Returns the name of the potential appropriate function to read an object with type is named :param array_type_name
-    :param array_type_name: the initial type name
-    :return:
-    """
-    for name, obj in inspect.getmembers(sys.modules[__name__]):
-        if name == f"read_{snake_case(array_type_name)}":
-            return obj
-    return None
-
-
-
-def get_not_supported_array() -
-
-

Return a list of the NOT supported arrays for the use of :py:func:energyml.utils.data.helper.read_array function. -:return:

-
- -Expand source code - -
def get_not_supported_array():
-    """
-    Return a list of the NOT supported arrays for the use of :py:func:`energyml.utils.data.helper.read_array` function.
-    :return:
-    """
-    return [x for x in _ARRAY_NAMES_ if get_array_reader_function(_array_name_mapping(x)) is None]
-
-
-
-def get_supported_array() ‑> List[str] -
-
-

Return a list of the supported arrays for the use of :py:func:energyml.utils.data.helper.read_array function. -:return:

-
- -Expand source code - -
def get_supported_array() -> List[str]:
-    """
-    Return a list of the supported arrays for the use of :py:func:`energyml.utils.data.helper.read_array` function.
-    :return:
-    """
-    return [x for x in _ARRAY_NAMES_ if get_array_reader_function(_array_name_mapping(x)) is not None]
-
-
-
-def is_z_reversed(crs: Optional[Any]) ‑> bool -
-
-

Returns True if the Z axe is reverse (ZIncreasingDownward=='True' or VerticalAxis.Direction=='down') -:param crs: -:return: By default, False is returned (if 'crs' is None)

-
- -Expand source code - -
def is_z_reversed(crs: Optional[Any]) -> bool:
-    """
-    Returns True if the Z axe is reverse (ZIncreasingDownward=='True' or VerticalAxis.Direction=='down')
-    :param crs:
-    :return: By default, False is returned (if 'crs' is None)
-    """
-    reverse_z_values = False
-    if crs is not None:
-        # resqml 201
-        zincreasing_downward = search_attribute_matching_name(crs, "ZIncreasingDownward")
-        if len(zincreasing_downward) > 0:
-            reverse_z_values = zincreasing_downward[0]
-
-        # resqml >= 22
-        vert_axis = search_attribute_matching_name(crs, "VerticalAxis.Direction")
-        if len(vert_axis) > 0:
-            reverse_z_values = vert_axis[0].lower() == "down"
-
-    return reverse_z_values
-
-
-
-def prod_n_tab(val: Union[float, int, str], tab: List[Union[float, int, str]]) -
-
-

Multiply every value of the list 'tab' by the constant 'val' -:param val: -:param tab: -:return:

-
- -Expand source code - -
def prod_n_tab(val: Union[float, int, str], tab: List[Union[float, int, str]]):
-    """
-    Multiply every value of the list 'tab' by the constant 'val'
-    :param val:
-    :param tab:
-    :return:
-    """
-    return list(map(lambda x: x*val, tab))
-
-
-
-def read_array(energyml_array: Any, root_obj: Optional[Any] = None, path_in_root: Optional[str] = None, epc: Optional[Epc] = None) ‑> List[Any] -
-
-

Read an array and return a list. The array is read depending on its type. see. :py:func:energyml.utils.data.helper.get_supported_array -:param energyml_array: -:param root_obj: -:param path_in_root: -:param epc: -:return:

-
- -Expand source code - -
def read_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-) -> List[Any]:
-    """
-    Read an array and return a list. The array is read depending on its type. see. :py:func:`energyml.utils.data.helper.get_supported_array`
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    if isinstance(energyml_array, list):
-        return energyml_array
-    array_type_name = _array_name_mapping(type(energyml_array).__name__)
-
-    reader_func = get_array_reader_function(array_type_name)
-    if reader_func is not None:
-        return reader_func(
-            energyml_array=energyml_array,
-            root_obj=root_obj,
-            path_in_root=path_in_root,
-            epc=epc,
-        )
-    else:
-        print(f"Type {array_type_name} is not supported: function read_{snake_case(array_type_name)} not found")
-        raise Exception(f"Type {array_type_name} is not supported\n\t{energyml_array}: \n\tfunction read_{snake_case(array_type_name)} not found")
-
-
-
-def read_constant_array(energyml_array: Any, root_obj: Optional[Any] = None, path_in_root: Optional[str] = None, epc: Optional[Epc] = None) ‑> List[Any] -
-
-

Read a constant array ( BooleanConstantArray, DoubleConstantArray, FloatingPointConstantArray, IntegerConstantArray …) -:param energyml_array: -:param root_obj: -:param path_in_root: -:param epc: -:return:

-
- -Expand source code - -
def read_constant_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-) -> List[Any]:
-    """
-    Read a constant array ( BooleanConstantArray, DoubleConstantArray, FloatingPointConstantArray, IntegerConstantArray ...)
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    # print(f"Reading constant array\n\t{energyml_array}")
-
-    value = get_object_attribute_no_verif(energyml_array, "value")
-    count = get_object_attribute_no_verif(energyml_array, "count")
-
-    # print(f"\tValue : {[value for i in range(0, count)]}")
-
-    return [value for i in range(0, count)]
-
-
-
-def read_external_array(energyml_array: Any, root_obj: Optional[Any] = None, path_in_root: Optional[str] = None, epc: Optional[Epc] = None) ‑> List[Any] -
-
-

Read an external array (BooleanExternalArray, BooleanHdf5Array, DoubleHdf5Array, IntegerHdf5Array, StringExternalArray …) -:param energyml_array: -:param root_obj: -:param path_in_root: -:param epc: -:return:

-
- -Expand source code - -
def read_external_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-) -> List[Any]:
-    """
-    Read an external array (BooleanExternalArray, BooleanHdf5Array, DoubleHdf5Array, IntegerHdf5Array, StringExternalArray ...)
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    hdf5_path = get_hdf5_path_from_external_path(
-                external_path_obj=energyml_array,
-                path_in_root=path_in_root,
-                root_obj=root_obj,
-                epc=epc,
-    )
-    h5_reader = HDF5FileReader()
-    path_in_external = get_hdf_reference(energyml_array)[0]
-
-    result_array = h5_reader.read_array(hdf5_path, path_in_external)
-
-    if path_in_root.lower().endswith("points") and len(result_array) > 0 and len(result_array[0]) == 3:
-        crs = get_crs_obj(
-            context_obj=energyml_array,
-            path_in_root=path_in_root,
-            root_obj=root_obj,
-            epc=epc,
-        )
-        zincreasing_downward = is_z_reversed(crs)
-
-        if zincreasing_downward:
-            result_array = list(map(lambda p: [p[0], p[1], -p[2]], result_array))
-
-    return result_array
-
-
-
-def read_grid2d_patch(patch: Any, grid2d: Optional[Any] = None, path_in_root: Optional[str] = None, epc: Optional[Epc] = None) ‑> List -
-
-
-
- -Expand source code - -
def read_grid2d_patch(
-        patch: Any,
-        grid2d: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-) -> List:
-    points_path, points_obj = search_attribute_matching_name_with_path(patch, "Geometry.Points")[0]
-
-    return read_array(
-        energyml_array=points_obj,
-        root_obj=grid2d,
-        path_in_root=path_in_root + points_path,
-        epc=epc,
-    )
-
-
-
-def read_int_double_lattice_array(energyml_array: Any, root_obj: Optional[Any] = None, path_in_root: Optional[str] = None, epc: Optional[Epc] = None) -
-
-

Read DoubleLatticeArray or IntegerLatticeArray. -:param energyml_array: -:param root_obj: -:param path_in_root: -:param epc: -:return:

-
- -Expand source code - -
def read_int_double_lattice_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-):
-    """
-    Read DoubleLatticeArray or IntegerLatticeArray.
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    start_value = get_object_attribute_no_verif(energyml_array, "start_value")
-    offset = get_object_attribute_no_verif(energyml_array, "offset")
-
-    result = []
-
-    # if len(offset) == 1:
-    #     pass
-    # elif len(offset) == 2:
-    #     pass
-    # else:
-    raise Exception(f"{type(energyml_array)} read with an offset of length {len(offset)} is not supported")
-
-    # return result
-
-
-
-def read_jagged_array(energyml_array: Any, root_obj: Optional[Any] = None, path_in_root: Optional[str] = None, epc: Optional[Epc] = None) ‑> List[Any] -
-
-

Read a jagged array -:param energyml_array: -:param root_obj: -:param path_in_root: -:param epc: -:return:

-
- -Expand source code - -
def read_jagged_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-) -> List[Any]:
-    """
-    Read a jagged array
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    elements = read_array(
-        energyml_array=get_object_attribute_no_verif(energyml_array, "elements"),
-        root_obj=root_obj,
-        path_in_root=path_in_root + ".elements",
-        epc=epc,
-    )
-    cumulative_length = read_array(
-        energyml_array=read_array(get_object_attribute_no_verif(energyml_array, "cumulative_length")),
-        root_obj=root_obj,
-        path_in_root=path_in_root + ".cumulative_length",
-        epc=epc,
-    )
-
-    res = []
-    previous = 0
-    for cl in cumulative_length:
-        res.append(elements[previous: cl])
-        previous = cl
-    return res
-
-
-
-def read_point3d_from_representation_lattice_array(energyml_array: Any, root_obj: Optional[Any] = None, path_in_root: Optional[str] = None, epc: Optional[Epc] = None) -
-
-

Read a Point3DFromRepresentationLatticeArray.

-

Note: Only works for Grid2DRepresentation.

-

:param energyml_array: -:param root_obj: -:param path_in_root: -:param epc: -:return:

-
- -Expand source code - -
def read_point3d_from_representation_lattice_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-):
-    """
-    Read a Point3DFromRepresentationLatticeArray.
-
-    Note: Only works for Grid2DRepresentation.
-
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    supporting_rep_identifier = get_obj_identifier(get_object_attribute_no_verif(energyml_array, "supporting_representation"))
-    print(f"energyml_array : {energyml_array}\n\t{supporting_rep_identifier}")
-    supporting_rep = epc.get_object_by_identifier(supporting_rep_identifier)
-
-    # TODO chercher un pattern \.*patch\.*.[d]+ pour trouver le numero du patch dans le path_in_root puis lire le patch
-    # print(f"path_in_root {path_in_root}")
-
-    result = []
-    if "grid2d" in str(type(supporting_rep)).lower():
-        patch_path, patch = search_attribute_matching_name_with_path(supporting_rep, "Grid2dPatch")[0]
-        points = read_grid2d_patch(
-            patch=patch,
-            grid2d=supporting_rep,
-            path_in_root=patch_path,
-            epc=epc,
-        )
-        # TODO: take the points by there indices from the NodeIndicesOnSupportingRepresentation
-        result = points
-
-    else:
-        raise Exception(f"Not supported type {type(energyml_array)} for object {type(root_obj)}")
-    # pour trouver les infos qu'il faut
-    return result
-
-
-
-def read_point3d_lattice_array(energyml_array: Any, root_obj: Optional[Any] = None, path_in_root: Optional[str] = None, epc: Optional[Epc] = None) ‑> List -
-
-

Read a Point3DLatticeArray.

-

Note: If a CRS is found and its 'ZIncreasingDownward' is set to true or its

-

:param energyml_array: -:param root_obj: -:param path_in_root: -:param epc: -:return:

-
- -Expand source code - -
def read_point3d_lattice_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-) -> List:
-    """
-    Read a Point3DLatticeArray.
-
-    Note: If a CRS is found and its 'ZIncreasingDownward' is set to true or its
-
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    result = []
-    origin = _point_as_array(get_object_attribute_no_verif(energyml_array, "origin"))
-    offset = get_object_attribute_no_verif(energyml_array, "offset")
-
-    if len(offset) == 2:
-        slowest = offset[0]
-        fastest = offset[1]
-
-        crs_sa_count = search_attribute_in_upper_matching_name(
-            obj=energyml_array,
-            name_rgx="SlowestAxisCount",
-            root_obj=root_obj,
-            current_path=path_in_root,
-        )
-
-        crs_fa_count = search_attribute_in_upper_matching_name(
-            obj=energyml_array,
-            name_rgx="FastestAxisCount",
-            root_obj=root_obj,
-            current_path=path_in_root,
-        )
-
-        crs = get_crs_obj(
-            context_obj=energyml_array,
-            path_in_root=path_in_root,
-            root_obj=root_obj,
-            epc=epc,
-        )
-        zincreasing_downward = is_z_reversed(crs)
-
-        slowest_vec = _point_as_array(get_object_attribute_no_verif(slowest, "offset"))
-        slowest_spacing = read_array(get_object_attribute_no_verif(slowest, "spacing"))
-        slowest_table = list(map(lambda x: prod_n_tab(x, slowest_vec), slowest_spacing))
-
-        fastest_vec = _point_as_array(get_object_attribute_no_verif(fastest, "offset"))
-        fastest_spacing = read_array(get_object_attribute_no_verif(fastest, "spacing"))
-        fastest_table = list(map(lambda x: prod_n_tab(x, fastest_vec), fastest_spacing))
-
-        slowest_size = len(slowest_table)
-        fastest_size = len(fastest_table)
-
-        if len(crs_sa_count) > 0 and len(crs_fa_count) and crs_sa_count[0] == fastest_size:
-            print("reversing order")
-            # if offset were given in the wrong order
-            tmp_table = slowest_table
-            slowest_table = fastest_table
-            fastest_table = tmp_table
-
-            tmp_size = slowest_size
-            slowest_size = fastest_size
-            fastest_size = tmp_size
-
-        for i in range(slowest_size):
-            for j in range(fastest_size):
-                previous_value = origin
-                # to avoid a sum of the parts of the array at each iteration, I take the previous value in the same line
-                # number i and add the fastest_table[j] value
-
-                if j > 0:
-                    if i > 0:
-                        line_idx = i * fastest_size  # numero de ligne
-                        previous_value = result[line_idx + j - 1]
-                    else:
-                        previous_value = result[j - 1]
-                    if zincreasing_downward:
-                        result.append(sum_lists(previous_value, slowest_table[i - 1]))
-                    else:
-                        result.append(sum_lists(previous_value, fastest_table[j - 1]))
-                else:
-                    if i > 0:
-                        prev_line_idx = (i - 1) * fastest_size  # numero de ligne precedent
-                        previous_value = result[prev_line_idx]
-                        if zincreasing_downward:
-                            result.append(sum_lists(previous_value, fastest_table[j - 1]))
-                        else:
-                            result.append(sum_lists(previous_value, slowest_table[i - 1]))
-                    else:
-                        result.append(previous_value)
-    else:
-        raise Exception(f"{type(energyml_array)} read with an offset of length {len(offset)} is not supported")
-
-    return result
-
-
-
-def read_point3d_zvalue_array(energyml_array: Any, root_obj: Optional[Any] = None, path_in_root: Optional[str] = None, epc: Optional[Epc] = None) -
-
-

Read a Point3D2ValueArray -:param energyml_array: -:param root_obj: -:param path_in_root: -:param epc: -:return:

-
- -Expand source code - -
def read_point3d_zvalue_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-):
-    """
-    Read a Point3D2ValueArray
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    supporting_geometry = get_object_attribute_no_verif(energyml_array, "supporting_geometry")
-    sup_geom_array = read_array(
-        energyml_array=supporting_geometry,
-        root_obj=root_obj,
-        path_in_root=path_in_root + ".SupportingGeometry",
-        epc=epc,
-    )
-
-    zvalues = get_object_attribute_no_verif(energyml_array, "zvalues")
-    zvalues_array = flatten_concatenation(read_array(
-        energyml_array=zvalues,
-        root_obj=root_obj,
-        path_in_root=path_in_root + ".ZValues",
-        epc=epc,
-    ))
-
-    count = 0
-
-    for i in range(len(sup_geom_array)):
-        try:
-            sup_geom_array[i][2] = zvalues_array[i]
-        except Exception as e:
-            if count == 0:
-                print(e, f": {i} is out of bound of {len(zvalues_array)}")
-                count = count + 1
-
-    return sup_geom_array
-
-
-
-def read_xml_array(energyml_array: Any, root_obj: Optional[Any] = None, path_in_root: Optional[str] = None, epc: Optional[Epc] = None) ‑> List[Any] -
-
-

Read a xml array ( BooleanXmlArray, FloatingPointXmlArray, IntegerXmlArray, StringXmlArray …) -:param energyml_array: -:param root_obj: -:param path_in_root: -:param epc: -:return:

-
- -Expand source code - -
def read_xml_array(
-        energyml_array: Any,
-        root_obj: Optional[Any] = None,
-        path_in_root: Optional[str] = None,
-        epc: Optional[Epc] = None
-) -> List[Any]:
-    """
-    Read a xml array ( BooleanXmlArray, FloatingPointXmlArray, IntegerXmlArray, StringXmlArray ...)
-    :param energyml_array:
-    :param root_obj:
-    :param path_in_root:
-    :param epc:
-    :return:
-    """
-    values = get_object_attribute_no_verif(energyml_array, "values")
-    # count = get_object_attribute_no_verif(energyml_array, "count_per_value")
-    return values
-
-
-
-def sum_lists(l1: List, l2: List) -
-
-

Sums 2 lists values.

-

Example

-

[1,1,1] and [2,2,3,6] gives : [3,3,4,6]

-

:param l1: -:param l2: -:return:

-
- -Expand source code - -
def sum_lists(l1: List, l2: List):
-    """
-    Sums 2 lists values.
-
-    Example:
-        [1,1,1] and [2,2,3,6] gives : [3,3,4,6]
-
-    :param l1:
-    :param l2:
-    :return:
-    """
-    return [l1[i] + l2[i] for i in range(min(len(l1), len(l2)))]+max(l1, l2, key=len)[min(len(l1), len(l2)):]
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/energyml-utils/docs/src/energyml/utils/data/index.html b/energyml-utils/docs/src/energyml/utils/data/index.html deleted file mode 100644 index 1fb08b3..0000000 --- a/energyml-utils/docs/src/energyml/utils/data/index.html +++ /dev/null @@ -1,91 +0,0 @@ - - - - - - -src.energyml.utils.data API documentation - - - - - - - - - - - -
-
-
-

Module src.energyml.utils.data

-
-
-

The data module.

-

Contains functions to help the read of specific entities like Grid2DRepresentation, TriangulatedSetRepresentation etc. -It also contains functions to export data into OFF/OBJ format.

-
- -Expand source code - -
# Copyright (c) 2023-2024 Geosiris.
-# SPDX-License-Identifier: Apache-2.0
-"""
-The data module.
-
-Contains functions to help the read of specific entities like Grid2DRepresentation, TriangulatedSetRepresentation etc.
-It also contains functions to export data into OFF/OBJ format.
-"""
-
-
-
-

Sub-modules

-
-
src.energyml.utils.data.hdf
-
-
-
-
src.energyml.utils.data.helper
-
-
-
-
src.energyml.utils.data.mesh
-
-
-
-
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/energyml-utils/docs/src/energyml/utils/data/mesh.html b/energyml-utils/docs/src/energyml/utils/data/mesh.html deleted file mode 100644 index 6ae6e4b..0000000 --- a/energyml-utils/docs/src/energyml/utils/data/mesh.html +++ /dev/null @@ -1,1463 +0,0 @@ - - - - - - -src.energyml.utils.data.mesh API documentation - - - - - - - - - - - -
-
-
-

Module src.energyml.utils.data.mesh

-
-
-
- -Expand source code - -
# Copyright (c) 2023-2024 Geosiris.
-# SPDX-License-Identifier: Apache-2.0
-import inspect
-import re
-import sys
-from dataclasses import dataclass, field
-from io import BytesIO
-from typing import List, Optional, Any, Callable
-
-from .hdf import get_hdf_reference_with_path, \
-    get_hdf5_path_from_external_path, HDF5FileReader, get_crs_obj
-from .helper import read_array, read_grid2d_patch, is_z_reversed
-from ..epc import Epc, get_obj_identifier
-from ..introspection import search_attribute_matching_name, \
-    search_attribute_matching_type_with_path, \
-    search_attribute_matching_name_with_path, snake_case
-
-_FILE_HEADER: bytes = b"# file exported by energyml-utils python module (Geosiris)\n"
-
-Point = list[float]
-
-
-@dataclass
-class AbstractMesh:
-    energyml_object: Any = field(
-        default=None
-    )
-
-    crs_object: Any = field(
-        default=None
-    )
-
-    point_list: List[Point] = field(
-        default_factory=list,
-    )
-
-    identifier: str = field(
-        default=None,
-    )
-
-    def export_off(self, out: BytesIO) -> None:
-        pass
-
-    def get_nb_edges(self) -> int:
-        return 0
-
-    def get_nb_faces(self) -> int:
-        return 0
-
-    def get_indices(self) -> List[List[int]]:
-        return []
-
-
-@dataclass
-class PointSetMesh(AbstractMesh):
-    pass
-
-
-@dataclass
-class PolylineSetMesh(AbstractMesh):
-    line_indices: List[List[int]] = field(
-        default_factory=list,
-    )
-
-    def get_nb_edges(self) -> int:
-        return sum(list(map(lambda li: len(li) - 1, self.line_indices)))
-
-    def get_nb_faces(self) -> int:
-        return 0
-
-    def get_indices(self) -> List[List[int]]:
-        return self.line_indices
-
-
-@dataclass
-class SurfaceMesh(AbstractMesh):
-    faces_indices: List[List[int]] = field(
-        default_factory=list,
-    )
-
-    def get_nb_edges(self) -> int:
-        return sum(list(map(lambda li: len(li) - 1, self.faces_indices)))
-
-    def get_nb_faces(self) -> int:
-        return len(self.faces_indices)
-
-    def get_indices(self) -> List[List[int]]:
-        return self.faces_indices
-
-
-def get_mesh_reader_function(mesh_type_name: str) -> Optional[Callable]:
-    """
-    Returns the name of the potential appropriate function to read an object with type is named mesh_type_name
-    :param mesh_type_name: the initial type name
-    :return:
-    """
-    for name, obj in inspect.getmembers(sys.modules[__name__]):
-        if name == f"read_{snake_case(mesh_type_name)}":
-            return obj
-    return None
-
-
-def _mesh_name_mapping(array_type_name: str) -> str:
-    """
-    Transform the type name to match existing reader function
-    :param array_type_name:
-    :return:
-    """
-    array_type_name = array_type_name.replace("3D", "3d").replace("2D", "2d")
-    array_type_name = re.sub("^[Oo]bj([A-Z])", r"\1", array_type_name)
-    return array_type_name
-
-
-def read_mesh_object(
-        energyml_object: Any,
-        epc: Optional[Epc] = None
-) -> List[AbstractMesh]:
-    """
-    Read and "meshable" object. If :param:`energyml_object` is not supported, an exception will be raised.
-    :param energyml_object:
-    :param epc:
-    :return:
-    """
-    if isinstance(energyml_object, list):
-        return energyml_object
-    array_type_name = _mesh_name_mapping(type(energyml_object).__name__)
-
-    reader_func = get_mesh_reader_function(array_type_name)
-    if reader_func is not None:
-        return reader_func(
-            energyml_object=energyml_object,
-            epc=epc,
-        )
-    else:
-        print(f"Type {array_type_name} is not supported: function read_{snake_case(array_type_name)} not found")
-        raise Exception(f"Type {array_type_name} is not supported\n\t{energyml_object}: \n\tfunction read_{snake_case(array_type_name)} not found")
-
-
-def read_point_set_representation(energyml_object: Any, epc: Epc) -> List[PointSetMesh]:
-    # pt_geoms = search_attribute_matching_type(point_set, "AbstractGeometry")
-    h5_reader = HDF5FileReader()
-
-    meshes = []
-    for refer_path, refer_value in get_hdf_reference_with_path(energyml_object):
-        try:
-            hdf5_path = get_hdf5_path_from_external_path(
-                external_path_obj=refer_value,
-                path_in_root=refer_path,
-                root_obj=energyml_object,
-                epc=epc,
-            )
-            crs = get_crs_obj(
-                context_obj=refer_value,
-                path_in_root=refer_path,
-                root_obj=energyml_object,
-                epc=epc,
-            )
-            if hdf5_path is not None:
-                print(f"Reading h5 file : {hdf5_path}")
-                meshes.append(PointSetMesh(
-                    identifier=refer_value,
-                    energyml_object=energyml_object,
-                    crs_object=crs,
-                    point_list=h5_reader.read_array(hdf5_path, refer_value)
-                ))
-        except Exception as e:
-            print(f"Error with path {refer_path} -- {energyml_object}")
-            raise e
-    return meshes
-
-
-def read_polyline_set_representation(energyml_object: Any, epc: Epc) -> List[PointSetMesh]:
-    # pt_geoms = search_attribute_matching_type(point_set, "AbstractGeometry")
-    h5_reader = HDF5FileReader()
-
-    meshes = []
-
-    patch_idx = 0
-    for path_path_in_obj, patch in search_attribute_matching_name_with_path(energyml_object, "LinePatch"):
-        print(f"patch {patch}")
-        geometry_path_in_obj, geometry = search_attribute_matching_name_with_path(patch, "geometry")[0]
-        node_count_per_poly_path_in_obj, node_count_per_poly = \
-        search_attribute_matching_name_with_path(patch, "NodeCountPerPolyline")[0]
-        points_ext_array = search_attribute_matching_type_with_path(geometry, "ExternalDataArrayPart|Hdf5Dataset")
-        node_count_ext_array = search_attribute_matching_type_with_path(node_count_per_poly,
-                                                                        "ExternalDataArrayPart|Hdf5Dataset")
-
-        if len(points_ext_array) > 0:
-            point_per_elt = []
-            point_indices = []
-            crs = None
-
-            # Reading points
-            for patch_part_path, patchPart_value in points_ext_array:
-                patch_part_full_path_in_obj = path_path_in_obj + geometry_path_in_obj + patch_part_path
-                for refer_path, refer_value in get_hdf_reference_with_path(patchPart_value):
-                    print(f"refer_path {patch_part_full_path_in_obj}{refer_path} refer_value{refer_value} ")
-                    hdf5_path = get_hdf5_path_from_external_path(
-                        external_path_obj=refer_value,
-                        path_in_root=patch_part_full_path_in_obj + refer_path,
-                        root_obj=energyml_object,
-                        epc=epc,
-                    )
-                    crs = get_crs_obj(
-                        context_obj=refer_value,
-                        path_in_root=patch_part_full_path_in_obj + refer_path,
-                        root_obj=energyml_object,
-                        epc=epc,
-                    )
-                    if hdf5_path is not None:
-                        print(f"Reading h5 file : {hdf5_path}")
-                        point_per_elt = point_per_elt + h5_reader.read_array(hdf5_path, refer_value)
-
-            # Reading polyline indices
-            # for patch_part_path, patchPart_value in node_count_ext_array:
-            #     patch_part_full_path_in_obj = path_path_in_obj + node_count_per_poly_path_in_obj + patch_part_path
-            #     for refer_path, refer_value in get_hdf_reference_with_path(patchPart_value):
-            #         print(f"refer_path: {patch_part_full_path_in_obj}{refer_path} refer_value: {refer_value} ")
-            #         hdf5_path = get_hdf5_path_from_external_path(
-            #                     external_path_obj=refer_value,
-            #                     path_in_root=patch_part_full_path_in_obj + refer_path,
-            #                     root_obj=energyml_object,
-            #                     epc=epc,
-            #         )
-            #         if hdf5_path is not None:
-            #             node_counts_list = h5_reader.read_array(hdf5_path, refer_value)
-            #             idx = 0
-            #             for nb_node in node_counts_list:
-            #                 point_indices.append([x for x in range(idx, idx + nb_node)])
-            #                 idx = idx + nb_node
-
-            node_counts_list = read_array(
-                energyml_array=node_count_per_poly,
-                root_obj=energyml_object,
-                path_in_root=path_path_in_obj + node_count_per_poly_path_in_obj,
-                epc=epc,
-            )
-            idx = 0
-            for nb_node in node_counts_list:
-                point_indices.append([x for x in range(idx, idx + nb_node)])
-                idx = idx + nb_node
-
-            if len(point_per_elt) > 0:
-                # poly_idx = 0
-                # for single_poly_indices in point_indices:
-                meshes.append(PolylineSetMesh(
-                    # identifier=f"{get_obj_identifier(energyml_object)}_patch{patch_idx}_poly{poly_idx}",
-                    identifier=f"{get_obj_identifier(energyml_object)}_patch{patch_idx}",
-                    energyml_object=energyml_object,
-                    crs_object=crs,
-                    point_list=point_per_elt,
-                    line_indices=point_indices
-                ))
-                # poly_idx = poly_idx + 1
-        patch_idx = patch_idx + 1
-
-    return meshes
-
-
-def read_grid2d_representation(energyml_object: Any, epc: Epc, keep_holes=False) -> List[SurfaceMesh]:
-    # h5_reader = HDF5FileReader()
-    meshes = []
-
-    patch_idx = 0
-    for patch_path, patch in search_attribute_matching_name_with_path(energyml_object, "Grid2dPatch"):
-        crs = get_crs_obj(
-            context_obj=patch,
-            path_in_root=patch_path,
-            root_obj=energyml_object,
-            epc=epc,
-        )
-
-        reverse_z_values = is_z_reversed(crs)
-
-        points = read_grid2d_patch(
-            patch=patch,
-            grid2d=energyml_object,
-            path_in_root=patch_path,
-            epc=epc,
-        )
-
-        fa_count = search_attribute_matching_name(patch, "FastestAxisCount")
-        if fa_count is None:
-            fa_count = search_attribute_matching_name(energyml_object, "FastestAxisCount")
-
-        sa_count = search_attribute_matching_name(patch, "SlowestAxisCount")
-        if sa_count is None:
-            sa_count = search_attribute_matching_name(energyml_object, "SlowestAxisCount")
-
-        fa_count = fa_count[0]
-        sa_count = sa_count[0]
-
-        print(f"sa_count {sa_count} fa_count {fa_count}")
-
-        points_no_nan = []
-
-        indice_to_final_indice = {}
-        if keep_holes:
-            for i in range(len(points)):
-                p = points[i]
-                if p[2] != p[2]:  # a NaN
-                    points[i][2] = 0
-                elif reverse_z_values:
-                    points[i][2] = - points[i][2]
-        else:
-            for i in range(len(points)):
-                p = points[i]
-                if p[2] == p[2]:  # not a NaN
-                    if reverse_z_values:
-                        points[i][2] = - points[i][2]
-                    indice_to_final_indice[i] = len(points_no_nan)
-                    points_no_nan.append(p)
-
-        indices = []
-
-        while sa_count*fa_count > len(points):
-            sa_count = sa_count - 1
-            fa_count = fa_count - 1
-
-        while sa_count*fa_count < len(points):
-            sa_count = sa_count + 1
-            fa_count = fa_count + 1
-
-        print(f"sa_count {sa_count} fa_count {fa_count} : {sa_count*fa_count} - {len(points)} ")
-
-        for sa in range(sa_count-1):
-            for fa in range(fa_count-1):
-                line = sa * fa_count
-                if sa+1 == int(sa_count / 2) and fa == int(fa_count / 2):
-                    print(
-                        "\n\t", (line + fa), " : ", (line + fa) in indice_to_final_indice,
-                        "\n\t", (line + fa + 1), " : ", (line + fa + 1) in indice_to_final_indice,
-                        "\n\t", (line + fa_count + fa + 1), " : ", (line + fa_count + fa + 1) in indice_to_final_indice,
-                        "\n\t", (line + fa_count + fa), " : ", (line + fa_count + fa) in indice_to_final_indice,
-                    )
-                if keep_holes:
-                    indices.append(
-                        [
-                            line + fa,
-                            line + fa + 1,
-                            line + fa_count + fa + 1,
-                            line + fa_count + fa,
-                        ]
-                    )
-                elif (
-                    (line + fa) in indice_to_final_indice
-                    and (line + fa + 1) in indice_to_final_indice
-                    and (line + fa_count + fa + 1) in indice_to_final_indice
-                    and (line + fa_count + fa) in indice_to_final_indice
-                ):
-                    indices.append(
-                        [
-                            indice_to_final_indice[line + fa],
-                            indice_to_final_indice[line + fa + 1],
-                            indice_to_final_indice[line + fa_count + fa + 1],
-                            indice_to_final_indice[line + fa_count + fa],
-                        ]
-                    )
-        # print(indices)
-        meshes.append(SurfaceMesh(
-            identifier=f"{get_obj_identifier(energyml_object)}_patch{patch_idx}",
-            energyml_object=energyml_object,
-            crs_object=None,
-            point_list=points if keep_holes else points_no_nan,
-            faces_indices=indices
-        ))
-        patch_idx = patch_idx + 1
-
-    return meshes
-
-
-def read_triangulated_set_representation(energyml_object: Any, epc: Epc) -> List[SurfaceMesh]:
-    meshes = []
-
-    point_offset = 0
-    patch_idx = 0
-    for patch_path, patch in search_attribute_matching_name_with_path(energyml_object, "\\.*Patch"):
-        crs = get_crs_obj(
-            context_obj=patch,
-            path_in_root=patch_path,
-            root_obj=energyml_object,
-            epc=epc,
-        )
-
-        point_list: List[Point] = []
-        for point_path, point_obj in search_attribute_matching_name_with_path(patch, "Geometry.Points"):
-            point_list = point_list + read_array(
-                energyml_array=point_obj,
-                root_obj=energyml_object,
-                path_in_root=patch_path + point_path,
-                epc=epc,
-            )
-
-        triangles_list: List[List[int]] = []
-        for triangles_path, triangles_obj in search_attribute_matching_name_with_path(patch, "Triangles"):
-            triangles_list = triangles_list + read_array(
-                energyml_array=triangles_obj,
-                root_obj=energyml_object,
-                path_in_root=patch_path + triangles_path,
-                epc=epc,
-            )
-        triangles_list = list(map(lambda tr: [ti - point_offset for ti in tr], triangles_list))
-        meshes.append(SurfaceMesh(
-            identifier=f"{get_obj_identifier(energyml_object)}_patch{patch_idx}",
-            energyml_object=energyml_object,
-            crs_object=crs,
-            point_list=point_list,
-            faces_indices=triangles_list
-        ))
-
-        point_offset = point_offset + len(point_list)
-
-    return meshes
-
-
-# MESH FILES
-
-
-def export_off(mesh_list: List[AbstractMesh], out: BytesIO):
-    """
-    Export an :class:`AbstractMesh` into off format.
-    :param mesh_list:
-    :param out:
-    :return:
-    """
-    nb_points = sum(list(map(lambda m: len(m.point_list), mesh_list)))
-    nb_edges = sum(list(map(lambda m: m.get_nb_edges(), mesh_list)))
-    nb_faces = sum(list(map(lambda m: m.get_nb_faces(), mesh_list)))
-
-    out.write(b"OFF\n")
-    out.write(_FILE_HEADER)
-    out.write(f"{nb_points} {nb_faces} {nb_edges}\n".encode('utf-8'))
-
-    points_io = BytesIO()
-    faces_io = BytesIO()
-
-    point_offset = 0
-    for m in mesh_list:
-        export_off_part(
-            off_point_part=points_io,
-            off_face_part=faces_io,
-            points=m.point_list,
-            indices=m.get_indices(),
-            point_offset=point_offset,
-            colors=[],
-        )
-        point_offset = point_offset + len(m.point_list)
-
-    out.write(points_io.getbuffer())
-    out.write(faces_io.getbuffer())
-
-
-def export_off_part(
-        off_point_part: BytesIO,
-        off_face_part: BytesIO,
-        points: List[List[float]],
-        indices: List[List[int]],
-        point_offset: Optional[int] = 0,
-        colors: Optional[List[List[int]]] = None
-) -> None:
-    for p in points:
-        for pi in p:
-            off_point_part.write(f"{pi} ".encode('utf-8'))
-        off_point_part.write(b"\n")
-
-    cpt = 0
-    for face in indices:
-        if len(face) > 1:
-            off_face_part.write(f"{len(face)} ".encode('utf-8'))
-            for pi in face:
-                off_face_part.write(f"{pi + point_offset} ".encode('utf-8'))
-
-            if colors is not None and len(colors) > cpt and colors[cpt] is not None and len(colors[cpt]) > 0:
-                for col in colors[cpt]:
-                    off_face_part.write(f"{col} ".encode('utf-8'))
-
-            off_face_part.write(b"\n")
-
-
-def export_obj(mesh_list: List[AbstractMesh], out: BytesIO, obj_name: Optional[str] = None):
-    """
-    Export an :class:`AbstractMesh` into obj format.
-
-    Each AbstractMesh from the list :param:`mesh_list` will be placed into its own group.
-    :param mesh_list:
-    :param out:
-    :param obj_name:
-    :return:
-    """
-    out.write(f"# Generated by energyml-utils a Geosiris python module\n\n".encode('utf-8'))
-
-    if obj_name is not None:
-        out.write(f"o {obj_name}\n\n".encode('utf-8'))
-
-    point_offset = 0
-    for m in mesh_list:
-        out.write(f"g {m.identifier}\n\n".encode('utf-8'))
-        _export_obj_elt(
-            off_point_part=out,
-            off_face_part=out,
-            points=m.point_list,
-            indices=m.get_indices(),
-            point_offset=point_offset,
-            colors=[],
-            elt_letter="l" if isinstance(m, PolylineSetMesh) else "f"
-        )
-        point_offset = point_offset + len(m.point_list)
-        out.write("\n".encode('utf-8'))
-
-
-def _export_obj_elt(
-        off_point_part: BytesIO,
-        off_face_part: BytesIO,
-        points: List[List[float]],
-        indices: List[List[int]],
-        point_offset: Optional[int] = 0,
-        colors: Optional[List[List[int]]] = None,
-        elt_letter: str = "f",
-) -> None:
-    """
-
-    :param off_point_part:
-    :param off_face_part:
-    :param points:
-    :param indices:
-    :param point_offset:
-    :param colors: currently not supported
-    :param elt_letter: "l" for line and "f" for faces
-    :return:
-    """
-    offset_obj = 1  # OBJ point indices starts at 1 not 0
-    for p in points:
-        if len(p) > 0:
-            off_point_part.write(f"v {' '.join(list(map(lambda xyz: str(xyz), p)))}\n".encode('utf-8'))
-
-    # cpt = 0
-    for face in indices:
-        if len(face) > 1:
-            # off_face_part.write(f"{elt_letter} ".encode('utf-8'))
-            # for pi in face:
-            #     off_face_part.write(f"{pi + point_offset} ".encode('utf-8'))
-            off_point_part.write(
-                f"{elt_letter} {' '.join(list(map(lambda x: str(x + point_offset + offset_obj), face)))}\n".encode(
-                    'utf-8'))
-
-            # if colors is not None and len(colors) > cpt and colors[cpt] is not None and len(colors[cpt]) > 0:
-            #     for col in colors[cpt]:
-            #         off_face_part.write(f"{col} ".encode('utf-8'))
-
-            # off_face_part.write(b"\n")
-
-
-
-
-
-
-
-

Functions

-
-
-def export_obj(mesh_list: List[AbstractMesh], out: _io.BytesIO, obj_name: Optional[str] = None) -
-
-

Export an :class:AbstractMesh into obj format.

-

Each AbstractMesh from the list :param:mesh_list will be placed into its own group. -:param mesh_list: -:param out: -:param obj_name: -:return:

-
- -Expand source code - -
def export_obj(mesh_list: List[AbstractMesh], out: BytesIO, obj_name: Optional[str] = None):
-    """
-    Export an :class:`AbstractMesh` into obj format.
-
-    Each AbstractMesh from the list :param:`mesh_list` will be placed into its own group.
-    :param mesh_list:
-    :param out:
-    :param obj_name:
-    :return:
-    """
-    out.write(f"# Generated by energyml-utils a Geosiris python module\n\n".encode('utf-8'))
-
-    if obj_name is not None:
-        out.write(f"o {obj_name}\n\n".encode('utf-8'))
-
-    point_offset = 0
-    for m in mesh_list:
-        out.write(f"g {m.identifier}\n\n".encode('utf-8'))
-        _export_obj_elt(
-            off_point_part=out,
-            off_face_part=out,
-            points=m.point_list,
-            indices=m.get_indices(),
-            point_offset=point_offset,
-            colors=[],
-            elt_letter="l" if isinstance(m, PolylineSetMesh) else "f"
-        )
-        point_offset = point_offset + len(m.point_list)
-        out.write("\n".encode('utf-8'))
-
-
-
-def export_off(mesh_list: List[AbstractMesh], out: _io.BytesIO) -
-
-

Export an :class:AbstractMesh into off format. -:param mesh_list: -:param out: -:return:

-
- -Expand source code - -
def export_off(mesh_list: List[AbstractMesh], out: BytesIO):
-    """
-    Export an :class:`AbstractMesh` into off format.
-    :param mesh_list:
-    :param out:
-    :return:
-    """
-    nb_points = sum(list(map(lambda m: len(m.point_list), mesh_list)))
-    nb_edges = sum(list(map(lambda m: m.get_nb_edges(), mesh_list)))
-    nb_faces = sum(list(map(lambda m: m.get_nb_faces(), mesh_list)))
-
-    out.write(b"OFF\n")
-    out.write(_FILE_HEADER)
-    out.write(f"{nb_points} {nb_faces} {nb_edges}\n".encode('utf-8'))
-
-    points_io = BytesIO()
-    faces_io = BytesIO()
-
-    point_offset = 0
-    for m in mesh_list:
-        export_off_part(
-            off_point_part=points_io,
-            off_face_part=faces_io,
-            points=m.point_list,
-            indices=m.get_indices(),
-            point_offset=point_offset,
-            colors=[],
-        )
-        point_offset = point_offset + len(m.point_list)
-
-    out.write(points_io.getbuffer())
-    out.write(faces_io.getbuffer())
-
-
-
-def export_off_part(off_point_part: _io.BytesIO, off_face_part: _io.BytesIO, points: List[List[float]], indices: List[List[int]], point_offset: Optional[int] = 0, colors: Optional[List[List[int]]] = None) ‑> None -
-
-
-
- -Expand source code - -
def export_off_part(
-        off_point_part: BytesIO,
-        off_face_part: BytesIO,
-        points: List[List[float]],
-        indices: List[List[int]],
-        point_offset: Optional[int] = 0,
-        colors: Optional[List[List[int]]] = None
-) -> None:
-    for p in points:
-        for pi in p:
-            off_point_part.write(f"{pi} ".encode('utf-8'))
-        off_point_part.write(b"\n")
-
-    cpt = 0
-    for face in indices:
-        if len(face) > 1:
-            off_face_part.write(f"{len(face)} ".encode('utf-8'))
-            for pi in face:
-                off_face_part.write(f"{pi + point_offset} ".encode('utf-8'))
-
-            if colors is not None and len(colors) > cpt and colors[cpt] is not None and len(colors[cpt]) > 0:
-                for col in colors[cpt]:
-                    off_face_part.write(f"{col} ".encode('utf-8'))
-
-            off_face_part.write(b"\n")
-
-
-
-def get_mesh_reader_function(mesh_type_name: str) ‑> Optional[Callable] -
-
-

Returns the name of the potential appropriate function to read an object with type is named mesh_type_name -:param mesh_type_name: the initial type name -:return:

-
- -Expand source code - -
def get_mesh_reader_function(mesh_type_name: str) -> Optional[Callable]:
-    """
-    Returns the name of the potential appropriate function to read an object with type is named mesh_type_name
-    :param mesh_type_name: the initial type name
-    :return:
-    """
-    for name, obj in inspect.getmembers(sys.modules[__name__]):
-        if name == f"read_{snake_case(mesh_type_name)}":
-            return obj
-    return None
-
-
-
-def read_grid2d_representation(energyml_object: Any, epc: Epc, keep_holes=False) ‑> List[SurfaceMesh] -
-
-
-
- -Expand source code - -
def read_grid2d_representation(energyml_object: Any, epc: Epc, keep_holes=False) -> List[SurfaceMesh]:
-    # h5_reader = HDF5FileReader()
-    meshes = []
-
-    patch_idx = 0
-    for patch_path, patch in search_attribute_matching_name_with_path(energyml_object, "Grid2dPatch"):
-        crs = get_crs_obj(
-            context_obj=patch,
-            path_in_root=patch_path,
-            root_obj=energyml_object,
-            epc=epc,
-        )
-
-        reverse_z_values = is_z_reversed(crs)
-
-        points = read_grid2d_patch(
-            patch=patch,
-            grid2d=energyml_object,
-            path_in_root=patch_path,
-            epc=epc,
-        )
-
-        fa_count = search_attribute_matching_name(patch, "FastestAxisCount")
-        if fa_count is None:
-            fa_count = search_attribute_matching_name(energyml_object, "FastestAxisCount")
-
-        sa_count = search_attribute_matching_name(patch, "SlowestAxisCount")
-        if sa_count is None:
-            sa_count = search_attribute_matching_name(energyml_object, "SlowestAxisCount")
-
-        fa_count = fa_count[0]
-        sa_count = sa_count[0]
-
-        print(f"sa_count {sa_count} fa_count {fa_count}")
-
-        points_no_nan = []
-
-        indice_to_final_indice = {}
-        if keep_holes:
-            for i in range(len(points)):
-                p = points[i]
-                if p[2] != p[2]:  # a NaN
-                    points[i][2] = 0
-                elif reverse_z_values:
-                    points[i][2] = - points[i][2]
-        else:
-            for i in range(len(points)):
-                p = points[i]
-                if p[2] == p[2]:  # not a NaN
-                    if reverse_z_values:
-                        points[i][2] = - points[i][2]
-                    indice_to_final_indice[i] = len(points_no_nan)
-                    points_no_nan.append(p)
-
-        indices = []
-
-        while sa_count*fa_count > len(points):
-            sa_count = sa_count - 1
-            fa_count = fa_count - 1
-
-        while sa_count*fa_count < len(points):
-            sa_count = sa_count + 1
-            fa_count = fa_count + 1
-
-        print(f"sa_count {sa_count} fa_count {fa_count} : {sa_count*fa_count} - {len(points)} ")
-
-        for sa in range(sa_count-1):
-            for fa in range(fa_count-1):
-                line = sa * fa_count
-                if sa+1 == int(sa_count / 2) and fa == int(fa_count / 2):
-                    print(
-                        "\n\t", (line + fa), " : ", (line + fa) in indice_to_final_indice,
-                        "\n\t", (line + fa + 1), " : ", (line + fa + 1) in indice_to_final_indice,
-                        "\n\t", (line + fa_count + fa + 1), " : ", (line + fa_count + fa + 1) in indice_to_final_indice,
-                        "\n\t", (line + fa_count + fa), " : ", (line + fa_count + fa) in indice_to_final_indice,
-                    )
-                if keep_holes:
-                    indices.append(
-                        [
-                            line + fa,
-                            line + fa + 1,
-                            line + fa_count + fa + 1,
-                            line + fa_count + fa,
-                        ]
-                    )
-                elif (
-                    (line + fa) in indice_to_final_indice
-                    and (line + fa + 1) in indice_to_final_indice
-                    and (line + fa_count + fa + 1) in indice_to_final_indice
-                    and (line + fa_count + fa) in indice_to_final_indice
-                ):
-                    indices.append(
-                        [
-                            indice_to_final_indice[line + fa],
-                            indice_to_final_indice[line + fa + 1],
-                            indice_to_final_indice[line + fa_count + fa + 1],
-                            indice_to_final_indice[line + fa_count + fa],
-                        ]
-                    )
-        # print(indices)
-        meshes.append(SurfaceMesh(
-            identifier=f"{get_obj_identifier(energyml_object)}_patch{patch_idx}",
-            energyml_object=energyml_object,
-            crs_object=None,
-            point_list=points if keep_holes else points_no_nan,
-            faces_indices=indices
-        ))
-        patch_idx = patch_idx + 1
-
-    return meshes
-
-
-
-def read_mesh_object(energyml_object: Any, epc: Optional[Epc] = None) ‑> List[AbstractMesh] -
-
-

Read and "meshable" object. If :param:energyml_object is not supported, an exception will be raised. -:param energyml_object: -:param epc: -:return:

-
- -Expand source code - -
def read_mesh_object(
-        energyml_object: Any,
-        epc: Optional[Epc] = None
-) -> List[AbstractMesh]:
-    """
-    Read and "meshable" object. If :param:`energyml_object` is not supported, an exception will be raised.
-    :param energyml_object:
-    :param epc:
-    :return:
-    """
-    if isinstance(energyml_object, list):
-        return energyml_object
-    array_type_name = _mesh_name_mapping(type(energyml_object).__name__)
-
-    reader_func = get_mesh_reader_function(array_type_name)
-    if reader_func is not None:
-        return reader_func(
-            energyml_object=energyml_object,
-            epc=epc,
-        )
-    else:
-        print(f"Type {array_type_name} is not supported: function read_{snake_case(array_type_name)} not found")
-        raise Exception(f"Type {array_type_name} is not supported\n\t{energyml_object}: \n\tfunction read_{snake_case(array_type_name)} not found")
-
-
-
-def read_point_set_representation(energyml_object: Any, epc: Epc) ‑> List[PointSetMesh] -
-
-
-
- -Expand source code - -
def read_point_set_representation(energyml_object: Any, epc: Epc) -> List[PointSetMesh]:
-    # pt_geoms = search_attribute_matching_type(point_set, "AbstractGeometry")
-    h5_reader = HDF5FileReader()
-
-    meshes = []
-    for refer_path, refer_value in get_hdf_reference_with_path(energyml_object):
-        try:
-            hdf5_path = get_hdf5_path_from_external_path(
-                external_path_obj=refer_value,
-                path_in_root=refer_path,
-                root_obj=energyml_object,
-                epc=epc,
-            )
-            crs = get_crs_obj(
-                context_obj=refer_value,
-                path_in_root=refer_path,
-                root_obj=energyml_object,
-                epc=epc,
-            )
-            if hdf5_path is not None:
-                print(f"Reading h5 file : {hdf5_path}")
-                meshes.append(PointSetMesh(
-                    identifier=refer_value,
-                    energyml_object=energyml_object,
-                    crs_object=crs,
-                    point_list=h5_reader.read_array(hdf5_path, refer_value)
-                ))
-        except Exception as e:
-            print(f"Error with path {refer_path} -- {energyml_object}")
-            raise e
-    return meshes
-
-
-
-def read_polyline_set_representation(energyml_object: Any, epc: Epc) ‑> List[PointSetMesh] -
-
-
-
- -Expand source code - -
def read_polyline_set_representation(energyml_object: Any, epc: Epc) -> List[PointSetMesh]:
-    # pt_geoms = search_attribute_matching_type(point_set, "AbstractGeometry")
-    h5_reader = HDF5FileReader()
-
-    meshes = []
-
-    patch_idx = 0
-    for path_path_in_obj, patch in search_attribute_matching_name_with_path(energyml_object, "LinePatch"):
-        print(f"patch {patch}")
-        geometry_path_in_obj, geometry = search_attribute_matching_name_with_path(patch, "geometry")[0]
-        node_count_per_poly_path_in_obj, node_count_per_poly = \
-        search_attribute_matching_name_with_path(patch, "NodeCountPerPolyline")[0]
-        points_ext_array = search_attribute_matching_type_with_path(geometry, "ExternalDataArrayPart|Hdf5Dataset")
-        node_count_ext_array = search_attribute_matching_type_with_path(node_count_per_poly,
-                                                                        "ExternalDataArrayPart|Hdf5Dataset")
-
-        if len(points_ext_array) > 0:
-            point_per_elt = []
-            point_indices = []
-            crs = None
-
-            # Reading points
-            for patch_part_path, patchPart_value in points_ext_array:
-                patch_part_full_path_in_obj = path_path_in_obj + geometry_path_in_obj + patch_part_path
-                for refer_path, refer_value in get_hdf_reference_with_path(patchPart_value):
-                    print(f"refer_path {patch_part_full_path_in_obj}{refer_path} refer_value{refer_value} ")
-                    hdf5_path = get_hdf5_path_from_external_path(
-                        external_path_obj=refer_value,
-                        path_in_root=patch_part_full_path_in_obj + refer_path,
-                        root_obj=energyml_object,
-                        epc=epc,
-                    )
-                    crs = get_crs_obj(
-                        context_obj=refer_value,
-                        path_in_root=patch_part_full_path_in_obj + refer_path,
-                        root_obj=energyml_object,
-                        epc=epc,
-                    )
-                    if hdf5_path is not None:
-                        print(f"Reading h5 file : {hdf5_path}")
-                        point_per_elt = point_per_elt + h5_reader.read_array(hdf5_path, refer_value)
-
-            # Reading polyline indices
-            # for patch_part_path, patchPart_value in node_count_ext_array:
-            #     patch_part_full_path_in_obj = path_path_in_obj + node_count_per_poly_path_in_obj + patch_part_path
-            #     for refer_path, refer_value in get_hdf_reference_with_path(patchPart_value):
-            #         print(f"refer_path: {patch_part_full_path_in_obj}{refer_path} refer_value: {refer_value} ")
-            #         hdf5_path = get_hdf5_path_from_external_path(
-            #                     external_path_obj=refer_value,
-            #                     path_in_root=patch_part_full_path_in_obj + refer_path,
-            #                     root_obj=energyml_object,
-            #                     epc=epc,
-            #         )
-            #         if hdf5_path is not None:
-            #             node_counts_list = h5_reader.read_array(hdf5_path, refer_value)
-            #             idx = 0
-            #             for nb_node in node_counts_list:
-            #                 point_indices.append([x for x in range(idx, idx + nb_node)])
-            #                 idx = idx + nb_node
-
-            node_counts_list = read_array(
-                energyml_array=node_count_per_poly,
-                root_obj=energyml_object,
-                path_in_root=path_path_in_obj + node_count_per_poly_path_in_obj,
-                epc=epc,
-            )
-            idx = 0
-            for nb_node in node_counts_list:
-                point_indices.append([x for x in range(idx, idx + nb_node)])
-                idx = idx + nb_node
-
-            if len(point_per_elt) > 0:
-                # poly_idx = 0
-                # for single_poly_indices in point_indices:
-                meshes.append(PolylineSetMesh(
-                    # identifier=f"{get_obj_identifier(energyml_object)}_patch{patch_idx}_poly{poly_idx}",
-                    identifier=f"{get_obj_identifier(energyml_object)}_patch{patch_idx}",
-                    energyml_object=energyml_object,
-                    crs_object=crs,
-                    point_list=point_per_elt,
-                    line_indices=point_indices
-                ))
-                # poly_idx = poly_idx + 1
-        patch_idx = patch_idx + 1
-
-    return meshes
-
-
-
-def read_triangulated_set_representation(energyml_object: Any, epc: Epc) ‑> List[SurfaceMesh] -
-
-
-
- -Expand source code - -
def read_triangulated_set_representation(energyml_object: Any, epc: Epc) -> List[SurfaceMesh]:
-    meshes = []
-
-    point_offset = 0
-    patch_idx = 0
-    for patch_path, patch in search_attribute_matching_name_with_path(energyml_object, "\\.*Patch"):
-        crs = get_crs_obj(
-            context_obj=patch,
-            path_in_root=patch_path,
-            root_obj=energyml_object,
-            epc=epc,
-        )
-
-        point_list: List[Point] = []
-        for point_path, point_obj in search_attribute_matching_name_with_path(patch, "Geometry.Points"):
-            point_list = point_list + read_array(
-                energyml_array=point_obj,
-                root_obj=energyml_object,
-                path_in_root=patch_path + point_path,
-                epc=epc,
-            )
-
-        triangles_list: List[List[int]] = []
-        for triangles_path, triangles_obj in search_attribute_matching_name_with_path(patch, "Triangles"):
-            triangles_list = triangles_list + read_array(
-                energyml_array=triangles_obj,
-                root_obj=energyml_object,
-                path_in_root=patch_path + triangles_path,
-                epc=epc,
-            )
-        triangles_list = list(map(lambda tr: [ti - point_offset for ti in tr], triangles_list))
-        meshes.append(SurfaceMesh(
-            identifier=f"{get_obj_identifier(energyml_object)}_patch{patch_idx}",
-            energyml_object=energyml_object,
-            crs_object=crs,
-            point_list=point_list,
-            faces_indices=triangles_list
-        ))
-
-        point_offset = point_offset + len(point_list)
-
-    return meshes
-
-
-
-
-
-

Classes

-
-
-class AbstractMesh -(energyml_object: Any = None, crs_object: Any = None, point_list: List[list[float]] = <factory>, identifier: str = None) -
-
-

AbstractMesh(energyml_object: Any = None, crs_object: Any = None, point_list: List[list[float]] = , identifier: str = None)

-
- -Expand source code - -
@dataclass
-class AbstractMesh:
-    energyml_object: Any = field(
-        default=None
-    )
-
-    crs_object: Any = field(
-        default=None
-    )
-
-    point_list: List[Point] = field(
-        default_factory=list,
-    )
-
-    identifier: str = field(
-        default=None,
-    )
-
-    def export_off(self, out: BytesIO) -> None:
-        pass
-
-    def get_nb_edges(self) -> int:
-        return 0
-
-    def get_nb_faces(self) -> int:
-        return 0
-
-    def get_indices(self) -> List[List[int]]:
-        return []
-
-

Subclasses

- -

Class variables

-
-
var crs_object : Any
-
-
-
-
var energyml_object : Any
-
-
-
-
var identifier : str
-
-
-
-
var point_list : List[list[float]]
-
-
-
-
-

Methods

-
-
-def export_off(self, out: _io.BytesIO) ‑> None -
-
-
-
- -Expand source code - -
def export_off(self, out: BytesIO) -> None:
-    pass
-
-
-
-def get_indices(self) ‑> List[List[int]] -
-
-
-
- -Expand source code - -
def get_indices(self) -> List[List[int]]:
-    return []
-
-
-
-def get_nb_edges(self) ‑> int -
-
-
-
- -Expand source code - -
def get_nb_edges(self) -> int:
-    return 0
-
-
-
-def get_nb_faces(self) ‑> int -
-
-
-
- -Expand source code - -
def get_nb_faces(self) -> int:
-    return 0
-
-
-
-
-
-class PointSetMesh -(energyml_object: Any = None, crs_object: Any = None, point_list: List[list[float]] = <factory>, identifier: str = None) -
-
-

PointSetMesh(energyml_object: Any = None, crs_object: Any = None, point_list: List[list[float]] = , identifier: str = None)

-
- -Expand source code - -
@dataclass
-class PointSetMesh(AbstractMesh):
-    pass
-
-

Ancestors

- -
-
-class PolylineSetMesh -(energyml_object: Any = None, crs_object: Any = None, point_list: List[list[float]] = <factory>, identifier: str = None, line_indices: List[List[int]] = <factory>) -
-
-

PolylineSetMesh(energyml_object: Any = None, crs_object: Any = None, point_list: List[list[float]] = , identifier: str = None, line_indices: List[List[int]] = )

-
- -Expand source code - -
@dataclass
-class PolylineSetMesh(AbstractMesh):
-    line_indices: List[List[int]] = field(
-        default_factory=list,
-    )
-
-    def get_nb_edges(self) -> int:
-        return sum(list(map(lambda li: len(li) - 1, self.line_indices)))
-
-    def get_nb_faces(self) -> int:
-        return 0
-
-    def get_indices(self) -> List[List[int]]:
-        return self.line_indices
-
-

Ancestors

- -

Class variables

-
-
var line_indices : List[List[int]]
-
-
-
-
-

Methods

-
-
-def get_indices(self) ‑> List[List[int]] -
-
-
-
- -Expand source code - -
def get_indices(self) -> List[List[int]]:
-    return self.line_indices
-
-
-
-def get_nb_edges(self) ‑> int -
-
-
-
- -Expand source code - -
def get_nb_edges(self) -> int:
-    return sum(list(map(lambda li: len(li) - 1, self.line_indices)))
-
-
-
-def get_nb_faces(self) ‑> int -
-
-
-
- -Expand source code - -
def get_nb_faces(self) -> int:
-    return 0
-
-
-
-
-
-class SurfaceMesh -(energyml_object: Any = None, crs_object: Any = None, point_list: List[list[float]] = <factory>, identifier: str = None, faces_indices: List[List[int]] = <factory>) -
-
-

SurfaceMesh(energyml_object: Any = None, crs_object: Any = None, point_list: List[list[float]] = , identifier: str = None, faces_indices: List[List[int]] = )

-
- -Expand source code - -
@dataclass
-class SurfaceMesh(AbstractMesh):
-    faces_indices: List[List[int]] = field(
-        default_factory=list,
-    )
-
-    def get_nb_edges(self) -> int:
-        return sum(list(map(lambda li: len(li) - 1, self.faces_indices)))
-
-    def get_nb_faces(self) -> int:
-        return len(self.faces_indices)
-
-    def get_indices(self) -> List[List[int]]:
-        return self.faces_indices
-
-

Ancestors

- -

Class variables

-
-
var faces_indices : List[List[int]]
-
-
-
-
-

Methods

-
-
-def get_indices(self) ‑> List[List[int]] -
-
-
-
- -Expand source code - -
def get_indices(self) -> List[List[int]]:
-    return self.faces_indices
-
-
-
-def get_nb_edges(self) ‑> int -
-
-
-
- -Expand source code - -
def get_nb_edges(self) -> int:
-    return sum(list(map(lambda li: len(li) - 1, self.faces_indices)))
-
-
-
-def get_nb_faces(self) ‑> int -
-
-
-
- -Expand source code - -
def get_nb_faces(self) -> int:
-    return len(self.faces_indices)
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/energyml-utils/docs/src/energyml/utils/epc.html b/energyml-utils/docs/src/energyml/utils/epc.html deleted file mode 100644 index b0204a4..0000000 --- a/energyml-utils/docs/src/energyml/utils/epc.html +++ /dev/null @@ -1,1900 +0,0 @@ - - - - - - -src.energyml.utils.epc API documentation - - - - - - - - - - - -
-
-
-

Module src.energyml.utils.epc

-
-
-

This example module shows various types of documentation available for use -with pydoc. -To generate HTML documentation for this module issue the -command:

-
pydoc -w foo
-
-
- -Expand source code - -
# Copyright (c) 2023-2024 Geosiris.
-# SPDX-License-Identifier: Apache-2.0
-"""
-This example module shows various types of documentation available for use
-with pydoc.  To generate HTML documentation for this module issue the
-command:
-
-    pydoc -w foo
-
-"""
-
-import datetime
-import re
-import zipfile
-from dataclasses import dataclass, field
-from enum import Enum
-from io import BytesIO
-from typing import List, Any, Union, Dict, Callable, Optional, Tuple
-
-from energyml.opc.opc import CoreProperties, Relationships, Types, Default, Relationship, Override
-from xsdata.exceptions import ParserError
-from xsdata.formats.dataclass.models.generics import DerivedElement
-
-from .introspection import (
-    get_class_from_content_type,
-    get_obj_type, search_attribute_matching_type, get_obj_version, get_obj_uuid,
-    get_object_type_for_file_path_from_class, get_content_type_from_class, get_direct_dor_list
-)
-from .manager import get_class_pkg, get_class_pkg_version
-from .serialization import (
-    serialize_xml, read_energyml_xml_str, read_energyml_xml_bytes, read_energyml_xml_bytes_as_class
-)
-from .xml import is_energyml_content_type
-
-RELS_CONTENT_TYPE = "application/vnd.openxmlformats-package.core-properties+xml"
-RELS_FOLDER_NAME = "_rels"
-
-
-class NoCrsException(Exception):
-    pass
-
-
-@dataclass
-class ObjectNotFoundNotException(Exception):
-    obj_id: str = field(
-        default=None
-    )
-
-
-class EpcExportVersion(Enum):
-    """EPC export version."""
-    #: Classical export
-    CLASSIC = 1
-    #: Export with objet path sorted by package (eml/resqml/witsml/prodml)
-    EXPANDED = 2
-
-
-class EPCRelsRelationshipType(Enum):
-    #: The object in Target is the destination of the relationship.
-    DESTINATION_OBJECT = "destinationObject"
-    #: The current object is the source in the relationship with the target object.
-    SOURCE_OBJECT = "sourceObject"
-    #: The target object is a proxy object for an external data object (HDF5 file).
-    ML_TO_EXTERNAL_PART_PROXY = "mlToExternalPartProxy"
-    #: The current object is used as a proxy object by the target object.
-    EXTERNAL_PART_PROXY_TO_ML = "externalPartProxyToMl"
-    #: The target is a resource outside of the EPC package. Note that TargetMode should be "External"
-    #: for this relationship.
-    EXTERNAL_RESOURCE = "externalResource"
-    #: The object in Target is a media representation for the current object. As a guideline, media files
-    #: should be stored in a "media" folder in the ROOT of the package.
-    DestinationMedia = "destinationMedia"
-    #: The current object is a media representation for the object in Target.
-    SOURCE_MEDIA = "sourceMedia"
-    #: The target is part of a larger data object that has been chunked into several smaller files
-    CHUNKED_PART = "chunkedPart"
-    #: /!\ not in the norm
-    EXTENDED_CORE_PROPERTIES = "extended-core-properties"
-
-    def get_type(self) -> str:
-        match self:
-            case EPCRelsRelationshipType.EXTENDED_CORE_PROPERTIES:
-                return "http://schemas.f2i-consulting.com/package/2014/relationships/" + str(self.value)
-            case (
-            EPCRelsRelationshipType.CHUNKED_PART
-            | EPCRelsRelationshipType.DESTINATION_OBJECT
-            | EPCRelsRelationshipType.SOURCE_OBJECT
-            | EPCRelsRelationshipType.ML_TO_EXTERNAL_PART_PROXY
-            | EPCRelsRelationshipType.EXTERNAL_PART_PROXY_TO_ML
-            | EPCRelsRelationshipType.EXTERNAL_RESOURCE
-            | EPCRelsRelationshipType.DestinationMedia
-            | EPCRelsRelationshipType.SOURCE_MEDIA
-            | _
-            ):
-                return "http://schemas.energistics.org/package/2012/relationships/" + str(self.value)
-
-
-@dataclass
-class RawFile:
-    path: str = field(default="_")
-    content: BytesIO = field(default=None)
-
-
-@dataclass
-class Epc:
-    """
-    A class that represent an EPC file content
-    """
-    # content_type: List[str] = field(
-    #     default_factory=list,
-    # )
-
-    export_version: EpcExportVersion = field(
-        default=EpcExportVersion.CLASSIC
-    )
-
-    core_props: CoreProperties = field(default=None)
-
-    """ xml files refered in the [Content_Types].xml  """
-    energyml_objects: List = field(
-        default_factory=list,
-    )
-
-    """ Other files content like pdf etc """
-    raw_files: List[RawFile] = field(
-        default_factory=list,
-    )
-
-    """ A list of external files. It ca be used to link hdf5 files """
-    external_files_path: List[str] = field(
-        default_factory=list,
-    )
-
-    """ 
-    Additional rels for objects. Key is the object (same than in @energyml_objects) and value is a list of
-    RelationShip. This can be used to link an HDF5 to an ExternalPartReference in resqml 2.0.1
-    Key is a value returned by @get_obj_identifier
-    """
-    additional_rels: Dict[str, List[Relationship]] = field(
-        default_factory=lambda: {}
-    )
-
-    """
-    Epc file path. Used when loaded from a local file or for export
-    """
-    epc_file_path: Optional[str] = field(
-        default=None
-    )
-
-    def __str__(self):
-        return (
-                "EPC file (" + str(self.export_version) + ") "
-                + f"{len(self.energyml_objects)} energyml objects and {len(self.raw_files)} other files {[f.path for f in self.raw_files]}"
-                # + f"\n{[serialize_json(ar) for ar in self.additional_rels]}"
-        )
-
-    # EXPORT functions
-
-    def gen_opc_content_type(self) -> Types:
-        """
-        Generates a :class:`Types` instance and fill it with energyml objects :class:`Override` values
-        :return:
-        """
-        ct = Types()
-        rels_default = Default()
-        rels_default.content_type = RELS_CONTENT_TYPE
-        rels_default.extension = "rels"
-
-        ct.default = [rels_default]
-
-        ct.override = []
-        for e_obj in self.energyml_objects:
-            ct.override.append(Override(
-                content_type=get_content_type_from_class(type(e_obj)),
-                part_name=gen_energyml_object_path(e_obj, self.export_version),
-            ))
-
-        if self.core_props is not None:
-            ct.override.append(Override(
-                content_type=get_content_type_from_class(self.core_props),
-                part_name=gen_core_props_path(self.export_version),
-            ))
-
-        return ct
-
-    def export_file(self, path: Optional[str] = None) -> None:
-        """
-        Export the epc file. If :param:`path` is None, the epc 'self.epc_file_path' is used
-        :param path:
-        :return:
-        """
-        if path is None:
-            path = self.epc_file_path
-        epc_io = self.export_io()
-        with open(path, "wb") as f:
-            f.write(epc_io.getbuffer())
-
-    def export_io(self) -> BytesIO:
-        """
-        Export the epc file into a :class:`BytesIO` instance. The result is an 'in-memory' zip file.
-        :return:
-        """
-        zip_buffer = BytesIO()
-
-        with zipfile.ZipFile(zip_buffer, "a", zipfile.ZIP_DEFLATED, False) as zip_file:
-            #  Energyml objects
-            for e_obj in self.energyml_objects:
-                e_path = gen_energyml_object_path(e_obj, self.export_version)
-                zip_info = zipfile.ZipInfo(filename=e_path, date_time=datetime.datetime.now().timetuple()[:6])
-                data = serialize_xml(e_obj)
-                zip_file.writestr(zip_info, data)
-
-            # Rels
-            for rels_path, rels in self.compute_rels().items():
-                zip_info = zipfile.ZipInfo(filename=rels_path, date_time=datetime.datetime.now().timetuple()[:6])
-                data = serialize_xml(rels)
-                zip_file.writestr(zip_info, data)
-
-            # CoreProps
-            if self.core_props is not None:
-                zip_info = zipfile.ZipInfo(filename=gen_core_props_path(self.export_version),
-                                           date_time=datetime.datetime.now().timetuple()[:6])
-                data = serialize_xml(self.core_props)
-                zip_file.writestr(zip_info, data)
-
-            # ContentType
-            zip_info = zipfile.ZipInfo(filename=get_epc_content_type_path(),
-                                       date_time=datetime.datetime.now().timetuple()[:6])
-            data = serialize_xml(self.gen_opc_content_type())
-            zip_file.writestr(zip_info, data)
-
-        return zip_buffer
-
-    def compute_rels(self) -> Dict[str, Relationships]:
-        """
-        Returns a dict containing for each objet, the rels xml file path as key and the RelationShips object as value
-        :return:
-        """
-        dor_relation = get_reverse_dor_list(self.energyml_objects)
-
-        # destObject
-        rels = {
-            obj_id: [
-                Relationship(
-                    target=gen_energyml_object_path(target_obj, self.export_version),
-                    type_value=EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(),
-                    id=f"_{obj_id}_{get_obj_type(target_obj)}_{get_obj_identifier(target_obj)}",
-                ) for target_obj in target_obj_list
-            ]
-            for obj_id, target_obj_list in dor_relation.items()
-        }
-        # sourceObject
-        for obj in self.energyml_objects:
-            obj_id = get_obj_identifier(obj)
-            if obj_id not in rels:
-                rels[obj_id] = []
-            for target_obj in get_direct_dor_list(obj):
-                rels[obj_id].append(Relationship(
-                    target=gen_energyml_object_path(target_obj, self.export_version),
-                    type_value=EPCRelsRelationshipType.SOURCE_OBJECT.get_type(),
-                    id=f"_{obj_id}_{get_obj_type(target_obj)}_{get_obj_identifier(target_obj)}",
-                ))
-
-        map_obj_id_to_obj = {
-            get_obj_identifier(obj): obj
-            for obj in self.energyml_objects
-        }
-
-        obj_rels = {
-            gen_rels_path(energyml_object=map_obj_id_to_obj.get(obj_id), export_version=self.export_version): Relationships(
-                relationship=obj_rels + (self.additional_rels[obj_id] if obj_id in self.additional_rels else []),
-
-            )
-            for obj_id, obj_rels in rels.items()
-        }
-
-        # CoreProps
-        if self.core_props is not None:
-            obj_rels[gen_rels_path(self.core_props)] = Relationships(
-                relationship=[
-                    Relationship(
-                        target=gen_core_props_path(),
-                        type_value=EPCRelsRelationshipType.EXTENDED_CORE_PROPERTIES.get_type(),
-                        id="CoreProperties"
-                    )
-                ]
-            )
-
-        return obj_rels
-
-    # -----------
-
-    def get_object_by_uuid(self, uuid: str) -> List[Any]:
-        """
-        Search all objects with the uuid :param:`uuid`.
-        :param uuid:
-        :return:
-        """
-        return list(filter(lambda o: get_obj_uuid(o) == uuid, self.energyml_objects))
-
-    def get_object_by_identifier(self, identifier: str) -> Optional[Any]:
-        """
-        Search an object by its identifier.
-        :param identifier: given by the function :func:`get_obj_identifier`
-        :return:
-        """
-        for o in self.energyml_objects:
-            if get_obj_identifier(o) == identifier:
-                return o
-        return None
-
-    def get_epc_file_folder(self) -> Optional[str]:
-        if self.epc_file_path is not None and len(self.epc_file_path) > 0:
-            folders_and_name = re.split(r"[\\/]", self.epc_file_path)
-            if len(folders_and_name) > 1:
-                return "/".join(folders_and_name[:-1])
-            else:
-                return ""
-        return None
-
-    # Class methods
-
-    @classmethod
-    def read_file(cls, epc_file_path: str):
-        with open(epc_file_path, "rb") as f:
-            epc = cls.read_stream(BytesIO(f.read()))
-            epc.epc_file_path = epc_file_path
-            return epc
-
-    @classmethod
-    def read_stream(cls, epc_file_io: BytesIO):  # returns an Epc instance
-        """
-        :param epc_file_io:
-        :return: an :class:`EPC` instance
-        """
-        try:
-            _read_files = []
-            obj_list = []
-            raw_file_list = []
-            additional_rels = {}
-            core_props = None
-            with zipfile.ZipFile(epc_file_io, "r", zipfile.ZIP_DEFLATED) as epc_file:
-                content_type_file_name = get_epc_content_type_path()
-                content_type_info = None
-                try:
-                    content_type_info = epc_file.getinfo(content_type_file_name)
-                except KeyError:
-                    for info in epc_file.infolist():
-                        if info.filename.lower() == content_type_file_name.lower():
-                            content_type_info = info
-                            break
-
-                _read_files.append(content_type_file_name)
-
-                if content_type_info is None:
-                    print(f"No {content_type_file_name} file found")
-                else:
-                    content_type_obj: Types = read_energyml_xml_bytes(epc_file.read(content_type_file_name))
-                    path_to_obj = {}
-                    for ov in content_type_obj.override:
-                        ov_ct = ov.content_type
-                        ov_path = ov.part_name
-                        # print(ov_ct)
-                        while ov_path.startswith("/") or ov_path.startswith("\\"):
-                            ov_path = ov_path[1:]
-                        if is_energyml_content_type(ov_ct):
-                            _read_files.append(ov_path)
-                            try:
-                                ov_obj = read_energyml_xml_bytes_as_class(
-                                    epc_file.read(ov_path),
-                                    get_class_from_content_type(ov_ct)
-                                )
-                                if isinstance(ov_obj, DerivedElement):
-                                    ov_obj = ov_obj.value
-                                path_to_obj[ov_path] = ov_obj
-                                obj_list.append(ov_obj)
-                            except ParserError as e:
-                                print(f"Epc.@read_stream failed to parse file {ov_path} for content-type: {ov_ct} => {get_class_from_content_type(ov_ct)}")
-                                raise e
-                        elif get_class_from_content_type(ov_ct) == CoreProperties:
-                            _read_files.append(ov_path)
-                            core_props = read_energyml_xml_bytes_as_class(epc_file.read(ov_path), CoreProperties)
-
-                    for f_info in epc_file.infolist():
-                        if f_info.filename not in _read_files:
-                            _read_files.append(f_info.filename)
-                            if not f_info.filename.lower().endswith(".rels"):
-                                try:
-                                    raw_file_list.append(
-                                        RawFile(
-                                            path=f_info.filename,
-                                            content=BytesIO(epc_file.read(f_info.filename)),
-                                        )
-                                    )
-                                except IOError as e:
-                                    print(e)
-                            else:  # rels
-                                # print(f"reading rels {f_info.filename}")
-                                rels_folder, rels_file_name = get_file_folder_and_name_from_path(f_info.filename)
-                                while rels_folder.endswith("/"):
-                                    rels_folder = rels_folder[:-1]
-                                obj_folder = rels_folder[:rels_folder.rindex("/") + 1] if "/" in rels_folder else ""
-                                obj_file_name = rels_file_name[:-5]  # removing the ".rels"
-                                rels_file: Relationships = read_energyml_xml_bytes_as_class(
-                                    epc_file.read(f_info.filename),
-                                    Relationships
-                                )
-                                obj_path = obj_folder + obj_file_name
-                                if obj_path in path_to_obj:
-                                    try:
-                                        additional_rels_key = get_obj_identifier(path_to_obj[obj_path])
-                                        for rel in rels_file.relationship:
-                                            # print(f"\t\t{rel.type_value}")
-                                            if (rel.type_value != EPCRelsRelationshipType.DESTINATION_OBJECT.get_type()
-                                                    and rel.type_value != EPCRelsRelationshipType.SOURCE_OBJECT.get_type()
-                                                    and rel.type_value != EPCRelsRelationshipType.EXTENDED_CORE_PROPERTIES.get_type()
-                                            ):  # not a computable relation
-                                                if additional_rels_key not in additional_rels:
-                                                    additional_rels[additional_rels_key] = []
-                                                additional_rels[additional_rels_key].append(rel)
-                                    except Exception as e:
-                                        print(f"Error with obj path {obj_path} {path_to_obj[obj_path]}")
-                                        raise e
-                                else:
-                                    print(f"xml file {obj_path} not found in EPC (rels is not associate to any object)")
-
-            return Epc(energyml_objects=obj_list,
-                       raw_files=raw_file_list,
-                       core_props=core_props,
-                       additional_rels=additional_rels
-                       )
-        except zipfile.BadZipFile as error:
-            print(error)
-
-        return None
-
-
-#     ______                                      __   ____                 __  _
-#    / ____/___  ___  _________ ___  ______ ___  / /  / __/_  ______  _____/ /_(_)___  ____  _____
-#   / __/ / __ \/ _ \/ ___/ __ `/ / / / __ `__ \/ /  / /_/ / / / __ \/ ___/ __/ / __ \/ __ \/ ___/
-#  / /___/ / / /  __/ /  / /_/ / /_/ / / / / / / /  / __/ /_/ / / / / /__/ /_/ / /_/ / / / (__  )
-# /_____/_/ /_/\___/_/   \__, /\__, /_/ /_/ /_/_/  /_/  \__,_/_/ /_/\___/\__/_/\____/_/ /_/____/
-#                       /____//____/
-
-
-def get_obj_identifier(obj: Any) -> str:
-    """
-    Generates an objet identifier as : 'OBJ_UUID.OBJ_VERSION'
-    If the object version is None, the result is 'OBJ_UUID.'
-    :param obj:
-    :return: str
-    """
-    obj_obj_version = get_obj_version(obj)
-    if obj_obj_version is None:
-        obj_obj_version = ""
-    obj_uuid = get_obj_uuid(obj)
-    return f"{obj_uuid}.{obj_obj_version}"
-
-
-def get_reverse_dor_list(obj_list: List[Any], key_func: Callable = get_obj_identifier) -> Dict[str, List[Any]]:
-    """
-    Compute a dict with 'OBJ_UUID.OBJ_VERSION' as Key, and list of DOR that reference it.
-    If the object version is None, key is 'OBJ_UUID.'
-    :param obj_list:
-    :param key_func: a callable to create the key of the dict from the object instance
-    :return: str
-    """
-    rels = {}
-    for obj in obj_list:
-        for dor in search_attribute_matching_type(obj, "DataObjectReference", return_self=False):
-            key = key_func(dor)
-            if key not in rels:
-                rels[key] = []
-            rels[key] = rels.get(key, []) + [obj]
-    return rels
-
-
-# PATHS
-
-
-def gen_core_props_path(export_version: EpcExportVersion = EpcExportVersion.CLASSIC):
-    return "docProps/core.xml"
-
-
-def gen_energyml_object_path(energyml_object: Union[str, Any],
-                             export_version: EpcExportVersion = EpcExportVersion.CLASSIC):
-    """
-    Generate a path to store the :param:`energyml_object` into an epc file (depending on the :param:`export_version`)
-    :param energyml_object:
-    :param export_version:
-    :return:
-    """
-    if isinstance(energyml_object, str):
-        energyml_object = read_energyml_xml_str(energyml_object)
-
-    obj_type = get_object_type_for_file_path_from_class(energyml_object.__class__)
-
-    pkg = get_class_pkg(energyml_object)
-    pkg_version = get_class_pkg_version(energyml_object)
-    object_version = get_obj_version(energyml_object)
-    uuid = get_obj_uuid(energyml_object)
-
-    # if object_version is None:
-    #     object_version = "0"
-
-    if export_version == EpcExportVersion.EXPANDED:
-        return f"namespace_{pkg}{pkg_version.replace('.', '')}/{uuid}{('/version_' + object_version) if object_version is not None else ''}/{obj_type}_{uuid}.xml"
-    else:
-        return obj_type + "_" + uuid + ".xml"
-
-
-def get_file_folder_and_name_from_path(path: str) -> Tuple[str, str]:
-    """
-    Returns a tuple (FOLDER_PATH, FILE_NAME)
-    :param path:
-    :return:
-    """
-    obj_folder = path[:path.rindex("/") + 1] if "/" in path else ""
-    obj_file_name = path[path.rindex("/") + 1:] if "/" in path else path
-    return obj_folder, obj_file_name
-
-
-def gen_rels_path(energyml_object: Any,
-                  export_version: EpcExportVersion = EpcExportVersion.CLASSIC
-                  ) -> str:
-    """
-    Generate a path to store the :param:`energyml_object` rels file into an epc file
-    (depending on the :param:`export_version`)
-    :param energyml_object:
-    :param export_version:
-    :return:
-    """
-    if isinstance(obj, CoreProperties):
-        return f"{RELS_FOLDER_NAME}/.rels"
-    else:
-        obj_path = gen_energyml_object_path(obj, export_version)
-        obj_folder, obj_file_name = get_file_folder_and_name_from_path(obj_path, )
-        return f"{obj_folder}{RELS_FOLDER_NAME}/{obj_file_name}.rels"
-
-
-def get_epc_content_type_path(export_version: EpcExportVersion = EpcExportVersion.CLASSIC) -> str:
-    """
-    Generate a path to store the "[Content_Types].xml" file into an epc file
-    (depending on the :param:`export_version`)
-    :return:
-    """
-    return "[Content_Types].xml"
-
-
-
-
-
-
-
-

Functions

-
-
-def gen_core_props_path(export_version: EpcExportVersion = EpcExportVersion.CLASSIC) -
-
-
-
- -Expand source code - -
def gen_core_props_path(export_version: EpcExportVersion = EpcExportVersion.CLASSIC):
-    return "docProps/core.xml"
-
-
-
-def gen_energyml_object_path(energyml_object: Union[str, Any], export_version: EpcExportVersion = EpcExportVersion.CLASSIC) -
-
-

Generate a path to store the :param:energyml_object into an epc file (depending on the :param:export_version) -:param energyml_object: -:param export_version: -:return:

-
- -Expand source code - -
def gen_energyml_object_path(energyml_object: Union[str, Any],
-                             export_version: EpcExportVersion = EpcExportVersion.CLASSIC):
-    """
-    Generate a path to store the :param:`energyml_object` into an epc file (depending on the :param:`export_version`)
-    :param energyml_object:
-    :param export_version:
-    :return:
-    """
-    if isinstance(energyml_object, str):
-        energyml_object = read_energyml_xml_str(energyml_object)
-
-    obj_type = get_object_type_for_file_path_from_class(energyml_object.__class__)
-
-    pkg = get_class_pkg(energyml_object)
-    pkg_version = get_class_pkg_version(energyml_object)
-    object_version = get_obj_version(energyml_object)
-    uuid = get_obj_uuid(energyml_object)
-
-    # if object_version is None:
-    #     object_version = "0"
-
-    if export_version == EpcExportVersion.EXPANDED:
-        return f"namespace_{pkg}{pkg_version.replace('.', '')}/{uuid}{('/version_' + object_version) if object_version is not None else ''}/{obj_type}_{uuid}.xml"
-    else:
-        return obj_type + "_" + uuid + ".xml"
-
-
-
-def gen_rels_path(energyml_object: Any, export_version: EpcExportVersion = EpcExportVersion.CLASSIC) ‑> str -
-
-

Generate a path to store the :param:energyml_object rels file into an epc file -(depending on the :param:export_version) -:param energyml_object: -:param export_version: -:return:

-
- -Expand source code - -
def gen_rels_path(energyml_object: Any,
-                  export_version: EpcExportVersion = EpcExportVersion.CLASSIC
-                  ) -> str:
-    """
-    Generate a path to store the :param:`energyml_object` rels file into an epc file
-    (depending on the :param:`export_version`)
-    :param energyml_object:
-    :param export_version:
-    :return:
-    """
-    if isinstance(obj, CoreProperties):
-        return f"{RELS_FOLDER_NAME}/.rels"
-    else:
-        obj_path = gen_energyml_object_path(obj, export_version)
-        obj_folder, obj_file_name = get_file_folder_and_name_from_path(obj_path, )
-        return f"{obj_folder}{RELS_FOLDER_NAME}/{obj_file_name}.rels"
-
-
-
-def get_epc_content_type_path(export_version: EpcExportVersion = EpcExportVersion.CLASSIC) ‑> str -
-
-

Generate a path to store the "[Content_Types].xml" file into an epc file -(depending on the :param:export_version) -:return:

-
- -Expand source code - -
def get_epc_content_type_path(export_version: EpcExportVersion = EpcExportVersion.CLASSIC) -> str:
-    """
-    Generate a path to store the "[Content_Types].xml" file into an epc file
-    (depending on the :param:`export_version`)
-    :return:
-    """
-    return "[Content_Types].xml"
-
-
-
-def get_file_folder_and_name_from_path(path: str) ‑> Tuple[str, str] -
-
-

Returns a tuple (FOLDER_PATH, FILE_NAME) -:param path: -:return:

-
- -Expand source code - -
def get_file_folder_and_name_from_path(path: str) -> Tuple[str, str]:
-    """
-    Returns a tuple (FOLDER_PATH, FILE_NAME)
-    :param path:
-    :return:
-    """
-    obj_folder = path[:path.rindex("/") + 1] if "/" in path else ""
-    obj_file_name = path[path.rindex("/") + 1:] if "/" in path else path
-    return obj_folder, obj_file_name
-
-
-
-def get_obj_identifier(obj: Any) ‑> str -
-
-

Generates an objet identifier as : 'OBJ_UUID.OBJ_VERSION' -If the object version is None, the result is 'OBJ_UUID.' -:param obj: -:return: str

-
- -Expand source code - -
def get_obj_identifier(obj: Any) -> str:
-    """
-    Generates an objet identifier as : 'OBJ_UUID.OBJ_VERSION'
-    If the object version is None, the result is 'OBJ_UUID.'
-    :param obj:
-    :return: str
-    """
-    obj_obj_version = get_obj_version(obj)
-    if obj_obj_version is None:
-        obj_obj_version = ""
-    obj_uuid = get_obj_uuid(obj)
-    return f"{obj_uuid}.{obj_obj_version}"
-
-
-
-def get_reverse_dor_list(obj_list: List[Any], key_func: Callable = <function get_obj_identifier>) ‑> Dict[str, List[Any]] -
-
-

Compute a dict with 'OBJ_UUID.OBJ_VERSION' as Key, and list of DOR that reference it. -If the object version is None, key is 'OBJ_UUID.' -:param obj_list: -:param key_func: a callable to create the key of the dict from the object instance -:return: str

-
- -Expand source code - -
def get_reverse_dor_list(obj_list: List[Any], key_func: Callable = get_obj_identifier) -> Dict[str, List[Any]]:
-    """
-    Compute a dict with 'OBJ_UUID.OBJ_VERSION' as Key, and list of DOR that reference it.
-    If the object version is None, key is 'OBJ_UUID.'
-    :param obj_list:
-    :param key_func: a callable to create the key of the dict from the object instance
-    :return: str
-    """
-    rels = {}
-    for obj in obj_list:
-        for dor in search_attribute_matching_type(obj, "DataObjectReference", return_self=False):
-            key = key_func(dor)
-            if key not in rels:
-                rels[key] = []
-            rels[key] = rels.get(key, []) + [obj]
-    return rels
-
-
-
-
-
-

Classes

-
-
-class EPCRelsRelationshipType -(*args, **kwds) -
-
-

Create a collection of name/value pairs.

-

Example enumeration:

-
>>> class Color(Enum):
-...     RED = 1
-...     BLUE = 2
-...     GREEN = 3
-
-

Access them by:

-
    -
  • attribute access::
  • -
-
>>> Color.RED
-<Color.RED: 1>
-
-
    -
  • value lookup:
  • -
-
>>> Color(1)
-<Color.RED: 1>
-
-
    -
  • name lookup:
  • -
-
>>> Color['RED']
-<Color.RED: 1>
-
-

Enumerations can be iterated over, and know how many members they have:

-
>>> len(Color)
-3
-
-
>>> list(Color)
-[<Color.RED: 1>, <Color.BLUE: 2>, <Color.GREEN: 3>]
-
-

Methods can be added to enumerations, and members can have their own -attributes – see the documentation for details.

-
- -Expand source code - -
class EPCRelsRelationshipType(Enum):
-    #: The object in Target is the destination of the relationship.
-    DESTINATION_OBJECT = "destinationObject"
-    #: The current object is the source in the relationship with the target object.
-    SOURCE_OBJECT = "sourceObject"
-    #: The target object is a proxy object for an external data object (HDF5 file).
-    ML_TO_EXTERNAL_PART_PROXY = "mlToExternalPartProxy"
-    #: The current object is used as a proxy object by the target object.
-    EXTERNAL_PART_PROXY_TO_ML = "externalPartProxyToMl"
-    #: The target is a resource outside of the EPC package. Note that TargetMode should be "External"
-    #: for this relationship.
-    EXTERNAL_RESOURCE = "externalResource"
-    #: The object in Target is a media representation for the current object. As a guideline, media files
-    #: should be stored in a "media" folder in the ROOT of the package.
-    DestinationMedia = "destinationMedia"
-    #: The current object is a media representation for the object in Target.
-    SOURCE_MEDIA = "sourceMedia"
-    #: The target is part of a larger data object that has been chunked into several smaller files
-    CHUNKED_PART = "chunkedPart"
-    #: /!\ not in the norm
-    EXTENDED_CORE_PROPERTIES = "extended-core-properties"
-
-    def get_type(self) -> str:
-        match self:
-            case EPCRelsRelationshipType.EXTENDED_CORE_PROPERTIES:
-                return "http://schemas.f2i-consulting.com/package/2014/relationships/" + str(self.value)
-            case (
-            EPCRelsRelationshipType.CHUNKED_PART
-            | EPCRelsRelationshipType.DESTINATION_OBJECT
-            | EPCRelsRelationshipType.SOURCE_OBJECT
-            | EPCRelsRelationshipType.ML_TO_EXTERNAL_PART_PROXY
-            | EPCRelsRelationshipType.EXTERNAL_PART_PROXY_TO_ML
-            | EPCRelsRelationshipType.EXTERNAL_RESOURCE
-            | EPCRelsRelationshipType.DestinationMedia
-            | EPCRelsRelationshipType.SOURCE_MEDIA
-            | _
-            ):
-                return "http://schemas.energistics.org/package/2012/relationships/" + str(self.value)
-
-

Ancestors

-
    -
  • enum.Enum
  • -
-

Class variables

-
-
var CHUNKED_PART
-
-

The target is part of a larger data object that has been chunked into several smaller files

-
-
var DESTINATION_OBJECT
-
-

The object in Target is the destination of the relationship.

-
-
var DestinationMedia
-
-

The object in Target is a media representation for the current object. As a guideline, media files -should be stored in a "media" folder in the ROOT of the package.

-
-
var EXTENDED_CORE_PROPERTIES
-
-

/!\ not in the norm

-
-
var EXTERNAL_PART_PROXY_TO_ML
-
-

The current object is used as a proxy object by the target object.

-
-
var EXTERNAL_RESOURCE
-
-

The target is a resource outside of the EPC package. Note that TargetMode should be "External" -for this relationship.

-
-
var ML_TO_EXTERNAL_PART_PROXY
-
-

The target object is a proxy object for an external data object (HDF5 file).

-
-
var SOURCE_MEDIA
-
-

The current object is a media representation for the object in Target.

-
-
var SOURCE_OBJECT
-
-

The current object is the source in the relationship with the target object.

-
-
-

Methods

-
-
-def get_type(self) ‑> str -
-
-
-
- -Expand source code - -
def get_type(self) -> str:
-    match self:
-        case EPCRelsRelationshipType.EXTENDED_CORE_PROPERTIES:
-            return "http://schemas.f2i-consulting.com/package/2014/relationships/" + str(self.value)
-        case (
-        EPCRelsRelationshipType.CHUNKED_PART
-        | EPCRelsRelationshipType.DESTINATION_OBJECT
-        | EPCRelsRelationshipType.SOURCE_OBJECT
-        | EPCRelsRelationshipType.ML_TO_EXTERNAL_PART_PROXY
-        | EPCRelsRelationshipType.EXTERNAL_PART_PROXY_TO_ML
-        | EPCRelsRelationshipType.EXTERNAL_RESOURCE
-        | EPCRelsRelationshipType.DestinationMedia
-        | EPCRelsRelationshipType.SOURCE_MEDIA
-        | _
-        ):
-            return "http://schemas.energistics.org/package/2012/relationships/" + str(self.value)
-
-
-
-
-
-class Epc -(export_version: EpcExportVersion = EpcExportVersion.CLASSIC, core_props: energyml.opc.opc.CoreProperties = None, energyml_objects: List = <factory>, raw_files: List[RawFile] = <factory>, external_files_path: List[str] = <factory>, additional_rels: Dict[str, List[energyml.opc.opc.Relationship]] = <factory>, epc_file_path: Optional[str] = None) -
-
-

A class that represent an EPC file content

-
- -Expand source code - -
@dataclass
-class Epc:
-    """
-    A class that represent an EPC file content
-    """
-    # content_type: List[str] = field(
-    #     default_factory=list,
-    # )
-
-    export_version: EpcExportVersion = field(
-        default=EpcExportVersion.CLASSIC
-    )
-
-    core_props: CoreProperties = field(default=None)
-
-    """ xml files refered in the [Content_Types].xml  """
-    energyml_objects: List = field(
-        default_factory=list,
-    )
-
-    """ Other files content like pdf etc """
-    raw_files: List[RawFile] = field(
-        default_factory=list,
-    )
-
-    """ A list of external files. It ca be used to link hdf5 files """
-    external_files_path: List[str] = field(
-        default_factory=list,
-    )
-
-    """ 
-    Additional rels for objects. Key is the object (same than in @energyml_objects) and value is a list of
-    RelationShip. This can be used to link an HDF5 to an ExternalPartReference in resqml 2.0.1
-    Key is a value returned by @get_obj_identifier
-    """
-    additional_rels: Dict[str, List[Relationship]] = field(
-        default_factory=lambda: {}
-    )
-
-    """
-    Epc file path. Used when loaded from a local file or for export
-    """
-    epc_file_path: Optional[str] = field(
-        default=None
-    )
-
-    def __str__(self):
-        return (
-                "EPC file (" + str(self.export_version) + ") "
-                + f"{len(self.energyml_objects)} energyml objects and {len(self.raw_files)} other files {[f.path for f in self.raw_files]}"
-                # + f"\n{[serialize_json(ar) for ar in self.additional_rels]}"
-        )
-
-    # EXPORT functions
-
-    def gen_opc_content_type(self) -> Types:
-        """
-        Generates a :class:`Types` instance and fill it with energyml objects :class:`Override` values
-        :return:
-        """
-        ct = Types()
-        rels_default = Default()
-        rels_default.content_type = RELS_CONTENT_TYPE
-        rels_default.extension = "rels"
-
-        ct.default = [rels_default]
-
-        ct.override = []
-        for e_obj in self.energyml_objects:
-            ct.override.append(Override(
-                content_type=get_content_type_from_class(type(e_obj)),
-                part_name=gen_energyml_object_path(e_obj, self.export_version),
-            ))
-
-        if self.core_props is not None:
-            ct.override.append(Override(
-                content_type=get_content_type_from_class(self.core_props),
-                part_name=gen_core_props_path(self.export_version),
-            ))
-
-        return ct
-
-    def export_file(self, path: Optional[str] = None) -> None:
-        """
-        Export the epc file. If :param:`path` is None, the epc 'self.epc_file_path' is used
-        :param path:
-        :return:
-        """
-        if path is None:
-            path = self.epc_file_path
-        epc_io = self.export_io()
-        with open(path, "wb") as f:
-            f.write(epc_io.getbuffer())
-
-    def export_io(self) -> BytesIO:
-        """
-        Export the epc file into a :class:`BytesIO` instance. The result is an 'in-memory' zip file.
-        :return:
-        """
-        zip_buffer = BytesIO()
-
-        with zipfile.ZipFile(zip_buffer, "a", zipfile.ZIP_DEFLATED, False) as zip_file:
-            #  Energyml objects
-            for e_obj in self.energyml_objects:
-                e_path = gen_energyml_object_path(e_obj, self.export_version)
-                zip_info = zipfile.ZipInfo(filename=e_path, date_time=datetime.datetime.now().timetuple()[:6])
-                data = serialize_xml(e_obj)
-                zip_file.writestr(zip_info, data)
-
-            # Rels
-            for rels_path, rels in self.compute_rels().items():
-                zip_info = zipfile.ZipInfo(filename=rels_path, date_time=datetime.datetime.now().timetuple()[:6])
-                data = serialize_xml(rels)
-                zip_file.writestr(zip_info, data)
-
-            # CoreProps
-            if self.core_props is not None:
-                zip_info = zipfile.ZipInfo(filename=gen_core_props_path(self.export_version),
-                                           date_time=datetime.datetime.now().timetuple()[:6])
-                data = serialize_xml(self.core_props)
-                zip_file.writestr(zip_info, data)
-
-            # ContentType
-            zip_info = zipfile.ZipInfo(filename=get_epc_content_type_path(),
-                                       date_time=datetime.datetime.now().timetuple()[:6])
-            data = serialize_xml(self.gen_opc_content_type())
-            zip_file.writestr(zip_info, data)
-
-        return zip_buffer
-
-    def compute_rels(self) -> Dict[str, Relationships]:
-        """
-        Returns a dict containing for each objet, the rels xml file path as key and the RelationShips object as value
-        :return:
-        """
-        dor_relation = get_reverse_dor_list(self.energyml_objects)
-
-        # destObject
-        rels = {
-            obj_id: [
-                Relationship(
-                    target=gen_energyml_object_path(target_obj, self.export_version),
-                    type_value=EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(),
-                    id=f"_{obj_id}_{get_obj_type(target_obj)}_{get_obj_identifier(target_obj)}",
-                ) for target_obj in target_obj_list
-            ]
-            for obj_id, target_obj_list in dor_relation.items()
-        }
-        # sourceObject
-        for obj in self.energyml_objects:
-            obj_id = get_obj_identifier(obj)
-            if obj_id not in rels:
-                rels[obj_id] = []
-            for target_obj in get_direct_dor_list(obj):
-                rels[obj_id].append(Relationship(
-                    target=gen_energyml_object_path(target_obj, self.export_version),
-                    type_value=EPCRelsRelationshipType.SOURCE_OBJECT.get_type(),
-                    id=f"_{obj_id}_{get_obj_type(target_obj)}_{get_obj_identifier(target_obj)}",
-                ))
-
-        map_obj_id_to_obj = {
-            get_obj_identifier(obj): obj
-            for obj in self.energyml_objects
-        }
-
-        obj_rels = {
-            gen_rels_path(energyml_object=map_obj_id_to_obj.get(obj_id), export_version=self.export_version): Relationships(
-                relationship=obj_rels + (self.additional_rels[obj_id] if obj_id in self.additional_rels else []),
-
-            )
-            for obj_id, obj_rels in rels.items()
-        }
-
-        # CoreProps
-        if self.core_props is not None:
-            obj_rels[gen_rels_path(self.core_props)] = Relationships(
-                relationship=[
-                    Relationship(
-                        target=gen_core_props_path(),
-                        type_value=EPCRelsRelationshipType.EXTENDED_CORE_PROPERTIES.get_type(),
-                        id="CoreProperties"
-                    )
-                ]
-            )
-
-        return obj_rels
-
-    # -----------
-
-    def get_object_by_uuid(self, uuid: str) -> List[Any]:
-        """
-        Search all objects with the uuid :param:`uuid`.
-        :param uuid:
-        :return:
-        """
-        return list(filter(lambda o: get_obj_uuid(o) == uuid, self.energyml_objects))
-
-    def get_object_by_identifier(self, identifier: str) -> Optional[Any]:
-        """
-        Search an object by its identifier.
-        :param identifier: given by the function :func:`get_obj_identifier`
-        :return:
-        """
-        for o in self.energyml_objects:
-            if get_obj_identifier(o) == identifier:
-                return o
-        return None
-
-    def get_epc_file_folder(self) -> Optional[str]:
-        if self.epc_file_path is not None and len(self.epc_file_path) > 0:
-            folders_and_name = re.split(r"[\\/]", self.epc_file_path)
-            if len(folders_and_name) > 1:
-                return "/".join(folders_and_name[:-1])
-            else:
-                return ""
-        return None
-
-    # Class methods
-
-    @classmethod
-    def read_file(cls, epc_file_path: str):
-        with open(epc_file_path, "rb") as f:
-            epc = cls.read_stream(BytesIO(f.read()))
-            epc.epc_file_path = epc_file_path
-            return epc
-
-    @classmethod
-    def read_stream(cls, epc_file_io: BytesIO):  # returns an Epc instance
-        """
-        :param epc_file_io:
-        :return: an :class:`EPC` instance
-        """
-        try:
-            _read_files = []
-            obj_list = []
-            raw_file_list = []
-            additional_rels = {}
-            core_props = None
-            with zipfile.ZipFile(epc_file_io, "r", zipfile.ZIP_DEFLATED) as epc_file:
-                content_type_file_name = get_epc_content_type_path()
-                content_type_info = None
-                try:
-                    content_type_info = epc_file.getinfo(content_type_file_name)
-                except KeyError:
-                    for info in epc_file.infolist():
-                        if info.filename.lower() == content_type_file_name.lower():
-                            content_type_info = info
-                            break
-
-                _read_files.append(content_type_file_name)
-
-                if content_type_info is None:
-                    print(f"No {content_type_file_name} file found")
-                else:
-                    content_type_obj: Types = read_energyml_xml_bytes(epc_file.read(content_type_file_name))
-                    path_to_obj = {}
-                    for ov in content_type_obj.override:
-                        ov_ct = ov.content_type
-                        ov_path = ov.part_name
-                        # print(ov_ct)
-                        while ov_path.startswith("/") or ov_path.startswith("\\"):
-                            ov_path = ov_path[1:]
-                        if is_energyml_content_type(ov_ct):
-                            _read_files.append(ov_path)
-                            try:
-                                ov_obj = read_energyml_xml_bytes_as_class(
-                                    epc_file.read(ov_path),
-                                    get_class_from_content_type(ov_ct)
-                                )
-                                if isinstance(ov_obj, DerivedElement):
-                                    ov_obj = ov_obj.value
-                                path_to_obj[ov_path] = ov_obj
-                                obj_list.append(ov_obj)
-                            except ParserError as e:
-                                print(f"Epc.@read_stream failed to parse file {ov_path} for content-type: {ov_ct} => {get_class_from_content_type(ov_ct)}")
-                                raise e
-                        elif get_class_from_content_type(ov_ct) == CoreProperties:
-                            _read_files.append(ov_path)
-                            core_props = read_energyml_xml_bytes_as_class(epc_file.read(ov_path), CoreProperties)
-
-                    for f_info in epc_file.infolist():
-                        if f_info.filename not in _read_files:
-                            _read_files.append(f_info.filename)
-                            if not f_info.filename.lower().endswith(".rels"):
-                                try:
-                                    raw_file_list.append(
-                                        RawFile(
-                                            path=f_info.filename,
-                                            content=BytesIO(epc_file.read(f_info.filename)),
-                                        )
-                                    )
-                                except IOError as e:
-                                    print(e)
-                            else:  # rels
-                                # print(f"reading rels {f_info.filename}")
-                                rels_folder, rels_file_name = get_file_folder_and_name_from_path(f_info.filename)
-                                while rels_folder.endswith("/"):
-                                    rels_folder = rels_folder[:-1]
-                                obj_folder = rels_folder[:rels_folder.rindex("/") + 1] if "/" in rels_folder else ""
-                                obj_file_name = rels_file_name[:-5]  # removing the ".rels"
-                                rels_file: Relationships = read_energyml_xml_bytes_as_class(
-                                    epc_file.read(f_info.filename),
-                                    Relationships
-                                )
-                                obj_path = obj_folder + obj_file_name
-                                if obj_path in path_to_obj:
-                                    try:
-                                        additional_rels_key = get_obj_identifier(path_to_obj[obj_path])
-                                        for rel in rels_file.relationship:
-                                            # print(f"\t\t{rel.type_value}")
-                                            if (rel.type_value != EPCRelsRelationshipType.DESTINATION_OBJECT.get_type()
-                                                    and rel.type_value != EPCRelsRelationshipType.SOURCE_OBJECT.get_type()
-                                                    and rel.type_value != EPCRelsRelationshipType.EXTENDED_CORE_PROPERTIES.get_type()
-                                            ):  # not a computable relation
-                                                if additional_rels_key not in additional_rels:
-                                                    additional_rels[additional_rels_key] = []
-                                                additional_rels[additional_rels_key].append(rel)
-                                    except Exception as e:
-                                        print(f"Error with obj path {obj_path} {path_to_obj[obj_path]}")
-                                        raise e
-                                else:
-                                    print(f"xml file {obj_path} not found in EPC (rels is not associate to any object)")
-
-            return Epc(energyml_objects=obj_list,
-                       raw_files=raw_file_list,
-                       core_props=core_props,
-                       additional_rels=additional_rels
-                       )
-        except zipfile.BadZipFile as error:
-            print(error)
-
-        return None
-
-

Class variables

-
-
var additional_rels : Dict[str, List[energyml.opc.opc.Relationship]]
-
-

Epc file path. Used when loaded from a local file or for export

-
-
var core_props : energyml.opc.opc.CoreProperties
-
-

xml files refered in the [Content_Types].xml

-
-
var energyml_objects : List
-
-

Other files content like pdf etc

-
-
var epc_file_path : Optional[str]
-
-
-
-
var export_versionEpcExportVersion
-
-
-
-
var external_files_path : List[str]
-
-

Additional rels for objects. Key is the object (same than in @energyml_objects) and value is a list of -RelationShip. This can be used to link an HDF5 to an ExternalPartReference in resqml 2.0.1 -Key is a value returned by @get_obj_identifier

-
-
var raw_files : List[RawFile]
-
-

A list of external files. It ca be used to link hdf5 files

-
-
-

Static methods

-
-
-def read_file(epc_file_path: str) -
-
-
-
- -Expand source code - -
@classmethod
-def read_file(cls, epc_file_path: str):
-    with open(epc_file_path, "rb") as f:
-        epc = cls.read_stream(BytesIO(f.read()))
-        epc.epc_file_path = epc_file_path
-        return epc
-
-
-
-def read_stream(epc_file_io: _io.BytesIO) -
-
-

:param epc_file_io: -:return: an :class:EPC instance

-
- -Expand source code - -
@classmethod
-def read_stream(cls, epc_file_io: BytesIO):  # returns an Epc instance
-    """
-    :param epc_file_io:
-    :return: an :class:`EPC` instance
-    """
-    try:
-        _read_files = []
-        obj_list = []
-        raw_file_list = []
-        additional_rels = {}
-        core_props = None
-        with zipfile.ZipFile(epc_file_io, "r", zipfile.ZIP_DEFLATED) as epc_file:
-            content_type_file_name = get_epc_content_type_path()
-            content_type_info = None
-            try:
-                content_type_info = epc_file.getinfo(content_type_file_name)
-            except KeyError:
-                for info in epc_file.infolist():
-                    if info.filename.lower() == content_type_file_name.lower():
-                        content_type_info = info
-                        break
-
-            _read_files.append(content_type_file_name)
-
-            if content_type_info is None:
-                print(f"No {content_type_file_name} file found")
-            else:
-                content_type_obj: Types = read_energyml_xml_bytes(epc_file.read(content_type_file_name))
-                path_to_obj = {}
-                for ov in content_type_obj.override:
-                    ov_ct = ov.content_type
-                    ov_path = ov.part_name
-                    # print(ov_ct)
-                    while ov_path.startswith("/") or ov_path.startswith("\\"):
-                        ov_path = ov_path[1:]
-                    if is_energyml_content_type(ov_ct):
-                        _read_files.append(ov_path)
-                        try:
-                            ov_obj = read_energyml_xml_bytes_as_class(
-                                epc_file.read(ov_path),
-                                get_class_from_content_type(ov_ct)
-                            )
-                            if isinstance(ov_obj, DerivedElement):
-                                ov_obj = ov_obj.value
-                            path_to_obj[ov_path] = ov_obj
-                            obj_list.append(ov_obj)
-                        except ParserError as e:
-                            print(f"Epc.@read_stream failed to parse file {ov_path} for content-type: {ov_ct} => {get_class_from_content_type(ov_ct)}")
-                            raise e
-                    elif get_class_from_content_type(ov_ct) == CoreProperties:
-                        _read_files.append(ov_path)
-                        core_props = read_energyml_xml_bytes_as_class(epc_file.read(ov_path), CoreProperties)
-
-                for f_info in epc_file.infolist():
-                    if f_info.filename not in _read_files:
-                        _read_files.append(f_info.filename)
-                        if not f_info.filename.lower().endswith(".rels"):
-                            try:
-                                raw_file_list.append(
-                                    RawFile(
-                                        path=f_info.filename,
-                                        content=BytesIO(epc_file.read(f_info.filename)),
-                                    )
-                                )
-                            except IOError as e:
-                                print(e)
-                        else:  # rels
-                            # print(f"reading rels {f_info.filename}")
-                            rels_folder, rels_file_name = get_file_folder_and_name_from_path(f_info.filename)
-                            while rels_folder.endswith("/"):
-                                rels_folder = rels_folder[:-1]
-                            obj_folder = rels_folder[:rels_folder.rindex("/") + 1] if "/" in rels_folder else ""
-                            obj_file_name = rels_file_name[:-5]  # removing the ".rels"
-                            rels_file: Relationships = read_energyml_xml_bytes_as_class(
-                                epc_file.read(f_info.filename),
-                                Relationships
-                            )
-                            obj_path = obj_folder + obj_file_name
-                            if obj_path in path_to_obj:
-                                try:
-                                    additional_rels_key = get_obj_identifier(path_to_obj[obj_path])
-                                    for rel in rels_file.relationship:
-                                        # print(f"\t\t{rel.type_value}")
-                                        if (rel.type_value != EPCRelsRelationshipType.DESTINATION_OBJECT.get_type()
-                                                and rel.type_value != EPCRelsRelationshipType.SOURCE_OBJECT.get_type()
-                                                and rel.type_value != EPCRelsRelationshipType.EXTENDED_CORE_PROPERTIES.get_type()
-                                        ):  # not a computable relation
-                                            if additional_rels_key not in additional_rels:
-                                                additional_rels[additional_rels_key] = []
-                                            additional_rels[additional_rels_key].append(rel)
-                                except Exception as e:
-                                    print(f"Error with obj path {obj_path} {path_to_obj[obj_path]}")
-                                    raise e
-                            else:
-                                print(f"xml file {obj_path} not found in EPC (rels is not associate to any object)")
-
-        return Epc(energyml_objects=obj_list,
-                   raw_files=raw_file_list,
-                   core_props=core_props,
-                   additional_rels=additional_rels
-                   )
-    except zipfile.BadZipFile as error:
-        print(error)
-
-    return None
-
-
-
-

Methods

-
-
-def compute_rels(self) ‑> Dict[str, energyml.opc.opc.Relationships] -
-
-

Returns a dict containing for each objet, the rels xml file path as key and the RelationShips object as value -:return:

-
- -Expand source code - -
def compute_rels(self) -> Dict[str, Relationships]:
-    """
-    Returns a dict containing for each objet, the rels xml file path as key and the RelationShips object as value
-    :return:
-    """
-    dor_relation = get_reverse_dor_list(self.energyml_objects)
-
-    # destObject
-    rels = {
-        obj_id: [
-            Relationship(
-                target=gen_energyml_object_path(target_obj, self.export_version),
-                type_value=EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(),
-                id=f"_{obj_id}_{get_obj_type(target_obj)}_{get_obj_identifier(target_obj)}",
-            ) for target_obj in target_obj_list
-        ]
-        for obj_id, target_obj_list in dor_relation.items()
-    }
-    # sourceObject
-    for obj in self.energyml_objects:
-        obj_id = get_obj_identifier(obj)
-        if obj_id not in rels:
-            rels[obj_id] = []
-        for target_obj in get_direct_dor_list(obj):
-            rels[obj_id].append(Relationship(
-                target=gen_energyml_object_path(target_obj, self.export_version),
-                type_value=EPCRelsRelationshipType.SOURCE_OBJECT.get_type(),
-                id=f"_{obj_id}_{get_obj_type(target_obj)}_{get_obj_identifier(target_obj)}",
-            ))
-
-    map_obj_id_to_obj = {
-        get_obj_identifier(obj): obj
-        for obj in self.energyml_objects
-    }
-
-    obj_rels = {
-        gen_rels_path(energyml_object=map_obj_id_to_obj.get(obj_id), export_version=self.export_version): Relationships(
-            relationship=obj_rels + (self.additional_rels[obj_id] if obj_id in self.additional_rels else []),
-
-        )
-        for obj_id, obj_rels in rels.items()
-    }
-
-    # CoreProps
-    if self.core_props is not None:
-        obj_rels[gen_rels_path(self.core_props)] = Relationships(
-            relationship=[
-                Relationship(
-                    target=gen_core_props_path(),
-                    type_value=EPCRelsRelationshipType.EXTENDED_CORE_PROPERTIES.get_type(),
-                    id="CoreProperties"
-                )
-            ]
-        )
-
-    return obj_rels
-
-
-
-def export_file(self, path: Optional[str] = None) ‑> None -
-
-

Export the epc file. If :param:path is None, the epc 'self.epc_file_path' is used -:param path: -:return:

-
- -Expand source code - -
def export_file(self, path: Optional[str] = None) -> None:
-    """
-    Export the epc file. If :param:`path` is None, the epc 'self.epc_file_path' is used
-    :param path:
-    :return:
-    """
-    if path is None:
-        path = self.epc_file_path
-    epc_io = self.export_io()
-    with open(path, "wb") as f:
-        f.write(epc_io.getbuffer())
-
-
-
-def export_io(self) ‑> _io.BytesIO -
-
-

Export the epc file into a :class:BytesIO instance. The result is an 'in-memory' zip file. -:return:

-
- -Expand source code - -
def export_io(self) -> BytesIO:
-    """
-    Export the epc file into a :class:`BytesIO` instance. The result is an 'in-memory' zip file.
-    :return:
-    """
-    zip_buffer = BytesIO()
-
-    with zipfile.ZipFile(zip_buffer, "a", zipfile.ZIP_DEFLATED, False) as zip_file:
-        #  Energyml objects
-        for e_obj in self.energyml_objects:
-            e_path = gen_energyml_object_path(e_obj, self.export_version)
-            zip_info = zipfile.ZipInfo(filename=e_path, date_time=datetime.datetime.now().timetuple()[:6])
-            data = serialize_xml(e_obj)
-            zip_file.writestr(zip_info, data)
-
-        # Rels
-        for rels_path, rels in self.compute_rels().items():
-            zip_info = zipfile.ZipInfo(filename=rels_path, date_time=datetime.datetime.now().timetuple()[:6])
-            data = serialize_xml(rels)
-            zip_file.writestr(zip_info, data)
-
-        # CoreProps
-        if self.core_props is not None:
-            zip_info = zipfile.ZipInfo(filename=gen_core_props_path(self.export_version),
-                                       date_time=datetime.datetime.now().timetuple()[:6])
-            data = serialize_xml(self.core_props)
-            zip_file.writestr(zip_info, data)
-
-        # ContentType
-        zip_info = zipfile.ZipInfo(filename=get_epc_content_type_path(),
-                                   date_time=datetime.datetime.now().timetuple()[:6])
-        data = serialize_xml(self.gen_opc_content_type())
-        zip_file.writestr(zip_info, data)
-
-    return zip_buffer
-
-
-
-def gen_opc_content_type(self) ‑> energyml.opc.opc.Types -
-
-

Generates a :class:Types instance and fill it with energyml objects :class:Override values -:return:

-
- -Expand source code - -
def gen_opc_content_type(self) -> Types:
-    """
-    Generates a :class:`Types` instance and fill it with energyml objects :class:`Override` values
-    :return:
-    """
-    ct = Types()
-    rels_default = Default()
-    rels_default.content_type = RELS_CONTENT_TYPE
-    rels_default.extension = "rels"
-
-    ct.default = [rels_default]
-
-    ct.override = []
-    for e_obj in self.energyml_objects:
-        ct.override.append(Override(
-            content_type=get_content_type_from_class(type(e_obj)),
-            part_name=gen_energyml_object_path(e_obj, self.export_version),
-        ))
-
-    if self.core_props is not None:
-        ct.override.append(Override(
-            content_type=get_content_type_from_class(self.core_props),
-            part_name=gen_core_props_path(self.export_version),
-        ))
-
-    return ct
-
-
-
-def get_epc_file_folder(self) ‑> Optional[str] -
-
-
-
- -Expand source code - -
def get_epc_file_folder(self) -> Optional[str]:
-    if self.epc_file_path is not None and len(self.epc_file_path) > 0:
-        folders_and_name = re.split(r"[\\/]", self.epc_file_path)
-        if len(folders_and_name) > 1:
-            return "/".join(folders_and_name[:-1])
-        else:
-            return ""
-    return None
-
-
-
-def get_object_by_identifier(self, identifier: str) ‑> Optional[Any] -
-
-

Search an object by its identifier. -:param identifier: given by the function :func:get_obj_identifier() -:return:

-
- -Expand source code - -
def get_object_by_identifier(self, identifier: str) -> Optional[Any]:
-    """
-    Search an object by its identifier.
-    :param identifier: given by the function :func:`get_obj_identifier`
-    :return:
-    """
-    for o in self.energyml_objects:
-        if get_obj_identifier(o) == identifier:
-            return o
-    return None
-
-
-
-def get_object_by_uuid(self, uuid: str) ‑> List[Any] -
-
-

Search all objects with the uuid :param:uuid. -:param uuid: -:return:

-
- -Expand source code - -
def get_object_by_uuid(self, uuid: str) -> List[Any]:
-    """
-    Search all objects with the uuid :param:`uuid`.
-    :param uuid:
-    :return:
-    """
-    return list(filter(lambda o: get_obj_uuid(o) == uuid, self.energyml_objects))
-
-
-
-
-
-class EpcExportVersion -(*args, **kwds) -
-
-

EPC export version.

-
- -Expand source code - -
class EpcExportVersion(Enum):
-    """EPC export version."""
-    #: Classical export
-    CLASSIC = 1
-    #: Export with objet path sorted by package (eml/resqml/witsml/prodml)
-    EXPANDED = 2
-
-

Ancestors

-
    -
  • enum.Enum
  • -
-

Class variables

-
-
var CLASSIC
-
-

Classical export

-
-
var EXPANDED
-
-

Export with objet path sorted by package (eml/resqml/witsml/prodml)

-
-
-
-
-class NoCrsException -(*args, **kwargs) -
-
-

Common base class for all non-exit exceptions.

-
- -Expand source code - -
class NoCrsException(Exception):
-    pass
-
-

Ancestors

-
    -
  • builtins.Exception
  • -
  • builtins.BaseException
  • -
-
-
-class ObjectNotFoundNotException -(obj_id: str = None) -
-
-

ObjectNotFoundNotException(obj_id: str = None)

-
- -Expand source code - -
@dataclass
-class ObjectNotFoundNotException(Exception):
-    obj_id: str = field(
-        default=None
-    )
-
-

Ancestors

-
    -
  • builtins.Exception
  • -
  • builtins.BaseException
  • -
-

Class variables

-
-
var obj_id : str
-
-
-
-
-
-
-class RawFile -(path: str = '_', content: _io.BytesIO = None) -
-
-

RawFile(path: str = '_', content: _io.BytesIO = None)

-
- -Expand source code - -
@dataclass
-class RawFile:
-    path: str = field(default="_")
-    content: BytesIO = field(default=None)
-
-

Class variables

-
-
var content : _io.BytesIO
-
-
-
-
var path : str
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/energyml-utils/docs/src/energyml/utils/index.html b/energyml-utils/docs/src/energyml/utils/index.html deleted file mode 100644 index b18f86d..0000000 --- a/energyml-utils/docs/src/energyml/utils/index.html +++ /dev/null @@ -1,140 +0,0 @@ - - - - - - -src.energyml.utils API documentation - - - - - - - - - - - -
-
-
-

Module src.energyml.utils

-
-
-

The energyml.utils module. -It contains tools for energyml management.

-

Please check the following module (depending on your need): -- energyml-opc -- energyml-common2-0 -- energyml-common2-1 -- energyml-common2-2 -- energyml-common2-3 -- energyml-resqml2-0-1 -- energyml-resqml2-2-dev3 -- energyml-resqml2-2 -- energyml-witsml2-0 -- energyml-witsml2-1 -- energyml-prodml2-0 -- energyml-prodml2-2

-
- -Expand source code - -
# Copyright (c) 2023-2024 Geosiris.
-# SPDX-License-Identifier: Apache-2.0
-
-"""
-The energyml.utils module.
-It contains tools for energyml management.
-
-Please check the following module (depending on your need):
-    - energyml-opc
-    - energyml-common2-0
-    - energyml-common2-1
-    - energyml-common2-2
-    - energyml-common2-3
-    - energyml-resqml2-0-1
-    - energyml-resqml2-2-dev3
-    - energyml-resqml2-2
-    - energyml-witsml2-0
-    - energyml-witsml2-1
-    - energyml-prodml2-0
-    - energyml-prodml2-2
-"""
-
-
-
-

Sub-modules

-
-
src.energyml.utils.data
-
-

The data module …

-
-
src.energyml.utils.epc
-
-

This example module shows various types of documentation available for use -with pydoc. -To generate HTML documentation for this module issue the -…

-
-
src.energyml.utils.introspection
-
-
-
-
src.energyml.utils.manager
-
-
-
-
src.energyml.utils.serialization
-
-
-
-
src.energyml.utils.validation
-
-
-
-
src.energyml.utils.xml
-
-
-
-
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/energyml-utils/docs/src/energyml/utils/introspection.html b/energyml-utils/docs/src/energyml/utils/introspection.html deleted file mode 100644 index 23716e2..0000000 --- a/energyml-utils/docs/src/energyml/utils/introspection.html +++ /dev/null @@ -1,2141 +0,0 @@ - - - - - - -src.energyml.utils.introspection API documentation - - - - - - - - - - - -
-
-
-

Module src.energyml.utils.introspection

-
-
-
- -Expand source code - -
# Copyright (c) 2023-2024 Geosiris.
-# SPDX-License-Identifier: Apache-2.0
-import datetime
-import random
-import re
-import sys
-import typing
-import uuid as uuid_mod
-from dataclasses import Field
-from enum import Enum
-from importlib import import_module
-from typing import Any, List, Optional, Union, Dict, Tuple
-
-from .manager import get_class_pkg, get_class_pkg_version, RELATED_MODULES, \
-    get_related_energyml_modules_name, get_sub_classes, get_classes_matching_name
-from .xml import parse_content_type, ENERGYML_NAMESPACES
-
-
-primitives = (bool, str, int, float, type(None))
-
-
-def is_enum(cls: Union[type, Any]):
-    """
-    Returns True if :param:`cls` is an Enum
-    :param cls:
-    :return:
-    """
-    if isinstance(cls, type):
-        return Enum in cls.__bases__
-    return is_enum(type(cls))
-
-
-def is_primitive(cls: Union[type, Any]) -> bool:
-    """
-    Returns True if :param:`cls` is a primitiv type or extends Enum
-    :param cls:
-    :return: bool
-    """
-    if isinstance(cls, type):
-        return cls in primitives or Enum in cls.__bases__
-    return is_primitive(type(cls))
-
-
-def is_abstract(cls: Union[type, Any]) -> bool:
-    """
-    Returns True if :param:`cls` is an abstract class
-    :param cls:
-    :return: bool
-    """
-    if isinstance(cls, type):
-        return not is_primitive(cls) and (cls.__name__.startswith("Abstract") or (hasattr(cls, "__dataclass_fields__") and len(cls.__dataclass_fields__)) == 0) and len(get_class_methods(cls)) == 0
-    return is_abstract(type(cls))
-
-
-def get_class_methods(cls: Union[type, Any]) -> List[str]:
-    """
-    Returns the list of the methods names for a specific class.
-    :param cls:
-    :return:
-    """
-    return [func for func in dir(cls) if callable(getattr(cls, func)) and not func.startswith("__") and not isinstance(getattr(cls, func), type)]
-
-
-def get_class_from_name(class_name_and_module: str) -> Optional[type]:
-    """
-    Return a :class:`type` object matching with the name :param:`class_name_and_module`.
-    :param class_name_and_module:
-    :return:
-    """
-    module_name = class_name_and_module[: class_name_and_module.rindex(".")]
-    last_ns_part = class_name_and_module[
-                   class_name_and_module.rindex(".") + 1:
-                   ]
-    try:
-        # Required to read "CustomData" on eml objects that may contain resqml values
-        # ==> we need to import all modules related to the same version of the common
-        import_related_module(module_name)
-        return getattr(sys.modules[module_name], last_ns_part)
-    except AttributeError as e:
-        if "2d" in last_ns_part:
-            return get_class_from_name(
-                class_name_and_module.replace("2d", "2D")
-            )
-        elif "3d" in last_ns_part:
-            return get_class_from_name(
-                class_name_and_module.replace("3d", "3D")
-            )
-        elif last_ns_part[0].islower():
-            return get_class_from_name(
-                module_name + "." + last_ns_part[0].upper() + last_ns_part[1:]
-            )
-        else:
-            print(e)
-    return None
-
-
-def get_class_from_content_type(content_type: str) -> Optional[type]:
-    """
-    Return a :class:`type` object matching with the content-type :param:`content_type`.
-    :param content_type:
-    :return:
-    """
-    ct = parse_content_type(content_type)
-    domain = ct.group("domain")
-    if domain is None:
-        domain = "opc"
-    if domain == "opc":
-        xml_domain = ct.group("xmlDomain")
-        if "." in xml_domain:
-            xml_domain = xml_domain[xml_domain.rindex(".") + 1:]
-        if "extended" in xml_domain:
-            xml_domain = xml_domain.replace("extended", "")
-        opc_type = pascal_case(xml_domain)
-        # print("energyml.opc.opc." + opc_type)
-        return get_class_from_name("energyml.opc.opc." + opc_type)
-    else:
-        ns = ENERGYML_NAMESPACES[domain]
-        domain = ct.group("domain")
-        obj_type = ct.group("type")
-        if obj_type.lower().startswith("obj_"):  # for resqml201
-            obj_type = "Obj" + obj_type[4:]
-        version_num = str(ct.group("domainVersion")).replace(".", "_")
-        if domain.lower() == "resqml" and version_num.startswith("2_0"):
-            version_num = "2_0_1"
-        return get_class_from_name(
-            "energyml."
-            + domain
-            + ".v"
-            + version_num
-            + "."
-            + ns[ns.rindex("/") + 1:]
-            + "."
-            + obj_type
-        )
-
-
-def snake_case(s: str) -> str:
-    """ Transform a str into snake case. """
-    s = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', s)
-    s = re.sub('__([A-Z])', r'_\1', s)
-    s = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s)
-    return s.lower()
-
-
-def pascal_case(s: str) -> str:
-    """ Transform a str into pascal case. """
-    return snake_case(s).replace("_", " ").title().replace(" ", "")
-
-
-def flatten_concatenation(matrix) -> List:
-    """
-    Flatten a matrix.
-
-    Example :
-        [ [a,b,c], [d,e,f], [ [x,y,z], [0] ] ]
-        will be translated in: [a, b, c, d, e, f, [x,y,z], [0]]
-    :param matrix:
-    :return:
-    """
-    flat_list = []
-    for row in matrix:
-        flat_list += row
-    return flat_list
-
-
-def import_related_module(energyml_module_name: str) -> None:
-    """
-    Import related modules for a specific energyml module. (See. :const:`RELATED_MODULES`)
-    :param energyml_module_name:
-    :return:
-    """
-    for related in RELATED_MODULES:
-        if energyml_module_name in related:
-            for m in related:
-                try:
-                    import_module(m)
-                except Exception as e:
-                    pass
-                    # print(e)
-
-
-def get_class_fields(cls: Union[type, Any]) -> Dict[str, Field]:
-    """
-    Return all class fields names, mapped to their :class:`Field` value.
-    :param cls:
-    :return:
-    """
-    if not isinstance(cls, type):  # if cls is an instance
-        cls = type(cls)
-    try:
-        return cls.__dataclass_fields__
-    except AttributeError:
-        return {}
-
-
-def get_class_attributes(cls: Union[type, Any]) -> List[str]:
-    """
-    returns a list of attributes (not private ones)
-    """
-    # if not isinstance(cls, type):  # if cls is an instance
-    #     cls = type(cls)
-    # return list(filter(lambda a: not a.startswith("__"), dir(cls)))
-    return list(get_class_fields(cls).keys())
-
-
-def get_matching_class_attribute_name(
-        cls: Union[type, Any], attribute_name: str, re_flags=re.IGNORECASE,
-) -> Optional[str]:
-    """
-    From an object and an attribute name, returns the correct attribute name of the class.
-    Example : "ObjectVersion" --> object_version.
-    This method doesn't only transform to snake case but search into the obj class attributes
-    """
-    class_fields = get_class_fields(cls)
-
-    # a search with the exact value
-    for name, cf in class_fields.items():
-        if (
-                snake_case(name) == snake_case(attribute_name)
-                or ('name' in cf.metadata and cf.metadata['name'] == attribute_name)
-        ):
-            return name
-
-    # search regex after to avoid shadowing perfect match
-    pattern = re.compile(attribute_name, flags=re_flags)
-    for name, cf in class_fields.items():
-        # print(f"\t->{name} : {attribute_name} {pattern.match(name)} {('name' in cf.metadata and pattern.match(cf.metadata['name']))}")
-        if pattern.match(name) or ('name' in cf.metadata and pattern.match(cf.metadata['name'])):
-            return name
-
-    return None
-
-
-def get_object_attribute(
-        obj: Any, attr_dot_path: str, force_snake_case=True
-) -> Any:
-    """
-    returns the value of an attribute given by a dot representation of its path in the object
-    example "Citation.Title"
-    """
-    while attr_dot_path.startswith("."):  # avoid '.Citation.Title' to take an empty attribute name before the first '.'
-        attr_dot_path = attr_dot_path[1:]
-
-    current_attrib_name = attr_dot_path
-
-    if "." in attr_dot_path:
-        current_attrib_name = attr_dot_path.split(".")[0]
-
-    if force_snake_case:
-        current_attrib_name = snake_case(current_attrib_name)
-
-    value = None
-    if isinstance(obj, list):
-        value = obj[int(current_attrib_name)]
-    elif isinstance(obj, dict):
-        value = obj[current_attrib_name]
-    else:
-        value = getattr(obj, current_attrib_name)
-
-    if "." in attr_dot_path:
-        return get_object_attribute(
-            value, attr_dot_path[len(current_attrib_name) + 1:]
-        )
-    else:
-        return value
-
-
-def get_object_attribute_advanced(obj: Any, attr_dot_path: str) -> Any:
-    """
-    see @get_matching_class_attribute_name and @get_object_attribute
-    """
-    current_attrib_name = attr_dot_path
-
-    if "." in attr_dot_path:
-        current_attrib_name = attr_dot_path.split(".")[0]
-
-    current_attrib_name = get_matching_class_attribute_name(
-        obj, current_attrib_name
-    )
-
-    value = None
-    if isinstance(obj, list):
-        value = obj[int(current_attrib_name)]
-    elif isinstance(obj, dict):
-        value = obj[current_attrib_name]
-    else:
-        value = getattr(obj, current_attrib_name)
-
-    if "." in attr_dot_path:
-        return get_object_attribute_advanced(
-            value, attr_dot_path[len(current_attrib_name) + 1:]
-        )
-    else:
-        return value
-
-
-def get_object_attribute_no_verif(obj: Any, attr_name: str) -> Any:
-    """
-    Return the value of the attribute named after param :param:`attr_name` without verification (may raise an exception
-    if it doesn't exists).
-
-    Note: attr_name="0" will work if :param:`obj` is of type :class:`List`
-    :param obj:
-    :param attr_name:
-    :return:
-    """
-    if isinstance(obj, list):
-        return obj[int(attr_name)]
-    elif isinstance(obj, dict):
-        return obj[attr_name]
-    else:
-        return getattr(obj, attr_name)
-
-
-def get_object_attribute_rgx(obj: Any, attr_dot_path_rgx: str) -> Any:
-    """
-    see @get_object_attribute. Search the attribute name using regex for values between dots.
-    Example : [Cc]itation.[Tt]it\\.*
-    """
-    current_attrib_name = attr_dot_path_rgx
-
-    attrib_list = re.split(r"(?<!\\)\.+", attr_dot_path_rgx)
-
-    if len(attrib_list) > 0:
-        current_attrib_name = attrib_list[0]
-
-    # unescape Dot
-    current_attrib_name = current_attrib_name.replace("\\.", ".")
-
-    real_attrib_name = get_matching_class_attribute_name(
-        obj, current_attrib_name
-    )
-    if real_attrib_name is not None:
-        value = get_object_attribute_no_verif(obj, real_attrib_name)
-
-        if len(attrib_list) > 1:
-            return get_object_attribute_rgx(
-                value, attr_dot_path_rgx[len(current_attrib_name) + 1:]
-            )
-        else:
-            return value
-    return None
-
-
-def get_obj_type(obj: Any) -> str:
-    """ Return the type name of an object. If obj is already a :class:`type`, return its __name__"""
-    if isinstance(obj, type):
-        return str(obj.__name__)
-    return get_obj_type(type(obj))
-
-
-def class_match_rgx(
-        cls: Union[type, Any],
-        rgx: str,
-        super_class_search: bool = True,
-        re_flags=re.IGNORECASE,
-):
-    if not isinstance(cls, type):
-        cls = type(cls)
-
-    if re.match(rgx, cls.__name__, re_flags):
-        return True
-
-    if not is_primitive(cls) and super_class_search:
-        for base in cls.__bases__:
-            if class_match_rgx(base, rgx, super_class_search, re_flags):
-                return True
-    return False
-
-
-def search_attribute_matching_type_with_path(
-        obj: Any,
-        type_rgx: str,
-        re_flags=re.IGNORECASE,
-        return_self: bool = True,  # test directly on input object and not only in its attributes
-        deep_search: bool = True,  # Search inside a matching object
-        super_class_search: bool = True,  # Search inside in super classes of the object
-        current_path: str = "",
-) -> List[Tuple[str, Any]]:
-    """
-    Returns a list of tuple (path, value) for each sub attribute with type matching param "type_rgx".
-    The path is a dot-version like ".Citation.Title"
-    :param obj:
-    :param type_rgx:
-    :param re_flags:
-    :param return_self:
-    :param deep_search:
-    :param super_class_search:
-    :param current_path:
-    :return:
-    """
-    res = []
-    if obj is not None:
-        if return_self and class_match_rgx(
-                obj, type_rgx, super_class_search, re_flags
-        ):
-            res.append((current_path, obj))
-            if not deep_search:
-                return res
-
-    if isinstance(obj, list):
-        cpt = 0
-        for s_o in obj:
-            res = res + search_attribute_matching_type_with_path(
-                obj=s_o,
-                type_rgx=type_rgx,
-                re_flags=re_flags,
-                return_self=True,
-                deep_search=deep_search,
-                current_path=f"{current_path}.{cpt}",
-                super_class_search=super_class_search,
-            )
-            cpt = cpt + 1
-    elif isinstance(obj, dict):
-        for k, s_o in obj.items():
-            res = res + search_attribute_matching_type_with_path(
-                obj=s_o,
-                type_rgx=type_rgx,
-                re_flags=re_flags,
-                return_self=True,
-                deep_search=deep_search,
-                current_path=f"{current_path}.{k}",
-                super_class_search=super_class_search,
-            )
-    elif not is_primitive(obj):
-        for att_name in get_class_attributes(obj):
-            res = res + search_attribute_matching_type_with_path(
-                obj=get_object_attribute_rgx(obj, att_name),
-                type_rgx=type_rgx,
-                re_flags=re_flags,
-                return_self=True,
-                deep_search=deep_search,
-                current_path=f"{current_path}.{att_name}",
-                super_class_search=super_class_search,
-            )
-
-    return res
-
-
-def search_attribute_in_upper_matching_name(
-        obj: Any,
-        name_rgx: str,
-        root_obj: Optional[Any] = None,
-        re_flags=re.IGNORECASE,
-        current_path: str = "",
-) -> Optional[Any]:
-    """
-    See :func:`search_attribute_matching_type_with_path`. It only returns the value not the path
-    :param obj:
-    :param name_rgx:
-    :param root_obj:
-    :param re_flags:
-    :param current_path:
-    :return:
-    """
-    elt_list = search_attribute_matching_name(obj, name_rgx, search_in_sub_obj=False, deep_search=False)
-    if elt_list is not None and len(elt_list) > 0:
-        return elt_list
-
-    if obj != root_obj:
-        upper_path = current_path[:current_path.rindex(".")]
-        if len(upper_path) > 0:
-            return search_attribute_in_upper_matching_name(
-                obj=get_object_attribute(root_obj, upper_path),
-                name_rgx=name_rgx,
-                root_obj=root_obj,
-                re_flags=re_flags,
-                current_path=upper_path,
-            )
-
-    return None
-
-
-def search_attribute_matching_type(
-        obj: Any,
-        type_rgx: str,
-        re_flags=re.IGNORECASE,
-        return_self: bool = True,  # test directly on input object and not only in its attributes
-        deep_search: bool = True,  # Search inside a matching object
-        super_class_search: bool = True,  # Search inside in super classes of the object
-) -> List[Any]:
-    """
-    See :func:`search_attribute_matching_type_with_path`. It only returns the value not the path
-    :param obj:
-    :param type_rgx:
-    :param re_flags:
-    :param return_self:
-    :param deep_search:
-    :param super_class_search:
-    :return:
-    """
-    return [
-        val
-        for path, val in search_attribute_matching_type_with_path(
-            obj=obj,
-            type_rgx=type_rgx,
-            re_flags=re_flags,
-            return_self=return_self,
-            deep_search=deep_search,
-            super_class_search=super_class_search,
-        )
-    ]
-
-
-def search_attribute_matching_name_with_path(
-        obj: Any,
-        name_rgx: str,
-        re_flags=re.IGNORECASE,
-        current_path: str = "",
-        deep_search: bool = True,  # Search inside a matching object
-        search_in_sub_obj: bool = True,  # Search in obj attributes
-) -> List[Tuple[str, Any]]:
-    """
-    Returns a list of tuple (path, value) for each sub attribute with type matching param "name_rgx".
-    The path is a dot-version like ".Citation.Title"
-    :param obj:
-    :param name_rgx:
-    :param re_flags:
-    :param current_path:
-    :param deep_search:
-    :param search_in_sub_obj:
-    :return:
-    """
-    while name_rgx.startswith("."):
-        name_rgx = name_rgx[1:]
-    current_match = name_rgx
-    next_match = current_match
-    if '.' in current_match:
-        attrib_list = re.split(r"(?<!\\)\.+", name_rgx)
-        current_match = attrib_list[0]
-        next_match = '.'.join(attrib_list[1:])
-
-    res = []
-
-    match_value = None
-    match_path_and_obj = []
-    not_match_path_and_obj = []
-    if isinstance(obj, list):
-        cpt = 0
-        for s_o in obj:
-            match = re.match(current_match.replace("\\.", "."), str(cpt), flags=re_flags)
-            if match is not None:
-                match_value = match.group(0)
-                match_path_and_obj.append( (f"{current_path}.{cpt}", s_o) )
-            else:
-                not_match_path_and_obj.append( (f"{current_path}.{cpt}", s_o) )
-            cpt = cpt + 1
-    elif isinstance(obj, dict):
-        for k, s_o in obj.items():
-            match = re.match(current_match.replace("\\.", "."), k, flags=re_flags)
-            if match is not None:
-                match_value = match.group(0)
-                match_path_and_obj.append( (f"{current_path}.{k}", s_o) )
-            else:
-                not_match_path_and_obj.append( (f"{current_path}.{k}", s_o) )
-    elif not is_primitive(obj):
-        match_value = get_matching_class_attribute_name(obj, current_match.replace("\\.", "."))
-        if match_value is not None:
-            match_path_and_obj.append( (f"{current_path}.{match_value}", get_object_attribute_no_verif(obj, match_value)) )
-        for att_name in get_class_attributes(obj):
-            if att_name != match_value:
-                not_match_path_and_obj.append( (f"{current_path}.{att_name}", get_object_attribute_no_verif(obj, att_name)) )
-
-    for matched_path, matched in match_path_and_obj:
-        if next_match != current_match and len(next_match) > 0:  # next_match is different, match is not final
-            res = res + search_attribute_matching_name_with_path(
-                obj=matched,
-                name_rgx=next_match,
-                re_flags=re_flags,
-                current_path=matched_path,
-                deep_search=False,  # no deep with partial
-                search_in_sub_obj=False,  # no partial search in sub obj with no match
-            )
-        else:  # a complete match
-            res.append( (matched_path, matched) )
-            if deep_search:
-                res = res + search_attribute_matching_name_with_path(
-                    obj=matched,
-                    name_rgx=name_rgx,
-                    re_flags=re_flags,
-                    current_path=matched_path,
-                    deep_search=deep_search,  # no deep with partial
-                    search_in_sub_obj=True,
-                )
-    if search_in_sub_obj:
-        for not_matched_path, not_matched in not_match_path_and_obj:
-            res = res + search_attribute_matching_name_with_path(
-                obj=not_matched,
-                name_rgx=name_rgx,
-                re_flags=re_flags,
-                current_path=not_matched_path,
-                deep_search=deep_search,
-                search_in_sub_obj=True,
-            )
-
-    return res
-
-
-def search_attribute_matching_name(
-        obj: Any,
-        name_rgx: str,
-        re_flags=re.IGNORECASE,
-        deep_search: bool = True,  # Search inside a matching object
-        search_in_sub_obj: bool = True,  # Search in obj attributes
-) -> List[Any]:
-    """
-    See :func:`search_attribute_matching_name_with_path`. It only returns the value not the path
-
-    :param obj:
-    :param name_rgx:
-    :param re_flags:
-    :param deep_search:
-    :param search_in_sub_obj:
-    :return:
-    """
-    return [
-        val
-        for path, val in search_attribute_matching_name_with_path(
-            obj=obj,
-            name_rgx=name_rgx,
-            re_flags=re_flags,
-            deep_search=deep_search,
-            search_in_sub_obj=search_in_sub_obj
-        )
-    ]
-
-
-# Utility functions
-
-
-def gen_uuid() -> str:
-    """
-    Generate a new uuid.
-    :return:
-    """
-    return str(uuid_mod.uuid4())
-
-
-def get_obj_uuid(obj: Any) -> str:
-    """
-    Return the object uuid (attribute must match the following regex : "[Uu]u?id|UUID").
-    :param obj:
-    :return:
-    """
-    return get_object_attribute_rgx(obj, "[Uu]u?id|UUID")
-
-
-def get_obj_version(obj: Any) -> str:
-    """
-    Return the object version (check for "object_version" or "version_string" attribute).
-    :param obj:
-    :return:
-    """
-    try:
-        return get_object_attribute_no_verif(obj, "object_version")
-    except AttributeError as e:
-        try:
-            return get_object_attribute_no_verif(obj, "version_string")
-        except Exception:
-            print(f"Error with {type(obj)}")
-            raise e
-
-
-def get_direct_dor_list(obj: Any) -> List[Any]:
-    """
-    Search all sub attribute of type "DataObjectreference".
-    :param obj:
-    :return:
-    """
-    return search_attribute_matching_type(obj, "DataObjectreference")
-
-
-def get_data_object_type(cls: Union[type, Any], print_dev_version=True, nb_max_version_digits=2):
-    return get_class_pkg(cls) + "." + get_class_pkg_version(cls, print_dev_version, nb_max_version_digits)
-
-
-def get_qualified_type_from_class(cls: Union[type, Any], print_dev_version=True):
-    return (
-            get_data_object_type(cls, print_dev_version, 2)
-            .replace(".", "") + "." + get_object_type_for_file_path_from_class(cls)
-    )
-
-
-def get_content_type_from_class(cls: Union[type, Any], print_dev_version=True, nb_max_version_digits=2):
-    if not isinstance(cls, type):
-        cls = type(cls)
-
-    if ".opc." in cls.__module__:
-        if cls.__name__.lower() == "coreproperties":
-            return "application/vnd.openxmlformats-package.core-properties+xml"
-    else:
-        return ("application/x-" + get_class_pkg(cls)
-                + "+xml;version=" + get_class_pkg_version(cls, print_dev_version, nb_max_version_digits) + ";type="
-                + get_object_type_for_file_path_from_class(cls))
-
-    print(f"@get_content_type_from_class not supported type : {cls}")
-    return None
-
-
-def get_object_type_for_file_path_from_class(cls) -> str:
-    # obj_type = get_obj_type(cls)
-    # pkg = get_class_pkg(cls)
-    # if re.match(r"Obj[A-Z].*", obj_type) is not None and pkg == "resqml":
-    #     return "obj_" + obj_type[3:]
-    # return obj_type
-
-    try:
-        return cls.Meta.name  # to work with 3d transformed in 3D and Obj[A-Z] in obj_[A-Z]
-    except AttributeError:
-        pkg = get_class_pkg(cls)
-        return get_obj_type(cls)
-
-
-def now(time_zone=datetime.timezone(datetime.timedelta(hours=1), "UTC")) -> int:
-    """ Return an epoch value """
-    return int(datetime.datetime.timestamp(datetime.datetime.now(time_zone)))
-
-
-def epoch(time_zone=datetime.timezone(datetime.timedelta(hours=1), "UTC")) -> int:
-    return int(now(time_zone))
-
-
-def date_to_epoch(date: str) -> int:
-    """
-    Transform a energyml date into an epoch datetime
-    :return: int
-    """
-    return int(datetime.datetime.fromisoformat(date).timestamp())
-
-
-def epoch_to_date(epoch_value: int, time_zone=datetime.timezone(datetime.timedelta(hours=1), "UTC")) -> str:
-    date = datetime.datetime.fromtimestamp(epoch_value / 1e3, time_zone)
-    return date.strftime("%Y-%m-%dT%H:%M:%S%z")
-
-
-#  RANDOM
-
-
-def get_class_from_simple_name(simple_name: str, energyml_module_context=None) -> type:
-    """
-    Search for a :class:`type` depending on the simple class name :param:`simple_name`.
-    :param simple_name:
-    :param energyml_module_context:
-    :return:
-    """
-    if energyml_module_context is None:
-        energyml_module_context = []
-    try:
-        return eval(simple_name)
-    except NameError as e:
-        for mod in energyml_module_context:
-            try:
-                exec(f"from {mod} import *")
-                # required to be able to access to type in
-                # typing values like "List[ObjectAlias]"
-            except ModuleNotFoundError:
-                pass
-        return eval(simple_name)
-
-
-def _gen_str_from_attribute_name(attribute_name: Optional[str], _parent_class: Optional[type]=None) -> str:
-    """
-    Generate a str from the attribute name. The result is not the same for an attribute named "Uuid" than for an
-    attribute named "mime_type" for example.
-    :param attribute_name:
-    :param _parent_class:
-    :return:
-    """
-    attribute_name_lw = attribute_name.lower()
-    if attribute_name is not None:
-        if attribute_name_lw == "uuid" or attribute_name_lw == "uid":
-            return gen_uuid()
-        elif attribute_name_lw == "title":
-            return f"{_parent_class.__name__} title (" + str(random_value_from_class(int)) + ")"
-        elif attribute_name_lw == "schema_version" and get_class_pkg_version(_parent_class) is not None:
-            return get_class_pkg_version(_parent_class)
-        elif re.match(r"\w*version$", attribute_name_lw):
-            return str(random_value_from_class(int))
-        elif re.match(r"\w*date_.*", attribute_name_lw):
-            return epoch_to_date(epoch())
-        elif re.match(r"path_in_.*", attribute_name_lw):
-            return f"/FOLDER/{gen_uuid()}/a_patch{random.randint(0, 30)}"
-        elif "mime_type" in attribute_name_lw and ("external" in _parent_class.__name__.lower() and "part" in _parent_class.__name__.lower()):
-            return f"application/x-hdf5"
-        elif "type" in attribute_name_lw:
-            if attribute_name_lw.startswith("qualified"):
-                return get_qualified_type_from_class(get_classes_matching_name(_parent_class, "Abstract")[0])
-            if attribute_name_lw.startswith("content"):
-                return get_content_type_from_class(get_classes_matching_name(_parent_class, "Abstract")[0])
-    return "A random str " + (f"[{attribute_name}] " if attribute_name is not None else "") + "(" + str(
-        random_value_from_class(int)) + ")"
-
-
-def random_value_from_class(cls: type):
-    """
-    Generate a random value for a :class:`type`. All attributes should be filled with random values.
-    :param cls:
-    :return:
-    """
-    energyml_module_context = []
-    if not is_primitive(cls):
-        # import_related_module(cls.__module__)
-        energyml_module_context = get_related_energyml_modules_name(cls)
-    return _random_value_from_class(cls=cls, energyml_module_context=energyml_module_context, attribute_name=None)
-
-
-def _random_value_from_class(cls: Any, energyml_module_context: List[str], attribute_name: Optional[str] = None, _parent_class: Optional[type]=None):
-    """
-    Generate a random value for a :class:`type`. All attributes should be filled with random values.
-    :param cls:
-    :param energyml_module_context:
-    :param attribute_name:
-    :param _parent_class: the :class:`type`of the parent object
-    :return:
-    """
-
-    try:
-        if isinstance(cls, str) or cls == str:
-            return _gen_str_from_attribute_name(attribute_name, _parent_class)
-        elif isinstance(cls, int) or cls == int:
-            return random.randint(0, 10000)
-        elif isinstance(cls, float) or cls == float:
-            return random.randint(0, 1000000) / 100.
-        elif isinstance(cls, bool) or cls == bool:
-            return random.randint(0, 1) == 1
-        elif is_enum(cls):
-            return cls[cls._member_names_[random.randint(0, len(cls._member_names_) - 1)]]
-        elif isinstance(cls, typing.Union.__class__):
-            type_list = list(cls.__args__)
-            if type(None) in type_list:
-                type_list.remove(type(None))  # we don't want to generate none value
-            chosen_type = type_list[random.randint(0, len(type_list))]
-            return _random_value_from_class(chosen_type, energyml_module_context, attribute_name, cls)
-        elif cls.__module__ == 'typing':
-            nb_value_for_list = random.randint(2, 3)
-            type_list = list(cls.__args__)
-            if type(None) in type_list:
-                type_list.remove(type(None))  # we don't want to generate none value
-
-            if cls._name == "List":
-                lst = []
-                for i in range(nb_value_for_list):
-                    chosen_type = type_list[random.randint(0, len(type_list) - 1)]
-                    lst.append(_random_value_from_class(chosen_type, energyml_module_context, attribute_name, list))
-                return lst
-            else:
-                chosen_type = type_list[random.randint(0, len(type_list) - 1)]
-                return _random_value_from_class(chosen_type, energyml_module_context, attribute_name, _parent_class)
-        else:
-            potential_classes = list(filter(lambda _c: not is_abstract(_c), [cls] + get_sub_classes(cls)))
-            if len(potential_classes) > 0:
-                chosen_type = potential_classes[random.randint(0, len(potential_classes) - 1)]
-                args = {}
-                for k, v in get_class_fields(chosen_type).items():
-                    # print(f"get_class_fields {k} : {v}")
-                    args[k] = _random_value_from_class(
-                        cls=get_class_from_simple_name(simple_name=v.type, energyml_module_context=energyml_module_context),
-                        energyml_module_context=energyml_module_context,
-                        attribute_name=k,
-                        _parent_class=chosen_type)
-
-                if not isinstance(chosen_type, type):
-                    chosen_type = type(chosen_type)
-                return chosen_type(**args)
-
-    except Exception as e:
-        print(f"exception on attribute '{attribute_name}' for class {cls} :")
-        raise e
-
-    print(f"@_random_value_from_class Not supported object type generation {cls}")
-    return None
-
-
-
-
-
-
-
-

Functions

-
-
-def class_match_rgx(cls: Union[type, Any], rgx: str, super_class_search: bool = True, re_flags=re.IGNORECASE) -
-
-
-
- -Expand source code - -
def class_match_rgx(
-        cls: Union[type, Any],
-        rgx: str,
-        super_class_search: bool = True,
-        re_flags=re.IGNORECASE,
-):
-    if not isinstance(cls, type):
-        cls = type(cls)
-
-    if re.match(rgx, cls.__name__, re_flags):
-        return True
-
-    if not is_primitive(cls) and super_class_search:
-        for base in cls.__bases__:
-            if class_match_rgx(base, rgx, super_class_search, re_flags):
-                return True
-    return False
-
-
-
-def date_to_epoch(date: str) ‑> int -
-
-

Transform a energyml date into an epoch datetime -:return: int

-
- -Expand source code - -
def date_to_epoch(date: str) -> int:
-    """
-    Transform a energyml date into an epoch datetime
-    :return: int
-    """
-    return int(datetime.datetime.fromisoformat(date).timestamp())
-
-
-
-def epoch(time_zone=datetime.timezone(datetime.timedelta(seconds=3600), 'UTC')) ‑> int -
-
-
-
- -Expand source code - -
def epoch(time_zone=datetime.timezone(datetime.timedelta(hours=1), "UTC")) -> int:
-    return int(now(time_zone))
-
-
-
-def epoch_to_date(epoch_value: int, time_zone=datetime.timezone(datetime.timedelta(seconds=3600), 'UTC')) ‑> str -
-
-
-
- -Expand source code - -
def epoch_to_date(epoch_value: int, time_zone=datetime.timezone(datetime.timedelta(hours=1), "UTC")) -> str:
-    date = datetime.datetime.fromtimestamp(epoch_value / 1e3, time_zone)
-    return date.strftime("%Y-%m-%dT%H:%M:%S%z")
-
-
-
-def flatten_concatenation(matrix) ‑> List -
-
-

Flatten a matrix.

-

Example : -[ [a,b,c], [d,e,f], [ [x,y,z], [0] ] ] -will be translated in: [a, b, c, d, e, f, [x,y,z], [0]] -:param matrix: -:return:

-
- -Expand source code - -
def flatten_concatenation(matrix) -> List:
-    """
-    Flatten a matrix.
-
-    Example :
-        [ [a,b,c], [d,e,f], [ [x,y,z], [0] ] ]
-        will be translated in: [a, b, c, d, e, f, [x,y,z], [0]]
-    :param matrix:
-    :return:
-    """
-    flat_list = []
-    for row in matrix:
-        flat_list += row
-    return flat_list
-
-
-
-def gen_uuid() ‑> str -
-
-

Generate a new uuid. -:return:

-
- -Expand source code - -
def gen_uuid() -> str:
-    """
-    Generate a new uuid.
-    :return:
-    """
-    return str(uuid_mod.uuid4())
-
-
-
-def get_class_attributes(cls: Union[type, Any]) ‑> List[str] -
-
-

returns a list of attributes (not private ones)

-
- -Expand source code - -
def get_class_attributes(cls: Union[type, Any]) -> List[str]:
-    """
-    returns a list of attributes (not private ones)
-    """
-    # if not isinstance(cls, type):  # if cls is an instance
-    #     cls = type(cls)
-    # return list(filter(lambda a: not a.startswith("__"), dir(cls)))
-    return list(get_class_fields(cls).keys())
-
-
-
-def get_class_fields(cls: Union[type, Any]) ‑> Dict[str, dataclasses.Field] -
-
-

Return all class fields names, mapped to their :class:Field value. -:param cls: -:return:

-
- -Expand source code - -
def get_class_fields(cls: Union[type, Any]) -> Dict[str, Field]:
-    """
-    Return all class fields names, mapped to their :class:`Field` value.
-    :param cls:
-    :return:
-    """
-    if not isinstance(cls, type):  # if cls is an instance
-        cls = type(cls)
-    try:
-        return cls.__dataclass_fields__
-    except AttributeError:
-        return {}
-
-
-
-def get_class_from_content_type(content_type: str) ‑> Optional[type] -
-
-

Return a :class:type object matching with the content-type :param:content_type. -:param content_type: -:return:

-
- -Expand source code - -
def get_class_from_content_type(content_type: str) -> Optional[type]:
-    """
-    Return a :class:`type` object matching with the content-type :param:`content_type`.
-    :param content_type:
-    :return:
-    """
-    ct = parse_content_type(content_type)
-    domain = ct.group("domain")
-    if domain is None:
-        domain = "opc"
-    if domain == "opc":
-        xml_domain = ct.group("xmlDomain")
-        if "." in xml_domain:
-            xml_domain = xml_domain[xml_domain.rindex(".") + 1:]
-        if "extended" in xml_domain:
-            xml_domain = xml_domain.replace("extended", "")
-        opc_type = pascal_case(xml_domain)
-        # print("energyml.opc.opc." + opc_type)
-        return get_class_from_name("energyml.opc.opc." + opc_type)
-    else:
-        ns = ENERGYML_NAMESPACES[domain]
-        domain = ct.group("domain")
-        obj_type = ct.group("type")
-        if obj_type.lower().startswith("obj_"):  # for resqml201
-            obj_type = "Obj" + obj_type[4:]
-        version_num = str(ct.group("domainVersion")).replace(".", "_")
-        if domain.lower() == "resqml" and version_num.startswith("2_0"):
-            version_num = "2_0_1"
-        return get_class_from_name(
-            "energyml."
-            + domain
-            + ".v"
-            + version_num
-            + "."
-            + ns[ns.rindex("/") + 1:]
-            + "."
-            + obj_type
-        )
-
-
-
-def get_class_from_name(class_name_and_module: str) ‑> Optional[type] -
-
-

Return a :class:type object matching with the name :param:class_name_and_module. -:param class_name_and_module: -:return:

-
- -Expand source code - -
def get_class_from_name(class_name_and_module: str) -> Optional[type]:
-    """
-    Return a :class:`type` object matching with the name :param:`class_name_and_module`.
-    :param class_name_and_module:
-    :return:
-    """
-    module_name = class_name_and_module[: class_name_and_module.rindex(".")]
-    last_ns_part = class_name_and_module[
-                   class_name_and_module.rindex(".") + 1:
-                   ]
-    try:
-        # Required to read "CustomData" on eml objects that may contain resqml values
-        # ==> we need to import all modules related to the same version of the common
-        import_related_module(module_name)
-        return getattr(sys.modules[module_name], last_ns_part)
-    except AttributeError as e:
-        if "2d" in last_ns_part:
-            return get_class_from_name(
-                class_name_and_module.replace("2d", "2D")
-            )
-        elif "3d" in last_ns_part:
-            return get_class_from_name(
-                class_name_and_module.replace("3d", "3D")
-            )
-        elif last_ns_part[0].islower():
-            return get_class_from_name(
-                module_name + "." + last_ns_part[0].upper() + last_ns_part[1:]
-            )
-        else:
-            print(e)
-    return None
-
-
-
-def get_class_from_simple_name(simple_name: str, energyml_module_context=None) ‑> type -
-
-

Search for a :class:type depending on the simple class name :param:simple_name. -:param simple_name: -:param energyml_module_context: -:return:

-
- -Expand source code - -
def get_class_from_simple_name(simple_name: str, energyml_module_context=None) -> type:
-    """
-    Search for a :class:`type` depending on the simple class name :param:`simple_name`.
-    :param simple_name:
-    :param energyml_module_context:
-    :return:
-    """
-    if energyml_module_context is None:
-        energyml_module_context = []
-    try:
-        return eval(simple_name)
-    except NameError as e:
-        for mod in energyml_module_context:
-            try:
-                exec(f"from {mod} import *")
-                # required to be able to access to type in
-                # typing values like "List[ObjectAlias]"
-            except ModuleNotFoundError:
-                pass
-        return eval(simple_name)
-
-
-
-def get_class_methods(cls: Union[type, Any]) ‑> List[str] -
-
-

Returns the list of the methods names for a specific class. -:param cls: -:return:

-
- -Expand source code - -
def get_class_methods(cls: Union[type, Any]) -> List[str]:
-    """
-    Returns the list of the methods names for a specific class.
-    :param cls:
-    :return:
-    """
-    return [func for func in dir(cls) if callable(getattr(cls, func)) and not func.startswith("__") and not isinstance(getattr(cls, func), type)]
-
-
-
-def get_content_type_from_class(cls: Union[type, Any], print_dev_version=True, nb_max_version_digits=2) -
-
-
-
- -Expand source code - -
def get_content_type_from_class(cls: Union[type, Any], print_dev_version=True, nb_max_version_digits=2):
-    if not isinstance(cls, type):
-        cls = type(cls)
-
-    if ".opc." in cls.__module__:
-        if cls.__name__.lower() == "coreproperties":
-            return "application/vnd.openxmlformats-package.core-properties+xml"
-    else:
-        return ("application/x-" + get_class_pkg(cls)
-                + "+xml;version=" + get_class_pkg_version(cls, print_dev_version, nb_max_version_digits) + ";type="
-                + get_object_type_for_file_path_from_class(cls))
-
-    print(f"@get_content_type_from_class not supported type : {cls}")
-    return None
-
-
-
-def get_data_object_type(cls: Union[type, Any], print_dev_version=True, nb_max_version_digits=2) -
-
-
-
- -Expand source code - -
def get_data_object_type(cls: Union[type, Any], print_dev_version=True, nb_max_version_digits=2):
-    return get_class_pkg(cls) + "." + get_class_pkg_version(cls, print_dev_version, nb_max_version_digits)
-
-
-
-def get_direct_dor_list(obj: Any) ‑> List[Any] -
-
-

Search all sub attribute of type "DataObjectreference". -:param obj: -:return:

-
- -Expand source code - -
def get_direct_dor_list(obj: Any) -> List[Any]:
-    """
-    Search all sub attribute of type "DataObjectreference".
-    :param obj:
-    :return:
-    """
-    return search_attribute_matching_type(obj, "DataObjectreference")
-
-
-
-def get_matching_class_attribute_name(cls: Union[type, Any], attribute_name: str, re_flags=re.IGNORECASE) ‑> Optional[str] -
-
-

From an object and an attribute name, returns the correct attribute name of the class. -Example : "ObjectVersion" –> object_version. -This method doesn't only transform to snake case but search into the obj class attributes

-
- -Expand source code - -
def get_matching_class_attribute_name(
-        cls: Union[type, Any], attribute_name: str, re_flags=re.IGNORECASE,
-) -> Optional[str]:
-    """
-    From an object and an attribute name, returns the correct attribute name of the class.
-    Example : "ObjectVersion" --> object_version.
-    This method doesn't only transform to snake case but search into the obj class attributes
-    """
-    class_fields = get_class_fields(cls)
-
-    # a search with the exact value
-    for name, cf in class_fields.items():
-        if (
-                snake_case(name) == snake_case(attribute_name)
-                or ('name' in cf.metadata and cf.metadata['name'] == attribute_name)
-        ):
-            return name
-
-    # search regex after to avoid shadowing perfect match
-    pattern = re.compile(attribute_name, flags=re_flags)
-    for name, cf in class_fields.items():
-        # print(f"\t->{name} : {attribute_name} {pattern.match(name)} {('name' in cf.metadata and pattern.match(cf.metadata['name']))}")
-        if pattern.match(name) or ('name' in cf.metadata and pattern.match(cf.metadata['name'])):
-            return name
-
-    return None
-
-
-
-def get_obj_type(obj: Any) ‑> str -
-
-

Return the type name of an object. If obj is already a :class:type, return its name

-
- -Expand source code - -
def get_obj_type(obj: Any) -> str:
-    """ Return the type name of an object. If obj is already a :class:`type`, return its __name__"""
-    if isinstance(obj, type):
-        return str(obj.__name__)
-    return get_obj_type(type(obj))
-
-
-
-def get_obj_uuid(obj: Any) ‑> str -
-
-

Return the object uuid (attribute must match the following regex : "[Uu]u?id|UUID"). -:param obj: -:return:

-
- -Expand source code - -
def get_obj_uuid(obj: Any) -> str:
-    """
-    Return the object uuid (attribute must match the following regex : "[Uu]u?id|UUID").
-    :param obj:
-    :return:
-    """
-    return get_object_attribute_rgx(obj, "[Uu]u?id|UUID")
-
-
-
-def get_obj_version(obj: Any) ‑> str -
-
-

Return the object version (check for "object_version" or "version_string" attribute). -:param obj: -:return:

-
- -Expand source code - -
def get_obj_version(obj: Any) -> str:
-    """
-    Return the object version (check for "object_version" or "version_string" attribute).
-    :param obj:
-    :return:
-    """
-    try:
-        return get_object_attribute_no_verif(obj, "object_version")
-    except AttributeError as e:
-        try:
-            return get_object_attribute_no_verif(obj, "version_string")
-        except Exception:
-            print(f"Error with {type(obj)}")
-            raise e
-
-
-
-def get_object_attribute(obj: Any, attr_dot_path: str, force_snake_case=True) ‑> Any -
-
-

returns the value of an attribute given by a dot representation of its path in the object -example "Citation.Title"

-
- -Expand source code - -
def get_object_attribute(
-        obj: Any, attr_dot_path: str, force_snake_case=True
-) -> Any:
-    """
-    returns the value of an attribute given by a dot representation of its path in the object
-    example "Citation.Title"
-    """
-    while attr_dot_path.startswith("."):  # avoid '.Citation.Title' to take an empty attribute name before the first '.'
-        attr_dot_path = attr_dot_path[1:]
-
-    current_attrib_name = attr_dot_path
-
-    if "." in attr_dot_path:
-        current_attrib_name = attr_dot_path.split(".")[0]
-
-    if force_snake_case:
-        current_attrib_name = snake_case(current_attrib_name)
-
-    value = None
-    if isinstance(obj, list):
-        value = obj[int(current_attrib_name)]
-    elif isinstance(obj, dict):
-        value = obj[current_attrib_name]
-    else:
-        value = getattr(obj, current_attrib_name)
-
-    if "." in attr_dot_path:
-        return get_object_attribute(
-            value, attr_dot_path[len(current_attrib_name) + 1:]
-        )
-    else:
-        return value
-
-
-
-def get_object_attribute_advanced(obj: Any, attr_dot_path: str) ‑> Any -
-
-

see @get_matching_class_attribute_name and @get_object_attribute

-
- -Expand source code - -
def get_object_attribute_advanced(obj: Any, attr_dot_path: str) -> Any:
-    """
-    see @get_matching_class_attribute_name and @get_object_attribute
-    """
-    current_attrib_name = attr_dot_path
-
-    if "." in attr_dot_path:
-        current_attrib_name = attr_dot_path.split(".")[0]
-
-    current_attrib_name = get_matching_class_attribute_name(
-        obj, current_attrib_name
-    )
-
-    value = None
-    if isinstance(obj, list):
-        value = obj[int(current_attrib_name)]
-    elif isinstance(obj, dict):
-        value = obj[current_attrib_name]
-    else:
-        value = getattr(obj, current_attrib_name)
-
-    if "." in attr_dot_path:
-        return get_object_attribute_advanced(
-            value, attr_dot_path[len(current_attrib_name) + 1:]
-        )
-    else:
-        return value
-
-
-
-def get_object_attribute_no_verif(obj: Any, attr_name: str) ‑> Any -
-
-

Return the value of the attribute named after param :param:attr_name without verification (may raise an exception -if it doesn't exists).

-

Note: attr_name="0" will work if :param:obj is of type :class:List -:param obj: -:param attr_name: -:return:

-
- -Expand source code - -
def get_object_attribute_no_verif(obj: Any, attr_name: str) -> Any:
-    """
-    Return the value of the attribute named after param :param:`attr_name` without verification (may raise an exception
-    if it doesn't exists).
-
-    Note: attr_name="0" will work if :param:`obj` is of type :class:`List`
-    :param obj:
-    :param attr_name:
-    :return:
-    """
-    if isinstance(obj, list):
-        return obj[int(attr_name)]
-    elif isinstance(obj, dict):
-        return obj[attr_name]
-    else:
-        return getattr(obj, attr_name)
-
-
-
-def get_object_attribute_rgx(obj: Any, attr_dot_path_rgx: str) ‑> Any -
-
-

see @get_object_attribute. Search the attribute name using regex for values between dots. -Example : [Cc]itation.[Tt]it.*

-
- -Expand source code - -
def get_object_attribute_rgx(obj: Any, attr_dot_path_rgx: str) -> Any:
-    """
-    see @get_object_attribute. Search the attribute name using regex for values between dots.
-    Example : [Cc]itation.[Tt]it\\.*
-    """
-    current_attrib_name = attr_dot_path_rgx
-
-    attrib_list = re.split(r"(?<!\\)\.+", attr_dot_path_rgx)
-
-    if len(attrib_list) > 0:
-        current_attrib_name = attrib_list[0]
-
-    # unescape Dot
-    current_attrib_name = current_attrib_name.replace("\\.", ".")
-
-    real_attrib_name = get_matching_class_attribute_name(
-        obj, current_attrib_name
-    )
-    if real_attrib_name is not None:
-        value = get_object_attribute_no_verif(obj, real_attrib_name)
-
-        if len(attrib_list) > 1:
-            return get_object_attribute_rgx(
-                value, attr_dot_path_rgx[len(current_attrib_name) + 1:]
-            )
-        else:
-            return value
-    return None
-
-
-
-def get_object_type_for_file_path_from_class(cls) ‑> str -
-
-
-
- -Expand source code - -
def get_object_type_for_file_path_from_class(cls) -> str:
-    # obj_type = get_obj_type(cls)
-    # pkg = get_class_pkg(cls)
-    # if re.match(r"Obj[A-Z].*", obj_type) is not None and pkg == "resqml":
-    #     return "obj_" + obj_type[3:]
-    # return obj_type
-
-    try:
-        return cls.Meta.name  # to work with 3d transformed in 3D and Obj[A-Z] in obj_[A-Z]
-    except AttributeError:
-        pkg = get_class_pkg(cls)
-        return get_obj_type(cls)
-
-
-
-def get_qualified_type_from_class(cls: Union[type, Any], print_dev_version=True) -
-
-
-
- -Expand source code - -
def get_qualified_type_from_class(cls: Union[type, Any], print_dev_version=True):
-    return (
-            get_data_object_type(cls, print_dev_version, 2)
-            .replace(".", "") + "." + get_object_type_for_file_path_from_class(cls)
-    )
-
-
- -
-

Import related modules for a specific energyml module. (See. :const:RELATED_MODULES) -:param energyml_module_name: -:return:

-
- -Expand source code - -
def import_related_module(energyml_module_name: str) -> None:
-    """
-    Import related modules for a specific energyml module. (See. :const:`RELATED_MODULES`)
-    :param energyml_module_name:
-    :return:
-    """
-    for related in RELATED_MODULES:
-        if energyml_module_name in related:
-            for m in related:
-                try:
-                    import_module(m)
-                except Exception as e:
-                    pass
-                    # print(e)
-
-
-
-def is_abstract(cls: Union[type, Any]) ‑> bool -
-
-

Returns True if :param:cls is an abstract class -:param cls: -:return: bool

-
- -Expand source code - -
def is_abstract(cls: Union[type, Any]) -> bool:
-    """
-    Returns True if :param:`cls` is an abstract class
-    :param cls:
-    :return: bool
-    """
-    if isinstance(cls, type):
-        return not is_primitive(cls) and (cls.__name__.startswith("Abstract") or (hasattr(cls, "__dataclass_fields__") and len(cls.__dataclass_fields__)) == 0) and len(get_class_methods(cls)) == 0
-    return is_abstract(type(cls))
-
-
-
-def is_enum(cls: Union[type, Any]) -
-
-

Returns True if :param:cls is an Enum -:param cls: -:return:

-
- -Expand source code - -
def is_enum(cls: Union[type, Any]):
-    """
-    Returns True if :param:`cls` is an Enum
-    :param cls:
-    :return:
-    """
-    if isinstance(cls, type):
-        return Enum in cls.__bases__
-    return is_enum(type(cls))
-
-
-
-def is_primitive(cls: Union[type, Any]) ‑> bool -
-
-

Returns True if :param:cls is a primitiv type or extends Enum -:param cls: -:return: bool

-
- -Expand source code - -
def is_primitive(cls: Union[type, Any]) -> bool:
-    """
-    Returns True if :param:`cls` is a primitiv type or extends Enum
-    :param cls:
-    :return: bool
-    """
-    if isinstance(cls, type):
-        return cls in primitives or Enum in cls.__bases__
-    return is_primitive(type(cls))
-
-
-
-def now(time_zone=datetime.timezone(datetime.timedelta(seconds=3600), 'UTC')) ‑> int -
-
-

Return an epoch value

-
- -Expand source code - -
def now(time_zone=datetime.timezone(datetime.timedelta(hours=1), "UTC")) -> int:
-    """ Return an epoch value """
-    return int(datetime.datetime.timestamp(datetime.datetime.now(time_zone)))
-
-
-
-def pascal_case(s: str) ‑> str -
-
-

Transform a str into pascal case.

-
- -Expand source code - -
def pascal_case(s: str) -> str:
-    """ Transform a str into pascal case. """
-    return snake_case(s).replace("_", " ").title().replace(" ", "")
-
-
-
-def random_value_from_class(cls: type) -
-
-

Generate a random value for a :class:type. All attributes should be filled with random values. -:param cls: -:return:

-
- -Expand source code - -
def random_value_from_class(cls: type):
-    """
-    Generate a random value for a :class:`type`. All attributes should be filled with random values.
-    :param cls:
-    :return:
-    """
-    energyml_module_context = []
-    if not is_primitive(cls):
-        # import_related_module(cls.__module__)
-        energyml_module_context = get_related_energyml_modules_name(cls)
-    return _random_value_from_class(cls=cls, energyml_module_context=energyml_module_context, attribute_name=None)
-
-
-
-def search_attribute_in_upper_matching_name(obj: Any, name_rgx: str, root_obj: Optional[Any] = None, re_flags=re.IGNORECASE, current_path: str = '') ‑> Optional[Any] -
-
-

See :func:search_attribute_matching_type_with_path(). It only returns the value not the path -:param obj: -:param name_rgx: -:param root_obj: -:param re_flags: -:param current_path: -:return:

-
- -Expand source code - -
def search_attribute_in_upper_matching_name(
-        obj: Any,
-        name_rgx: str,
-        root_obj: Optional[Any] = None,
-        re_flags=re.IGNORECASE,
-        current_path: str = "",
-) -> Optional[Any]:
-    """
-    See :func:`search_attribute_matching_type_with_path`. It only returns the value not the path
-    :param obj:
-    :param name_rgx:
-    :param root_obj:
-    :param re_flags:
-    :param current_path:
-    :return:
-    """
-    elt_list = search_attribute_matching_name(obj, name_rgx, search_in_sub_obj=False, deep_search=False)
-    if elt_list is not None and len(elt_list) > 0:
-        return elt_list
-
-    if obj != root_obj:
-        upper_path = current_path[:current_path.rindex(".")]
-        if len(upper_path) > 0:
-            return search_attribute_in_upper_matching_name(
-                obj=get_object_attribute(root_obj, upper_path),
-                name_rgx=name_rgx,
-                root_obj=root_obj,
-                re_flags=re_flags,
-                current_path=upper_path,
-            )
-
-    return None
-
-
-
-def search_attribute_matching_name(obj: Any, name_rgx: str, re_flags=re.IGNORECASE, deep_search: bool = True, search_in_sub_obj: bool = True) ‑> List[Any] -
-
-

See :func:search_attribute_matching_name_with_path(). It only returns the value not the path

-

:param obj: -:param name_rgx: -:param re_flags: -:param deep_search: -:param search_in_sub_obj: -:return:

-
- -Expand source code - -
def search_attribute_matching_name(
-        obj: Any,
-        name_rgx: str,
-        re_flags=re.IGNORECASE,
-        deep_search: bool = True,  # Search inside a matching object
-        search_in_sub_obj: bool = True,  # Search in obj attributes
-) -> List[Any]:
-    """
-    See :func:`search_attribute_matching_name_with_path`. It only returns the value not the path
-
-    :param obj:
-    :param name_rgx:
-    :param re_flags:
-    :param deep_search:
-    :param search_in_sub_obj:
-    :return:
-    """
-    return [
-        val
-        for path, val in search_attribute_matching_name_with_path(
-            obj=obj,
-            name_rgx=name_rgx,
-            re_flags=re_flags,
-            deep_search=deep_search,
-            search_in_sub_obj=search_in_sub_obj
-        )
-    ]
-
-
-
-def search_attribute_matching_name_with_path(obj: Any, name_rgx: str, re_flags=re.IGNORECASE, current_path: str = '', deep_search: bool = True, search_in_sub_obj: bool = True) ‑> List[Tuple[str, Any]] -
-
-

Returns a list of tuple (path, value) for each sub attribute with type matching param "name_rgx". -The path is a dot-version like ".Citation.Title" -:param obj: -:param name_rgx: -:param re_flags: -:param current_path: -:param deep_search: -:param search_in_sub_obj: -:return:

-
- -Expand source code - -
def search_attribute_matching_name_with_path(
-        obj: Any,
-        name_rgx: str,
-        re_flags=re.IGNORECASE,
-        current_path: str = "",
-        deep_search: bool = True,  # Search inside a matching object
-        search_in_sub_obj: bool = True,  # Search in obj attributes
-) -> List[Tuple[str, Any]]:
-    """
-    Returns a list of tuple (path, value) for each sub attribute with type matching param "name_rgx".
-    The path is a dot-version like ".Citation.Title"
-    :param obj:
-    :param name_rgx:
-    :param re_flags:
-    :param current_path:
-    :param deep_search:
-    :param search_in_sub_obj:
-    :return:
-    """
-    while name_rgx.startswith("."):
-        name_rgx = name_rgx[1:]
-    current_match = name_rgx
-    next_match = current_match
-    if '.' in current_match:
-        attrib_list = re.split(r"(?<!\\)\.+", name_rgx)
-        current_match = attrib_list[0]
-        next_match = '.'.join(attrib_list[1:])
-
-    res = []
-
-    match_value = None
-    match_path_and_obj = []
-    not_match_path_and_obj = []
-    if isinstance(obj, list):
-        cpt = 0
-        for s_o in obj:
-            match = re.match(current_match.replace("\\.", "."), str(cpt), flags=re_flags)
-            if match is not None:
-                match_value = match.group(0)
-                match_path_and_obj.append( (f"{current_path}.{cpt}", s_o) )
-            else:
-                not_match_path_and_obj.append( (f"{current_path}.{cpt}", s_o) )
-            cpt = cpt + 1
-    elif isinstance(obj, dict):
-        for k, s_o in obj.items():
-            match = re.match(current_match.replace("\\.", "."), k, flags=re_flags)
-            if match is not None:
-                match_value = match.group(0)
-                match_path_and_obj.append( (f"{current_path}.{k}", s_o) )
-            else:
-                not_match_path_and_obj.append( (f"{current_path}.{k}", s_o) )
-    elif not is_primitive(obj):
-        match_value = get_matching_class_attribute_name(obj, current_match.replace("\\.", "."))
-        if match_value is not None:
-            match_path_and_obj.append( (f"{current_path}.{match_value}", get_object_attribute_no_verif(obj, match_value)) )
-        for att_name in get_class_attributes(obj):
-            if att_name != match_value:
-                not_match_path_and_obj.append( (f"{current_path}.{att_name}", get_object_attribute_no_verif(obj, att_name)) )
-
-    for matched_path, matched in match_path_and_obj:
-        if next_match != current_match and len(next_match) > 0:  # next_match is different, match is not final
-            res = res + search_attribute_matching_name_with_path(
-                obj=matched,
-                name_rgx=next_match,
-                re_flags=re_flags,
-                current_path=matched_path,
-                deep_search=False,  # no deep with partial
-                search_in_sub_obj=False,  # no partial search in sub obj with no match
-            )
-        else:  # a complete match
-            res.append( (matched_path, matched) )
-            if deep_search:
-                res = res + search_attribute_matching_name_with_path(
-                    obj=matched,
-                    name_rgx=name_rgx,
-                    re_flags=re_flags,
-                    current_path=matched_path,
-                    deep_search=deep_search,  # no deep with partial
-                    search_in_sub_obj=True,
-                )
-    if search_in_sub_obj:
-        for not_matched_path, not_matched in not_match_path_and_obj:
-            res = res + search_attribute_matching_name_with_path(
-                obj=not_matched,
-                name_rgx=name_rgx,
-                re_flags=re_flags,
-                current_path=not_matched_path,
-                deep_search=deep_search,
-                search_in_sub_obj=True,
-            )
-
-    return res
-
-
-
-def search_attribute_matching_type(obj: Any, type_rgx: str, re_flags=re.IGNORECASE, return_self: bool = True, deep_search: bool = True, super_class_search: bool = True) ‑> List[Any] -
-
-

See :func:search_attribute_matching_type_with_path(). It only returns the value not the path -:param obj: -:param type_rgx: -:param re_flags: -:param return_self: -:param deep_search: -:param super_class_search: -:return:

-
- -Expand source code - -
def search_attribute_matching_type(
-        obj: Any,
-        type_rgx: str,
-        re_flags=re.IGNORECASE,
-        return_self: bool = True,  # test directly on input object and not only in its attributes
-        deep_search: bool = True,  # Search inside a matching object
-        super_class_search: bool = True,  # Search inside in super classes of the object
-) -> List[Any]:
-    """
-    See :func:`search_attribute_matching_type_with_path`. It only returns the value not the path
-    :param obj:
-    :param type_rgx:
-    :param re_flags:
-    :param return_self:
-    :param deep_search:
-    :param super_class_search:
-    :return:
-    """
-    return [
-        val
-        for path, val in search_attribute_matching_type_with_path(
-            obj=obj,
-            type_rgx=type_rgx,
-            re_flags=re_flags,
-            return_self=return_self,
-            deep_search=deep_search,
-            super_class_search=super_class_search,
-        )
-    ]
-
-
-
-def search_attribute_matching_type_with_path(obj: Any, type_rgx: str, re_flags=re.IGNORECASE, return_self: bool = True, deep_search: bool = True, super_class_search: bool = True, current_path: str = '') ‑> List[Tuple[str, Any]] -
-
-

Returns a list of tuple (path, value) for each sub attribute with type matching param "type_rgx". -The path is a dot-version like ".Citation.Title" -:param obj: -:param type_rgx: -:param re_flags: -:param return_self: -:param deep_search: -:param super_class_search: -:param current_path: -:return:

-
- -Expand source code - -
def search_attribute_matching_type_with_path(
-        obj: Any,
-        type_rgx: str,
-        re_flags=re.IGNORECASE,
-        return_self: bool = True,  # test directly on input object and not only in its attributes
-        deep_search: bool = True,  # Search inside a matching object
-        super_class_search: bool = True,  # Search inside in super classes of the object
-        current_path: str = "",
-) -> List[Tuple[str, Any]]:
-    """
-    Returns a list of tuple (path, value) for each sub attribute with type matching param "type_rgx".
-    The path is a dot-version like ".Citation.Title"
-    :param obj:
-    :param type_rgx:
-    :param re_flags:
-    :param return_self:
-    :param deep_search:
-    :param super_class_search:
-    :param current_path:
-    :return:
-    """
-    res = []
-    if obj is not None:
-        if return_self and class_match_rgx(
-                obj, type_rgx, super_class_search, re_flags
-        ):
-            res.append((current_path, obj))
-            if not deep_search:
-                return res
-
-    if isinstance(obj, list):
-        cpt = 0
-        for s_o in obj:
-            res = res + search_attribute_matching_type_with_path(
-                obj=s_o,
-                type_rgx=type_rgx,
-                re_flags=re_flags,
-                return_self=True,
-                deep_search=deep_search,
-                current_path=f"{current_path}.{cpt}",
-                super_class_search=super_class_search,
-            )
-            cpt = cpt + 1
-    elif isinstance(obj, dict):
-        for k, s_o in obj.items():
-            res = res + search_attribute_matching_type_with_path(
-                obj=s_o,
-                type_rgx=type_rgx,
-                re_flags=re_flags,
-                return_self=True,
-                deep_search=deep_search,
-                current_path=f"{current_path}.{k}",
-                super_class_search=super_class_search,
-            )
-    elif not is_primitive(obj):
-        for att_name in get_class_attributes(obj):
-            res = res + search_attribute_matching_type_with_path(
-                obj=get_object_attribute_rgx(obj, att_name),
-                type_rgx=type_rgx,
-                re_flags=re_flags,
-                return_self=True,
-                deep_search=deep_search,
-                current_path=f"{current_path}.{att_name}",
-                super_class_search=super_class_search,
-            )
-
-    return res
-
-
-
-def snake_case(s: str) ‑> str -
-
-

Transform a str into snake case.

-
- -Expand source code - -
def snake_case(s: str) -> str:
-    """ Transform a str into snake case. """
-    s = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', s)
-    s = re.sub('__([A-Z])', r'_\1', s)
-    s = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s)
-    return s.lower()
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/energyml-utils/docs/src/energyml/utils/manager.html b/energyml-utils/docs/src/energyml/utils/manager.html deleted file mode 100644 index 6dbad1a..0000000 --- a/energyml-utils/docs/src/energyml/utils/manager.html +++ /dev/null @@ -1,615 +0,0 @@ - - - - - - -src.energyml.utils.manager API documentation - - - - - - - - - - - -
-
-
-

Module src.energyml.utils.manager

-
-
-
- -Expand source code - -
# Copyright (c) 2023-2024 Geosiris.
-# SPDX-License-Identifier: Apache-2.0
-import importlib
-import inspect
-import pkgutil
-import re
-from typing import List, Union, Any
-
-RGX_ENERGYML_MODULE_NAME = r"energyml\.(?P<pkg>.*)\.v(?P<version>(?P<versionNumber>\d+(_\d+)*)(_dev(?P<versionDev>.*))?)\..*"
-RGX_PROJECT_VERSION = r"(?P<n0>[\d]+)(.(?P<n1>[\d]+)(.(?P<n2>[\d]+))?)?"
-
-ENERGYML_MODULES_NAMES = ["eml", "prodml", "witsml", "resqml"]
-
-RELATED_MODULES = [
-    ["energyml.eml.v2_0.commonv2", "energyml.resqml.v2_0_1.resqmlv2"],
-    [
-        "energyml.eml.v2_1.commonv2",
-        "energyml.prodml.v2_0.prodmlv2",
-        "energyml.witsml.v2_0.witsmlv2",
-    ],
-    ["energyml.eml.v2_2.commonv2", "energyml.resqml.v2_2_dev3.resqmlv2"],
-    [
-        "energyml.eml.v2_3.commonv2",
-        "energyml.resqml.v2_2.resqmlv2",
-        "energyml.prodml.v2_2.prodmlv2",
-        "energyml.witsml.v2_1.witsmlv2",
-    ],
-]
-
-
-def get_related_energyml_modules_name(cls: Union[type, Any]) -> List[str]:
-    """
-    Return the list of all energyml modules related to another one.
-    For example resqml 2.0.1 is related to common 2.0
-    :param cls:
-    :return:
-    """
-    if isinstance(cls, type):
-        for related in RELATED_MODULES:
-            if cls.__module__ in related:
-                return related
-    else:
-        return get_related_energyml_modules_name(type(cls))
-    return []
-
-
-def dict_energyml_modules() -> List:
-    """
-    List all accessible energyml python modules
-    :return:
-    """
-    modules = {}
-
-    energyml_module = importlib.import_module("energyml")
-    # print("> energyml")
-
-    for mod in pkgutil.iter_modules(energyml_module.__path__):
-        # print(f"{mod.name}")
-        if mod.name in ENERGYML_MODULES_NAMES:
-            energyml_sub_module = importlib.import_module(
-                f"energyml.{mod.name}"
-            )
-            if mod.name not in modules:
-                modules[mod.name] = []
-            for sub_mod in pkgutil.iter_modules(energyml_sub_module.__path__):
-                modules[mod.name].append(sub_mod.name)
-                # modules[mod.name].append(re.sub(r"^\D*(?P<number>\d+(.\d+)*$)",
-                # r"\g<number>", sub_mod.name).replace("_", "."))
-    return modules
-
-
-def list_energyml_modules():
-    try:
-        energyml_module = importlib.import_module("energyml")
-        modules = []
-        for obj in pkgutil.iter_modules(energyml_module.__path__):
-            # print(f"{obj.name}")
-            if obj.name in ENERGYML_MODULES_NAMES:
-                modules.append(obj.name)
-        return modules
-    except ModuleNotFoundError:
-        return []
-
-
-def list_classes(module_path: str) -> List:
-    """
-    List all accessible classes from a specific module
-    :param module_path:
-    :return:
-    """
-    try:
-        module = importlib.import_module(module_path)
-        class_list = []
-        for _, obj in inspect.getmembers(module):
-            if inspect.isclass(obj):
-                class_list.append(obj)
-        return class_list
-    except ModuleNotFoundError:
-        print(f"Err : module {module_path} not found")
-        return []
-
-
-def get_sub_classes(cls: type) -> List[type]:
-    """
-    Return all classes that extends the class :param:`cls`.
-    :param cls:
-    :return:
-    """
-    sub_classes = []
-    for related in get_related_energyml_modules_name(cls):
-        try:
-            module = importlib.import_module(related)
-            for _, obj in inspect.getmembers(module):
-                if inspect.isclass(obj) and cls in obj.__bases__:
-                    sub_classes.append(obj)
-                    sub_classes = sub_classes + get_sub_classes(obj)
-        except ModuleNotFoundError:
-            pass
-    return list(dict.fromkeys(sub_classes))
-
-
-def get_classes_matching_name(cls: type, name_rgx: str, re_flags=re.IGNORECASE,) -> List[type]:
-    """
-    Search a class matching the regex @re_flags. The search is the energyml packages related to the objet type @cls.
-    :param cls:
-    :param name_rgx:
-    :param re_flags:
-    :return:
-    """
-    match_classes = []
-    for related in get_related_energyml_modules_name(cls):
-        try:
-            module = importlib.import_module(related)
-            for _, obj in inspect.getmembers(module):
-                if inspect.isclass(obj) and re.match(name_rgx, obj.__name__, re_flags):
-                    match_classes.append(obj)
-        except ModuleNotFoundError:
-            pass
-    return list(dict.fromkeys(match_classes))
-
-
-def get_all_energyml_classes() -> dict:
-    result = {}
-    for mod_name, versions in dict_energyml_modules().items():
-        for version in versions:
-            result = result | get_all_classes(mod_name, version)
-    return result
-
-
-def get_all_classes(module_name: str, version: str) -> dict:
-    result = {}
-    pkg_path = f"energyml.{module_name}.{version}"
-    package = importlib.import_module(pkg_path)
-    for _, modname, _ in pkgutil.walk_packages(
-        path=getattr(package, "__path__"),
-        prefix=package.__name__ + ".",
-        onerror=lambda x: None,
-    ):
-        result[pkg_path] = []
-        for classFound in list_classes(modname):
-            try:
-                result[pkg_path].append(classFound)
-            except Exception:
-                pass
-
-    return result
-
-
-def get_class_pkg(cls):
-    try:
-        p = re.compile(RGX_ENERGYML_MODULE_NAME)
-        m = p.search(cls.__module__)
-        return m.group("pkg")
-    except AttributeError as e:
-        print(f"Exception to get class package for '{cls}'")
-        raise e
-
-
-def reshape_version(version: str, nb_digit: int) -> str:
-    """
-    Reshape a project version to have only specific number of digits. If 0 < nbDigit < 4 then the reshape is done,
-    else, the original version is returned.
-    Example : reshapeVersion("v2.0.1", 2) ==> "2.0" and reshapeVersion("version2.0.1.3.2.5", 4) ==> "version2.0.1.3.2.5"
-    """
-    p = re.compile(RGX_PROJECT_VERSION)
-    m = p.search(version)
-    if m is not None:
-        n0 = m.group("n0")
-        n1 = m.group("n1")
-        n2 = m.group("n2")
-        if nb_digit == 1:
-            return n0
-        elif nb_digit == 2:
-            return n0 + ("." + n1 if n1 is not None else "")
-        elif nb_digit == 3:
-            return n0 + (
-                "." + n1 + ("." + n2 if n2 is not None else "")
-                if n1 is not None
-                else ""
-            )
-
-    return version
-
-
-def get_class_pkg_version(
-    cls, print_dev_version: bool = True, nb_max_version_digits: int = 2
-):
-    p = re.compile(RGX_ENERGYML_MODULE_NAME)
-    m = p.search(
-        cls.__module__ if isinstance(cls, type) else type(cls).__module__
-    )
-    return reshape_version(m.group("versionNumber"), nb_max_version_digits) + (
-        m.group("versionDev")
-        if m.group("versionDev") is not None and print_dev_version
-        else ""
-    )
-
-
-# ProtocolDict = DefaultDict[str, MessageDict]
-# def get_all__classes() -> ProtocolDict:
-#     protocolDict: ProtocolDict = defaultdict(
-#         lambda: defaultdict(type(ETPModel))
-#     )
-#     package = energyml
-#     for _, modname, _ in pkgutil.walk_packages(
-#         path=getattr(package, "__path__"),
-#         prefix=package.__name__ + ".",
-#         onerror=lambda x: None,
-#     ):
-#         for classFound in list_classes(modname):
-#             try:
-#                 schem = json.loads(avro_schema(classFound))
-#                 protocolId = schem["protocol"]
-#                 messageType = schem["messageType"]
-#                 protocolDict[protocolId][messageType] = classFound
-#             except Exception:
-#                 pass
-#     return protocolDict
-
-
-
-
-
-
-
-

Functions

-
-
-def dict_energyml_modules() ‑> List -
-
-

List all accessible energyml python modules -:return:

-
- -Expand source code - -
def dict_energyml_modules() -> List:
-    """
-    List all accessible energyml python modules
-    :return:
-    """
-    modules = {}
-
-    energyml_module = importlib.import_module("energyml")
-    # print("> energyml")
-
-    for mod in pkgutil.iter_modules(energyml_module.__path__):
-        # print(f"{mod.name}")
-        if mod.name in ENERGYML_MODULES_NAMES:
-            energyml_sub_module = importlib.import_module(
-                f"energyml.{mod.name}"
-            )
-            if mod.name not in modules:
-                modules[mod.name] = []
-            for sub_mod in pkgutil.iter_modules(energyml_sub_module.__path__):
-                modules[mod.name].append(sub_mod.name)
-                # modules[mod.name].append(re.sub(r"^\D*(?P<number>\d+(.\d+)*$)",
-                # r"\g<number>", sub_mod.name).replace("_", "."))
-    return modules
-
-
-
-def get_all_classes(module_name: str, version: str) ‑> dict -
-
-
-
- -Expand source code - -
def get_all_classes(module_name: str, version: str) -> dict:
-    result = {}
-    pkg_path = f"energyml.{module_name}.{version}"
-    package = importlib.import_module(pkg_path)
-    for _, modname, _ in pkgutil.walk_packages(
-        path=getattr(package, "__path__"),
-        prefix=package.__name__ + ".",
-        onerror=lambda x: None,
-    ):
-        result[pkg_path] = []
-        for classFound in list_classes(modname):
-            try:
-                result[pkg_path].append(classFound)
-            except Exception:
-                pass
-
-    return result
-
-
-
-def get_all_energyml_classes() ‑> dict -
-
-
-
- -Expand source code - -
def get_all_energyml_classes() -> dict:
-    result = {}
-    for mod_name, versions in dict_energyml_modules().items():
-        for version in versions:
-            result = result | get_all_classes(mod_name, version)
-    return result
-
-
-
-def get_class_pkg(cls) -
-
-
-
- -Expand source code - -
def get_class_pkg(cls):
-    try:
-        p = re.compile(RGX_ENERGYML_MODULE_NAME)
-        m = p.search(cls.__module__)
-        return m.group("pkg")
-    except AttributeError as e:
-        print(f"Exception to get class package for '{cls}'")
-        raise e
-
-
-
-def get_class_pkg_version(cls, print_dev_version: bool = True, nb_max_version_digits: int = 2) -
-
-
-
- -Expand source code - -
def get_class_pkg_version(
-    cls, print_dev_version: bool = True, nb_max_version_digits: int = 2
-):
-    p = re.compile(RGX_ENERGYML_MODULE_NAME)
-    m = p.search(
-        cls.__module__ if isinstance(cls, type) else type(cls).__module__
-    )
-    return reshape_version(m.group("versionNumber"), nb_max_version_digits) + (
-        m.group("versionDev")
-        if m.group("versionDev") is not None and print_dev_version
-        else ""
-    )
-
-
-
-def get_classes_matching_name(cls: type, name_rgx: str, re_flags=re.IGNORECASE) ‑> List[type] -
-
-

Search a class matching the regex @re_flags. The search is the energyml packages related to the objet type @cls. -:param cls: -:param name_rgx: -:param re_flags: -:return:

-
- -Expand source code - -
def get_classes_matching_name(cls: type, name_rgx: str, re_flags=re.IGNORECASE,) -> List[type]:
-    """
-    Search a class matching the regex @re_flags. The search is the energyml packages related to the objet type @cls.
-    :param cls:
-    :param name_rgx:
-    :param re_flags:
-    :return:
-    """
-    match_classes = []
-    for related in get_related_energyml_modules_name(cls):
-        try:
-            module = importlib.import_module(related)
-            for _, obj in inspect.getmembers(module):
-                if inspect.isclass(obj) and re.match(name_rgx, obj.__name__, re_flags):
-                    match_classes.append(obj)
-        except ModuleNotFoundError:
-            pass
-    return list(dict.fromkeys(match_classes))
-
-
- -
-

Return the list of all energyml modules related to another one. -For example resqml 2.0.1 is related to common 2.0 -:param cls: -:return:

-
- -Expand source code - -
def get_related_energyml_modules_name(cls: Union[type, Any]) -> List[str]:
-    """
-    Return the list of all energyml modules related to another one.
-    For example resqml 2.0.1 is related to common 2.0
-    :param cls:
-    :return:
-    """
-    if isinstance(cls, type):
-        for related in RELATED_MODULES:
-            if cls.__module__ in related:
-                return related
-    else:
-        return get_related_energyml_modules_name(type(cls))
-    return []
-
-
-
-def get_sub_classes(cls: type) ‑> List[type] -
-
-

Return all classes that extends the class :param:cls. -:param cls: -:return:

-
- -Expand source code - -
def get_sub_classes(cls: type) -> List[type]:
-    """
-    Return all classes that extends the class :param:`cls`.
-    :param cls:
-    :return:
-    """
-    sub_classes = []
-    for related in get_related_energyml_modules_name(cls):
-        try:
-            module = importlib.import_module(related)
-            for _, obj in inspect.getmembers(module):
-                if inspect.isclass(obj) and cls in obj.__bases__:
-                    sub_classes.append(obj)
-                    sub_classes = sub_classes + get_sub_classes(obj)
-        except ModuleNotFoundError:
-            pass
-    return list(dict.fromkeys(sub_classes))
-
-
-
-def list_classes(module_path: str) ‑> List -
-
-

List all accessible classes from a specific module -:param module_path: -:return:

-
- -Expand source code - -
def list_classes(module_path: str) -> List:
-    """
-    List all accessible classes from a specific module
-    :param module_path:
-    :return:
-    """
-    try:
-        module = importlib.import_module(module_path)
-        class_list = []
-        for _, obj in inspect.getmembers(module):
-            if inspect.isclass(obj):
-                class_list.append(obj)
-        return class_list
-    except ModuleNotFoundError:
-        print(f"Err : module {module_path} not found")
-        return []
-
-
-
-def list_energyml_modules() -
-
-
-
- -Expand source code - -
def list_energyml_modules():
-    try:
-        energyml_module = importlib.import_module("energyml")
-        modules = []
-        for obj in pkgutil.iter_modules(energyml_module.__path__):
-            # print(f"{obj.name}")
-            if obj.name in ENERGYML_MODULES_NAMES:
-                modules.append(obj.name)
-        return modules
-    except ModuleNotFoundError:
-        return []
-
-
-
-def reshape_version(version: str, nb_digit: int) ‑> str -
-
-

Reshape a project version to have only specific number of digits. If 0 < nbDigit < 4 then the reshape is done, -else, the original version is returned. -Example : reshapeVersion("v2.0.1", 2) ==> "2.0" and reshapeVersion("version2.0.1.3.2.5", 4) ==> "version2.0.1.3.2.5"

-
- -Expand source code - -
def reshape_version(version: str, nb_digit: int) -> str:
-    """
-    Reshape a project version to have only specific number of digits. If 0 < nbDigit < 4 then the reshape is done,
-    else, the original version is returned.
-    Example : reshapeVersion("v2.0.1", 2) ==> "2.0" and reshapeVersion("version2.0.1.3.2.5", 4) ==> "version2.0.1.3.2.5"
-    """
-    p = re.compile(RGX_PROJECT_VERSION)
-    m = p.search(version)
-    if m is not None:
-        n0 = m.group("n0")
-        n1 = m.group("n1")
-        n2 = m.group("n2")
-        if nb_digit == 1:
-            return n0
-        elif nb_digit == 2:
-            return n0 + ("." + n1 if n1 is not None else "")
-        elif nb_digit == 3:
-            return n0 + (
-                "." + n1 + ("." + n2 if n2 is not None else "")
-                if n1 is not None
-                else ""
-            )
-
-    return version
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/energyml-utils/docs/src/energyml/utils/serialization.html b/energyml-utils/docs/src/energyml/utils/serialization.html deleted file mode 100644 index bad0235..0000000 --- a/energyml-utils/docs/src/energyml/utils/serialization.html +++ /dev/null @@ -1,305 +0,0 @@ - - - - - - -src.energyml.utils.serialization API documentation - - - - - - - - - - - -
-
-
-

Module src.energyml.utils.serialization

-
-
-
- -Expand source code - -
# Copyright (c) 2023-2024 Geosiris.
-# SPDX-License-Identifier: Apache-2.0
-from io import BytesIO
-from typing import Optional, Any
-
-import energyml
-from xsdata.exceptions import ParserError
-from xsdata.formats.dataclass.context import XmlContext
-from xsdata.formats.dataclass.parsers import XmlParser
-from xsdata.formats.dataclass.serializers import JsonSerializer
-from xsdata.formats.dataclass.serializers import XmlSerializer
-from xsdata.formats.dataclass.serializers.config import SerializerConfig
-
-from .introspection import get_class_from_name
-from .xml import get_class_name_from_xml, get_tree
-
-
-def read_energyml_xml_bytes_as_class(file: bytes, obj_class: type) -> Any:
-    """
-    Read an xml file into the instance of type :param:`obj_class`.
-    :param file:
-    :param obj_class:
-    :return:
-    """
-    parser = XmlParser()
-    try:
-        return parser.from_bytes(file, obj_class)
-    except ParserError as e:
-        print(f"Failed to parse file {file} as class {obj_class}")
-        raise e
-
-
-def read_energyml_xml_bytes(file: bytes) -> Any:
-    """
-    Read an xml file. The type of object is searched from the xml root name.
-    :param file:
-    :return:
-    """
-    return read_energyml_xml_bytes_as_class(
-        file, get_class_from_name(get_class_name_from_xml(get_tree(file)))
-    )
-
-
-def read_energyml_xml_io(file: BytesIO, obj_class: Optional[type] = None) -> Any:
-    if obj_class is not None:
-        return read_energyml_xml_bytes_as_class(file.getbuffer(), obj_class)
-    else:
-        return read_energyml_xml_bytes(file.getbuffer())
-
-
-def read_energyml_xml_str(file_content: str) -> Any:
-    parser = XmlParser()
-    # from energyml.resqml.v2_2.resqmlv2 import TriangulatedSetRepresentation
-    return parser.from_string(
-        file_content,
-        get_class_from_name(get_class_name_from_xml(get_tree(file_content))),
-    )  # , TriangulatedSetRepresentation)
-
-
-def read_energyml_xml_file(file_path: str) -> Any:
-    xml_content = ""
-    with open(file_path, "r") as f:
-        xml_content = f.read()
-    parser = XmlParser()
-    # from energyml.resqml.v2_2.resqmlv2 import TriangulatedSetRepresentation
-    # return parser.parse(file_path)  # , TriangulatedSetRepresentation)
-    return parser.parse(
-        file_path,
-        get_class_from_name(get_class_name_from_xml(get_tree(xml_content))),
-    )
-
-
-def serialize_xml(obj) -> str:
-    context = XmlContext(
-        # element_name_generator=text.camel_case,
-        # attribute_name_generator=text.kebab_case
-    )
-    serializer_config = SerializerConfig(indent="  ")
-    serializer = XmlSerializer(context=context, config=serializer_config)
-    return serializer.render(obj)
-
-
-def serialize_json(obj) -> str:
-    context = XmlContext(
-        # element_name_generator=text.camel_case,
-        # attribute_name_generator=text.kebab_case
-    )
-    serializer_config = SerializerConfig(indent="  ")
-    serializer = JsonSerializer(context=context, config=serializer_config)
-    return serializer.render(obj)
-
-
-
-
-
-
-
-

Functions

-
-
-def read_energyml_xml_bytes(file: bytes) ‑> Any -
-
-

Read an xml file. The type of object is searched from the xml root name. -:param file: -:return:

-
- -Expand source code - -
def read_energyml_xml_bytes(file: bytes) -> Any:
-    """
-    Read an xml file. The type of object is searched from the xml root name.
-    :param file:
-    :return:
-    """
-    return read_energyml_xml_bytes_as_class(
-        file, get_class_from_name(get_class_name_from_xml(get_tree(file)))
-    )
-
-
-
-def read_energyml_xml_bytes_as_class(file: bytes, obj_class: type) ‑> Any -
-
-

Read an xml file into the instance of type :param:obj_class. -:param file: -:param obj_class: -:return:

-
- -Expand source code - -
def read_energyml_xml_bytes_as_class(file: bytes, obj_class: type) -> Any:
-    """
-    Read an xml file into the instance of type :param:`obj_class`.
-    :param file:
-    :param obj_class:
-    :return:
-    """
-    parser = XmlParser()
-    try:
-        return parser.from_bytes(file, obj_class)
-    except ParserError as e:
-        print(f"Failed to parse file {file} as class {obj_class}")
-        raise e
-
-
-
-def read_energyml_xml_file(file_path: str) ‑> Any -
-
-
-
- -Expand source code - -
def read_energyml_xml_file(file_path: str) -> Any:
-    xml_content = ""
-    with open(file_path, "r") as f:
-        xml_content = f.read()
-    parser = XmlParser()
-    # from energyml.resqml.v2_2.resqmlv2 import TriangulatedSetRepresentation
-    # return parser.parse(file_path)  # , TriangulatedSetRepresentation)
-    return parser.parse(
-        file_path,
-        get_class_from_name(get_class_name_from_xml(get_tree(xml_content))),
-    )
-
-
-
-def read_energyml_xml_io(file: _io.BytesIO, obj_class: Optional[type] = None) ‑> Any -
-
-
-
- -Expand source code - -
def read_energyml_xml_io(file: BytesIO, obj_class: Optional[type] = None) -> Any:
-    if obj_class is not None:
-        return read_energyml_xml_bytes_as_class(file.getbuffer(), obj_class)
-    else:
-        return read_energyml_xml_bytes(file.getbuffer())
-
-
-
-def read_energyml_xml_str(file_content: str) ‑> Any -
-
-
-
- -Expand source code - -
def read_energyml_xml_str(file_content: str) -> Any:
-    parser = XmlParser()
-    # from energyml.resqml.v2_2.resqmlv2 import TriangulatedSetRepresentation
-    return parser.from_string(
-        file_content,
-        get_class_from_name(get_class_name_from_xml(get_tree(file_content))),
-    )  # , TriangulatedSetRepresentation)
-
-
-
-def serialize_json(obj) ‑> str -
-
-
-
- -Expand source code - -
def serialize_json(obj) -> str:
-    context = XmlContext(
-        # element_name_generator=text.camel_case,
-        # attribute_name_generator=text.kebab_case
-    )
-    serializer_config = SerializerConfig(indent="  ")
-    serializer = JsonSerializer(context=context, config=serializer_config)
-    return serializer.render(obj)
-
-
-
-def serialize_xml(obj) ‑> str -
-
-
-
- -Expand source code - -
def serialize_xml(obj) -> str:
-    context = XmlContext(
-        # element_name_generator=text.camel_case,
-        # attribute_name_generator=text.kebab_case
-    )
-    serializer_config = SerializerConfig(indent="  ")
-    serializer = XmlSerializer(context=context, config=serializer_config)
-    return serializer.render(obj)
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/energyml-utils/docs/src/energyml/utils/validation.html b/energyml-utils/docs/src/energyml/utils/validation.html deleted file mode 100644 index e5a5fca..0000000 --- a/energyml-utils/docs/src/energyml/utils/validation.html +++ /dev/null @@ -1,984 +0,0 @@ - - - - - - -src.energyml.utils.validation API documentation - - - - - - - - - - - -
-
-
-

Module src.energyml.utils.validation

-
-
-
- -Expand source code - -
# Copyright (c) 2023-2024 Geosiris.
-# SPDX-License-Identifier: Apache-2.0
-import re
-from dataclasses import dataclass, field, Field
-from enum import Enum
-from typing import Any, List
-
-from .epc import (
-    get_obj_identifier, Epc,
-)
-from .introspection import (
-    get_class_fields,
-    get_object_attribute,
-    search_attribute_matching_type_with_path,
-    get_object_attribute_no_verif,
-    get_object_attribute_rgx,
-    get_matching_class_attribute_name, get_obj_uuid, get_obj_version, get_content_type_from_class,
-    get_qualified_type_from_class,
-)
-
-
-class ErrorType(Enum):
-    CRITICAL = "critical"
-    DEBUG = "debug"
-    INFO = "info"
-    WARNING = "warning"
-
-
-@dataclass
-class ValidationError:
-
-    msg: str = field(default="Validation error")
-
-    error_type: ErrorType = field(default=ErrorType.INFO)
-
-    def __str__(self):
-        return f"[{str(self.error_type).upper()}] : {self.msg}"
-
-
-@dataclass
-class ValidationObjectError(ValidationError):
-
-    target_obj: Any = field(default=None)
-
-    attribute_dot_path: str = field(default=None)
-
-    def __str__(self):
-        return f"{ValidationError.__str__(self)}\n\t{get_obj_identifier(self.target_obj)} : '{self.attribute_dot_path}'"
-
-
-@dataclass
-class MandatoryError(ValidationObjectError):
-    def __str__(self):
-        return f"{ValidationError.__str__(self)}\n\tMandatory value is None for {get_obj_identifier(self.target_obj)} : '{self.attribute_dot_path}'"
-
-
-def validate_epc(epc: Epc) -> List[ValidationError]:
-    """
-    Verify if all :param:`epc`'s objects are valid.
-    :param epc:
-    :return:
-    """
-    errs = []
-    for obj in epc.energyml_objects:
-        errs = errs + patterns_verification(obj)
-
-    errs = errs + dor_verification(epc.energyml_objects)
-
-    return errs
-
-
-def dor_verification(energyml_objects: List[Any]) -> List[ValidationError]:
-    """
-    Verification for DOR. An error is raised if DORs contains wrong information, or if a referenced object is unknown
-    in the :param:`epc`.
-    :param energyml_objects:
-    :return:
-    """
-    errs = []
-
-    dict_obj_identifier = {
-        get_obj_identifier(obj): obj for obj in energyml_objects
-    }
-    dict_obj_uuid = {}
-    for obj in energyml_objects:
-        uuid = get_obj_uuid(obj)
-        if uuid not in dict_obj_uuid:
-            dict_obj_uuid[uuid] = []
-        dict_obj_uuid[uuid].append(obj)
-
-    # TODO: chercher dans les objets les AbstractObject (en Witsml des sous objet peuvent etre aussi references)
-
-    for obj in energyml_objects:
-        dor_list = search_attribute_matching_type_with_path(
-            obj, "DataObjectReference"
-        )
-        for dor_path, dor in dor_list:
-            dor_target_id = get_obj_identifier(dor)
-            if dor_target_id not in dict_obj_identifier:
-                dor_uuid = get_obj_uuid(dor)
-                dor_version = get_obj_version(dor)
-                if dor_uuid not in dict_obj_uuid:
-                    errs.append(
-                        ValidationObjectError(
-                            error_type=ErrorType.CRITICAL,
-                            target_obj=obj,
-                            attribute_dot_path=dor_path,
-                            msg=f"[DOR ERR] has wrong information. Unkown object with uuid '{dor_uuid}'",
-                        )
-                    )
-                else:
-                    accessible_version = [
-                        get_obj_version(ref_obj)
-                        for ref_obj in dict_obj_uuid[dor_uuid]
-                    ]
-                    errs.append(
-                        ValidationObjectError(
-                            error_type=ErrorType.CRITICAL,
-                            target_obj=obj,
-                            attribute_dot_path=dor_path,
-                            msg=f"[DOR ERR] has wrong information. Unkown object version '{dor_version}'. "
-                            f"Version must be one of {accessible_version}",
-                        )
-                    )
-            else:
-                target = dict_obj_identifier[dor_target_id]
-                target_title = get_object_attribute_rgx(
-                    target, "citation.title"
-                )
-                target_content_type = get_content_type_from_class(target)
-                target_qualified_type = get_qualified_type_from_class(target)
-
-                dor_title = get_object_attribute_rgx(dor, "title")
-
-                if dor_title != target_title:
-                    errs.append(
-                        ValidationObjectError(
-                            error_type=ErrorType.CRITICAL,
-                            target_obj=obj,
-                            attribute_dot_path=dor_path,
-                            msg=f"[DOR ERR] has wrong information. Title should be '{target_title}' and not '{dor_title}'",
-                        )
-                    )
-
-                if (
-                    get_matching_class_attribute_name(dor, "content_type")
-                    is not None
-                ):
-                    dor_content_type = get_object_attribute_no_verif(
-                        dor, "content_type"
-                    )
-                    if dor_content_type != target_content_type:
-                        errs.append(
-                            ValidationObjectError(
-                                error_type=ErrorType.CRITICAL,
-                                target_obj=obj,
-                                attribute_dot_path=dor_path,
-                                msg=f"[DOR ERR] has wrong information. ContentType should be '{target_content_type}' and not '{dor_content_type}'",
-                            )
-                        )
-
-                if (
-                    get_matching_class_attribute_name(dor, "qualified_type")
-                    is not None
-                ):
-                    dor_qualified_type = get_object_attribute_no_verif(
-                        dor, "qualified_type"
-                    )
-                    if dor_qualified_type != target_qualified_type:
-                        errs.append(
-                            ValidationObjectError(
-                                error_type=ErrorType.CRITICAL,
-                                target_obj=obj,
-                                attribute_dot_path=dor_path,
-                                msg=f"[DOR ERR] has wrong information. QualifiedType should be '{target_qualified_type}' and not '{dor_qualified_type}'",
-                            )
-                        )
-
-    return errs
-
-
-def patterns_verification(obj: Any) -> List[ValidationError]:
-    """
-    Verification on object values, using the patterns defined in the original energyml xsd files.
-    :param obj:
-    :return:
-    """
-    return _patterns_verification(obj, obj, "")
-
-
-def _patterns_verification(
-    obj: Any, root_obj: Any, current_attribute_dot_path: str = ""
-) -> List[ValidationError]:
-    """
-    Verification on object values, using the patterns defined in the original energyml xsd files.
-    :param obj:
-    :param root_obj:
-    :param current_attribute_dot_path:
-    :return:
-    """
-    error_list = []
-
-    if isinstance(obj, list):
-        cpt = 0
-        for val in obj:
-            error_list = error_list + _patterns_verification(
-                val, root_obj, f"{current_attribute_dot_path}.{cpt}"
-            )
-            cpt = cpt + 1
-    elif isinstance(obj, dict):
-        for k, val in obj.items():
-            error_list = error_list + _patterns_verification(
-                val, root_obj, f"{current_attribute_dot_path}.{k}"
-            )
-    else:
-        # print(get_class_fields(obj))
-        for att_name, att_field in get_class_fields(obj).items():
-            # print(f"att_name : {att_field.metadata}")
-            error_list = error_list + validate_attribute(
-                get_object_attribute(obj, att_name, False),
-                root_obj,
-                att_field,
-                f"{current_attribute_dot_path}.{att_name}",
-            )
-
-    return error_list
-
-
-def validate_attribute(
-    value: Any, root_obj: Any, att_field: Field, path: str
-) -> List[ValidationError]:
-    errs = []
-
-    if value is None:
-        if att_field.metadata.get("required", False):
-            errs.append(
-                MandatoryError(
-                    error_type=ErrorType.CRITICAL,
-                    target_obj=root_obj,
-                    attribute_dot_path=path,
-                )
-            )
-    else:
-        min_length = att_field.metadata.get("min_length", None)
-        max_length = att_field.metadata.get("max_length", None)
-        pattern = att_field.metadata.get("pattern", None)
-        min_occurs = att_field.metadata.get("pattern", None)
-        min_inclusive = att_field.metadata.get("pattern", None)
-        # white_space
-
-        if max_length is not None:
-            length = len(value)
-            if length > max_length:
-                errs.append(
-                    ValidationObjectError(
-                        error_type=ErrorType.CRITICAL,
-                        target_obj=root_obj,
-                        attribute_dot_path=path,
-                        msg=f"Max length was {max_length} but found {length}",
-                    )
-                )
-
-        if min_length is not None:
-            length = len(value)
-            if length < min_length:
-                errs.append(
-                    ValidationObjectError(
-                        error_type=ErrorType.CRITICAL,
-                        target_obj=root_obj,
-                        attribute_dot_path=path,
-                        msg=f"Max length was {min_length} but found {length}",
-                    )
-                )
-
-        if min_occurs is not None:
-            if isinstance(value, list) and min_occurs > len(value):
-                errs.append(
-                    ValidationObjectError(
-                        error_type=ErrorType.CRITICAL,
-                        target_obj=root_obj,
-                        attribute_dot_path=path,
-                        msg=f"Min occurs was {min_occurs} but found {len(value)}",
-                    )
-                )
-
-        if min_inclusive is not None:
-            potential_err = ValidationObjectError(
-                error_type=ErrorType.CRITICAL,
-                target_obj=root_obj,
-                attribute_dot_path=path,
-                msg=f"Min occurs was {min_inclusive} but found {len(value)}",
-            )
-            if isinstance(value, list):
-                for val in value:
-                    if (
-                            (isinstance(val, str) and len(val) > min_inclusive)
-                            or ((isinstance(val, int) or isinstance(val, float)) and val > min_inclusive)
-                    ):
-                        errs.append(potential_err)
-
-        if pattern is not None:
-            if re.match(pattern, value) is None:
-                errs.append(
-                    ValidationObjectError(
-                        error_type=ErrorType.CRITICAL,
-                        target_obj=root_obj,
-                        attribute_dot_path=path,
-                        msg=f"Pattern error. Value '{value}' was supposed to respect pattern '{pattern}'",
-                    )
-                )
-
-    return errs + _patterns_verification(
-        obj=value,
-        root_obj=root_obj,
-        current_attribute_dot_path=path,
-    )
-
-
-def correct_dor(energyml_objects: List[Any]) -> None:
-    """
-    Fix DOR errors (missing object_version, wrong title, wrong content-type/qualified-type ...)
-    :param energyml_objects:
-    :return:
-    """
-    dict_obj_identifier = {
-        get_obj_identifier(obj): obj for obj in energyml_objects
-    }
-    dict_obj_uuid = {}
-    for obj in energyml_objects:
-        uuid = get_obj_uuid(obj)
-        if uuid not in dict_obj_uuid:
-            dict_obj_uuid[uuid] = []
-        dict_obj_uuid[uuid].append(obj)
-
-    # TODO: chercher dans les objets les AbstractObject (en Witsml des sous objet peuvent etre aussi references)
-
-    for obj in energyml_objects:
-        dor_list = search_attribute_matching_type_with_path(
-            obj, "DataObjectReference"
-        )
-        for dor_path, dor in dor_list:
-            dor_target_id = get_obj_identifier(dor)
-            if dor_target_id in dict_obj_identifier:
-                target = dict_obj_identifier[dor_target_id]
-                target_title = get_object_attribute_rgx(
-                    target, "citation.title"
-                )
-                target_content_type = get_content_type_from_class(target)
-                target_qualified_type = get_qualified_type_from_class(target)
-
-                dor_title = get_object_attribute_rgx(dor, "title")
-
-                if dor_title != target_title:
-                    dor.title = target_title
-
-                if (
-                    get_matching_class_attribute_name(dor, "content_type")
-                    is not None
-                ):
-                    dor_content_type = get_object_attribute_no_verif(
-                        dor, "content_type"
-                    )
-                    if dor_content_type != target_content_type:
-                        dor.content_type = target_content_type
-
-                if (
-                    get_matching_class_attribute_name(dor, "qualified_type")
-                    is not None
-                ):
-                    dor_qualified_type = get_object_attribute_no_verif(
-                        dor, "qualified_type"
-                    )
-                    if dor_qualified_type != target_qualified_type:
-                        dor.qualified_type = target_qualified_type
-
-
-
-
-
-
-
-

Functions

-
-
-def correct_dor(energyml_objects: List[Any]) ‑> None -
-
-

Fix DOR errors (missing object_version, wrong title, wrong content-type/qualified-type …) -:param energyml_objects: -:return:

-
- -Expand source code - -
def correct_dor(energyml_objects: List[Any]) -> None:
-    """
-    Fix DOR errors (missing object_version, wrong title, wrong content-type/qualified-type ...)
-    :param energyml_objects:
-    :return:
-    """
-    dict_obj_identifier = {
-        get_obj_identifier(obj): obj for obj in energyml_objects
-    }
-    dict_obj_uuid = {}
-    for obj in energyml_objects:
-        uuid = get_obj_uuid(obj)
-        if uuid not in dict_obj_uuid:
-            dict_obj_uuid[uuid] = []
-        dict_obj_uuid[uuid].append(obj)
-
-    # TODO: chercher dans les objets les AbstractObject (en Witsml des sous objet peuvent etre aussi references)
-
-    for obj in energyml_objects:
-        dor_list = search_attribute_matching_type_with_path(
-            obj, "DataObjectReference"
-        )
-        for dor_path, dor in dor_list:
-            dor_target_id = get_obj_identifier(dor)
-            if dor_target_id in dict_obj_identifier:
-                target = dict_obj_identifier[dor_target_id]
-                target_title = get_object_attribute_rgx(
-                    target, "citation.title"
-                )
-                target_content_type = get_content_type_from_class(target)
-                target_qualified_type = get_qualified_type_from_class(target)
-
-                dor_title = get_object_attribute_rgx(dor, "title")
-
-                if dor_title != target_title:
-                    dor.title = target_title
-
-                if (
-                    get_matching_class_attribute_name(dor, "content_type")
-                    is not None
-                ):
-                    dor_content_type = get_object_attribute_no_verif(
-                        dor, "content_type"
-                    )
-                    if dor_content_type != target_content_type:
-                        dor.content_type = target_content_type
-
-                if (
-                    get_matching_class_attribute_name(dor, "qualified_type")
-                    is not None
-                ):
-                    dor_qualified_type = get_object_attribute_no_verif(
-                        dor, "qualified_type"
-                    )
-                    if dor_qualified_type != target_qualified_type:
-                        dor.qualified_type = target_qualified_type
-
-
-
-def dor_verification(energyml_objects: List[Any]) ‑> List[ValidationError] -
-
-

Verification for DOR. An error is raised if DORs contains wrong information, or if a referenced object is unknown -in the :param:epc. -:param energyml_objects: -:return:

-
- -Expand source code - -
def dor_verification(energyml_objects: List[Any]) -> List[ValidationError]:
-    """
-    Verification for DOR. An error is raised if DORs contains wrong information, or if a referenced object is unknown
-    in the :param:`epc`.
-    :param energyml_objects:
-    :return:
-    """
-    errs = []
-
-    dict_obj_identifier = {
-        get_obj_identifier(obj): obj for obj in energyml_objects
-    }
-    dict_obj_uuid = {}
-    for obj in energyml_objects:
-        uuid = get_obj_uuid(obj)
-        if uuid not in dict_obj_uuid:
-            dict_obj_uuid[uuid] = []
-        dict_obj_uuid[uuid].append(obj)
-
-    # TODO: chercher dans les objets les AbstractObject (en Witsml des sous objet peuvent etre aussi references)
-
-    for obj in energyml_objects:
-        dor_list = search_attribute_matching_type_with_path(
-            obj, "DataObjectReference"
-        )
-        for dor_path, dor in dor_list:
-            dor_target_id = get_obj_identifier(dor)
-            if dor_target_id not in dict_obj_identifier:
-                dor_uuid = get_obj_uuid(dor)
-                dor_version = get_obj_version(dor)
-                if dor_uuid not in dict_obj_uuid:
-                    errs.append(
-                        ValidationObjectError(
-                            error_type=ErrorType.CRITICAL,
-                            target_obj=obj,
-                            attribute_dot_path=dor_path,
-                            msg=f"[DOR ERR] has wrong information. Unkown object with uuid '{dor_uuid}'",
-                        )
-                    )
-                else:
-                    accessible_version = [
-                        get_obj_version(ref_obj)
-                        for ref_obj in dict_obj_uuid[dor_uuid]
-                    ]
-                    errs.append(
-                        ValidationObjectError(
-                            error_type=ErrorType.CRITICAL,
-                            target_obj=obj,
-                            attribute_dot_path=dor_path,
-                            msg=f"[DOR ERR] has wrong information. Unkown object version '{dor_version}'. "
-                            f"Version must be one of {accessible_version}",
-                        )
-                    )
-            else:
-                target = dict_obj_identifier[dor_target_id]
-                target_title = get_object_attribute_rgx(
-                    target, "citation.title"
-                )
-                target_content_type = get_content_type_from_class(target)
-                target_qualified_type = get_qualified_type_from_class(target)
-
-                dor_title = get_object_attribute_rgx(dor, "title")
-
-                if dor_title != target_title:
-                    errs.append(
-                        ValidationObjectError(
-                            error_type=ErrorType.CRITICAL,
-                            target_obj=obj,
-                            attribute_dot_path=dor_path,
-                            msg=f"[DOR ERR] has wrong information. Title should be '{target_title}' and not '{dor_title}'",
-                        )
-                    )
-
-                if (
-                    get_matching_class_attribute_name(dor, "content_type")
-                    is not None
-                ):
-                    dor_content_type = get_object_attribute_no_verif(
-                        dor, "content_type"
-                    )
-                    if dor_content_type != target_content_type:
-                        errs.append(
-                            ValidationObjectError(
-                                error_type=ErrorType.CRITICAL,
-                                target_obj=obj,
-                                attribute_dot_path=dor_path,
-                                msg=f"[DOR ERR] has wrong information. ContentType should be '{target_content_type}' and not '{dor_content_type}'",
-                            )
-                        )
-
-                if (
-                    get_matching_class_attribute_name(dor, "qualified_type")
-                    is not None
-                ):
-                    dor_qualified_type = get_object_attribute_no_verif(
-                        dor, "qualified_type"
-                    )
-                    if dor_qualified_type != target_qualified_type:
-                        errs.append(
-                            ValidationObjectError(
-                                error_type=ErrorType.CRITICAL,
-                                target_obj=obj,
-                                attribute_dot_path=dor_path,
-                                msg=f"[DOR ERR] has wrong information. QualifiedType should be '{target_qualified_type}' and not '{dor_qualified_type}'",
-                            )
-                        )
-
-    return errs
-
-
-
-def patterns_verification(obj: Any) ‑> List[ValidationError] -
-
-

Verification on object values, using the patterns defined in the original energyml xsd files. -:param obj: -:return:

-
- -Expand source code - -
def patterns_verification(obj: Any) -> List[ValidationError]:
-    """
-    Verification on object values, using the patterns defined in the original energyml xsd files.
-    :param obj:
-    :return:
-    """
-    return _patterns_verification(obj, obj, "")
-
-
-
-def validate_attribute(value: Any, root_obj: Any, att_field: dataclasses.Field, path: str) ‑> List[ValidationError] -
-
-
-
- -Expand source code - -
def validate_attribute(
-    value: Any, root_obj: Any, att_field: Field, path: str
-) -> List[ValidationError]:
-    errs = []
-
-    if value is None:
-        if att_field.metadata.get("required", False):
-            errs.append(
-                MandatoryError(
-                    error_type=ErrorType.CRITICAL,
-                    target_obj=root_obj,
-                    attribute_dot_path=path,
-                )
-            )
-    else:
-        min_length = att_field.metadata.get("min_length", None)
-        max_length = att_field.metadata.get("max_length", None)
-        pattern = att_field.metadata.get("pattern", None)
-        min_occurs = att_field.metadata.get("pattern", None)
-        min_inclusive = att_field.metadata.get("pattern", None)
-        # white_space
-
-        if max_length is not None:
-            length = len(value)
-            if length > max_length:
-                errs.append(
-                    ValidationObjectError(
-                        error_type=ErrorType.CRITICAL,
-                        target_obj=root_obj,
-                        attribute_dot_path=path,
-                        msg=f"Max length was {max_length} but found {length}",
-                    )
-                )
-
-        if min_length is not None:
-            length = len(value)
-            if length < min_length:
-                errs.append(
-                    ValidationObjectError(
-                        error_type=ErrorType.CRITICAL,
-                        target_obj=root_obj,
-                        attribute_dot_path=path,
-                        msg=f"Max length was {min_length} but found {length}",
-                    )
-                )
-
-        if min_occurs is not None:
-            if isinstance(value, list) and min_occurs > len(value):
-                errs.append(
-                    ValidationObjectError(
-                        error_type=ErrorType.CRITICAL,
-                        target_obj=root_obj,
-                        attribute_dot_path=path,
-                        msg=f"Min occurs was {min_occurs} but found {len(value)}",
-                    )
-                )
-
-        if min_inclusive is not None:
-            potential_err = ValidationObjectError(
-                error_type=ErrorType.CRITICAL,
-                target_obj=root_obj,
-                attribute_dot_path=path,
-                msg=f"Min occurs was {min_inclusive} but found {len(value)}",
-            )
-            if isinstance(value, list):
-                for val in value:
-                    if (
-                            (isinstance(val, str) and len(val) > min_inclusive)
-                            or ((isinstance(val, int) or isinstance(val, float)) and val > min_inclusive)
-                    ):
-                        errs.append(potential_err)
-
-        if pattern is not None:
-            if re.match(pattern, value) is None:
-                errs.append(
-                    ValidationObjectError(
-                        error_type=ErrorType.CRITICAL,
-                        target_obj=root_obj,
-                        attribute_dot_path=path,
-                        msg=f"Pattern error. Value '{value}' was supposed to respect pattern '{pattern}'",
-                    )
-                )
-
-    return errs + _patterns_verification(
-        obj=value,
-        root_obj=root_obj,
-        current_attribute_dot_path=path,
-    )
-
-
-
-def validate_epc(epc: Epc) ‑> List[ValidationError] -
-
-

Verify if all :param:epc's objects are valid. -:param epc: -:return:

-
- -Expand source code - -
def validate_epc(epc: Epc) -> List[ValidationError]:
-    """
-    Verify if all :param:`epc`'s objects are valid.
-    :param epc:
-    :return:
-    """
-    errs = []
-    for obj in epc.energyml_objects:
-        errs = errs + patterns_verification(obj)
-
-    errs = errs + dor_verification(epc.energyml_objects)
-
-    return errs
-
-
-
-
-
-

Classes

-
-
-class ErrorType -(*args, **kwds) -
-
-

Create a collection of name/value pairs.

-

Example enumeration:

-
>>> class Color(Enum):
-...     RED = 1
-...     BLUE = 2
-...     GREEN = 3
-
-

Access them by:

-
    -
  • attribute access::
  • -
-
>>> Color.RED
-<Color.RED: 1>
-
-
    -
  • value lookup:
  • -
-
>>> Color(1)
-<Color.RED: 1>
-
-
    -
  • name lookup:
  • -
-
>>> Color['RED']
-<Color.RED: 1>
-
-

Enumerations can be iterated over, and know how many members they have:

-
>>> len(Color)
-3
-
-
>>> list(Color)
-[<Color.RED: 1>, <Color.BLUE: 2>, <Color.GREEN: 3>]
-
-

Methods can be added to enumerations, and members can have their own -attributes – see the documentation for details.

-
- -Expand source code - -
class ErrorType(Enum):
-    CRITICAL = "critical"
-    DEBUG = "debug"
-    INFO = "info"
-    WARNING = "warning"
-
-

Ancestors

-
    -
  • enum.Enum
  • -
-

Class variables

-
-
var CRITICAL
-
-
-
-
var DEBUG
-
-
-
-
var INFO
-
-
-
-
var WARNING
-
-
-
-
-
-
-class MandatoryError -(msg: str = 'Validation error', error_type: ErrorType = ErrorType.INFO, target_obj: Any = None, attribute_dot_path: str = None) -
-
-

MandatoryError(msg: str = 'Validation error', error_type: src.energyml.utils.validation.ErrorType = , target_obj: Any = None, attribute_dot_path: str = None)

-
- -Expand source code - -
@dataclass
-class MandatoryError(ValidationObjectError):
-    def __str__(self):
-        return f"{ValidationError.__str__(self)}\n\tMandatory value is None for {get_obj_identifier(self.target_obj)} : '{self.attribute_dot_path}'"
-
-

Ancestors

- -
-
-class ValidationError -(msg: str = 'Validation error', error_type: ErrorType = ErrorType.INFO) -
-
-

ValidationError(msg: str = 'Validation error', error_type: src.energyml.utils.validation.ErrorType = )

-
- -Expand source code - -
@dataclass
-class ValidationError:
-
-    msg: str = field(default="Validation error")
-
-    error_type: ErrorType = field(default=ErrorType.INFO)
-
-    def __str__(self):
-        return f"[{str(self.error_type).upper()}] : {self.msg}"
-
-

Subclasses

- -

Class variables

-
-
var error_typeErrorType
-
-
-
-
var msg : str
-
-
-
-
-
-
-class ValidationObjectError -(msg: str = 'Validation error', error_type: ErrorType = ErrorType.INFO, target_obj: Any = None, attribute_dot_path: str = None) -
-
-

ValidationObjectError(msg: str = 'Validation error', error_type: src.energyml.utils.validation.ErrorType = , target_obj: Any = None, attribute_dot_path: str = None)

-
- -Expand source code - -
@dataclass
-class ValidationObjectError(ValidationError):
-
-    target_obj: Any = field(default=None)
-
-    attribute_dot_path: str = field(default=None)
-
-    def __str__(self):
-        return f"{ValidationError.__str__(self)}\n\t{get_obj_identifier(self.target_obj)} : '{self.attribute_dot_path}'"
-
-

Ancestors

- -

Subclasses

- -

Class variables

-
-
var attribute_dot_path : str
-
-
-
-
var target_obj : Any
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/energyml-utils/docs/src/energyml/utils/xml.html b/energyml-utils/docs/src/energyml/utils/xml.html deleted file mode 100644 index a2100c4..0000000 --- a/energyml-utils/docs/src/energyml/utils/xml.html +++ /dev/null @@ -1,501 +0,0 @@ - - - - - - -src.energyml.utils.xml API documentation - - - - - - - - - - - -
-
-
-

Module src.energyml.utils.xml

-
-
-
- -Expand source code - -
# Copyright (c) 2023-2024 Geosiris.
-# SPDX-License-Identifier: Apache-2.0
-import re
-from io import BytesIO
-from typing import Optional, Any, Union
-
-from lxml import etree as ETREE  # type: Any
-
-ENERGYML_NAMESPACES = {
-    "eml": "http://www.energistics.org/energyml/data/commonv2",
-    "prodml": "http://www.energistics.org/energyml/data/prodmlv2",
-    "witsml": "http://www.energistics.org/energyml/data/witsmlv2",
-    "resqml": "http://www.energistics.org/energyml/data/resqmlv2",
-}
-"""
-dict of all energyml namespaces
-"""  # pylint: disable=W0105
-
-ENERGYML_NAMESPACES_PACKAGE = {
-    "eml": ["http://www.energistics.org/energyml/data/commonv2"],
-    "prodml": ["http://www.energistics.org/energyml/data/prodmlv2"],
-    "witsml": ["http://www.energistics.org/energyml/data/witsmlv2"],
-    "resqml": ["http://www.energistics.org/energyml/data/resqmlv2"],
-    "opc": [
-        "http://schemas.openxmlformats.org/package/2006/content-types",
-        "http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
-    ],
-}
-"""
-dict of all energyml namespace packages
-"""  # pylint: disable=W0105
-
-RGX_UUID_NO_GRP = (
-    r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
-)
-RGX_UUID = r"(?P<uuid>" + RGX_UUID_NO_GRP + ")"
-RGX_DOMAIN_VERSION = r"(?P<domainVersion>(?P<versionNum>([\d]+[\._])*\d)\s*(?P<dev>dev\s*(?P<devNum>[\d]+))?)"
-RGX_DOMAIN_VERSION_FLAT = r"(?P<domainVersion>(?P<versionNumFlat>([\d]+)*\d)\s*(?P<dev>dev\s*(?P<devNum>[\d]+))?)"
-
-
-# ContentType
-RGX_MIME_TYPE_MEDIA = r"(?P<media>application|audio|font|example|image|message|model|multipart|text|video)"
-RGX_CT_ENERGYML_DOMAIN = r"(?P<energymlDomain>x-(?P<domain>[\w]+)\+xml)"
-RGX_CT_XML_DOMAIN = r"(?P<xmlRawDomain>(x\-)?(?P<xmlDomain>.+)\+xml)"
-RGX_CT_TOKEN_VERSION = r"version=" + RGX_DOMAIN_VERSION
-RGX_CT_TOKEN_TYPE = r"type=(?P<type>[\w\_]+)"
-
-RGX_CONTENT_TYPE = (
-        RGX_MIME_TYPE_MEDIA + "/"
-        + "(?P<rawDomain>(" + RGX_CT_ENERGYML_DOMAIN + ")|(" + RGX_CT_XML_DOMAIN + r")|([\w-]+\.?)+)"
-        + "(;((" + RGX_CT_TOKEN_VERSION + ")|(" + RGX_CT_TOKEN_TYPE + ")))*"
-)
-RGX_QUALIFIED_TYPE = (
-        r"(?P<domain>[a-zA-Z]+)" + RGX_DOMAIN_VERSION_FLAT + r"\.(?P<type>[\w_]+)"
-)
-# =========
-
-RGX_SCHEMA_VERSION = (
-        r"(?P<name>[eE]ml|[cC]ommon|[rR]esqml|[wW]itsml|[pP]rodml)?\s*v?"
-        + RGX_DOMAIN_VERSION
-        + r"\s*$"
-)
-
-RGX_ENERGYML_FILE_NAME_OLD = r"(?P<type>[\w]+)_" + RGX_UUID_NO_GRP + r"\.xml$"
-RGX_ENERGYML_FILE_NAME_NEW = (
-        RGX_UUID_NO_GRP + r"\.(?P<objectVersion>\d+(\.\d+)*)\.xml$"
-)
-RGX_ENERGYML_FILE_NAME = (
-    rf"^(.*/)?({RGX_ENERGYML_FILE_NAME_OLD})|({RGX_ENERGYML_FILE_NAME_NEW})"
-)
-
-RGX_XML_HEADER = r"^\s*\<\?xml\s+((encoding\s*=\s*\"(?P<encoding>[^\"]+)\"|version\s*=\s*\"(?P<version>[^\"]+)\"|standalone\s*=\s*\"(?P<standalone>[^\"]+)\")\s+)+"
-
-
-def get_pkg_from_namespace(namespace: str) -> Optional[str]:
-    for (k, v) in ENERGYML_NAMESPACES_PACKAGE.items():
-        if namespace in v:
-            return k
-    return None
-
-
-def is_energyml_content_type(content_type: str) -> bool:
-    ct = parse_content_type(content_type)
-    return ct.group("domain") is not None
-
-
-def get_root_namespace(tree: ETREE.Element) -> str:
-    return tree.nsmap[tree.prefix]
-
-
-def get_class_name_from_xml(tree: ETREE.Element) -> str:
-    root_namespace = get_root_namespace(tree)
-    pkg = get_pkg_from_namespace(root_namespace)
-    if pkg is None:
-        print(f"No pkg found for elt {tree}")
-    else:
-        if pkg == "opc":
-            return "energyml.opc.opc." + get_root_type(tree)
-        else:
-            schema_version = find_schema_version_in_element(tree).replace(".", "_").replace("-", "_")
-            if pkg == "resqml" and schema_version == "2_0":
-                schema_version = "2_0_1"
-            return ("energyml." + pkg
-                    + ".v" + schema_version
-                    + "."
-                    + root_namespace[root_namespace.rindex("/") + 1:]
-                    + "." + get_root_type(tree)
-                    )
-
-
-def get_xml_encoding(xml_content: str) -> Optional[str]:
-    try:
-        m = re.search(RGX_XML_HEADER, xml_content)
-        return m.group("encoding")
-    except AttributeError:
-        return "utf-8"
-
-
-def get_tree(xml_content: Union[bytes, str]) -> ETREE.Element:
-    xml_bytes = xml_content
-    if isinstance(xml_bytes, str):
-        xml_bytes = xml_content.encode(encoding=get_xml_encoding(xml_content).strip().lower())
-
-    return ETREE.parse(BytesIO(xml_bytes)).getroot()
-
-
-def energyml_xpath(tree: ETREE.Element, xpath: str) -> Optional[list]:
-    """A xpath research that knows energyml namespaces"""
-    try:
-        return ETREE.XPath(xpath, namespaces=ENERGYML_NAMESPACES)(tree)
-    except TypeError:
-        return None
-
-
-def search_element_has_child_xpath(tree: ETREE.Element, child_name: str) -> list:
-    """
-    Search elements that has a child named (xml tag) as 'child_name'.
-    Warning : child_name must contain the namespace (see. ENERGYML_NAMESPACES)
-    """
-    return list(x for x in energyml_xpath(tree, f"//{child_name}/.."))
-
-
-def get_uuid(tree: ETREE.Element) -> str:
-    _uuids = tree.xpath("@uuid")
-    if len(_uuids) <= 0:
-        _uuids = tree.xpath("@UUID")
-    if len(_uuids) <= 0:
-        _uuids = tree.xpath("@uid")
-    if len(_uuids) <= 0:
-        _uuids = tree.xpath("@UID")
-    return _uuids[0]
-
-
-def get_root_type(tree: ETREE.Element) -> str:
-    """ Returns the type (xml tag) of the element without the namespace """
-    return tree.xpath("local-name()")
-
-
-def find_schema_version_in_element(tree: ETREE.ElementTree) -> str:
-    """Find the "SchemaVersion" inside an xml content of a energyml file
-
-    :param tree: An energyml xml file content.
-    :type tree: bytes
-
-    :returns: The SchemaVersion that contains only the version number. For example, if the xml
-        file contains : SchemaVersion="Resqml 2.0.1"
-            the result will be : "2.0.1"
-    :rtype: str
-    """
-    _schema_version = tree.xpath("@schemaVersion")
-    if _schema_version is None:
-        _schema_version = tree.xpath("@SchemaVersion")
-
-    if _schema_version is not None:
-        match_version = re.search(r"\d+(\.\d+)*", _schema_version[0])
-        if match_version is not None:
-            return match_version.group(0)
-    return ""
-
-
-def parse_content_type(ct: str):
-    return re.search(RGX_CONTENT_TYPE, ct)
-
-
-
-
-
-

Global variables

-
-
var ENERGYML_NAMESPACES
-
-

dict of all energyml namespaces

-
-
var ENERGYML_NAMESPACES_PACKAGE
-
-

dict of all energyml namespace packages

-
-
-
-
-

Functions

-
-
-def energyml_xpath(tree: , xpath: str) ‑> Optional[list] -
-
-

A xpath research that knows energyml namespaces

-
- -Expand source code - -
def energyml_xpath(tree: ETREE.Element, xpath: str) -> Optional[list]:
-    """A xpath research that knows energyml namespaces"""
-    try:
-        return ETREE.XPath(xpath, namespaces=ENERGYML_NAMESPACES)(tree)
-    except TypeError:
-        return None
-
-
-
-def find_schema_version_in_element(tree: ) ‑> str -
-
-

Find the "SchemaVersion" inside an xml content of a energyml file

-

:param tree: An energyml xml file content. -:type tree: bytes

-

:returns: The SchemaVersion that contains only the version number. For example, if the xml -file contains : SchemaVersion="Resqml 2.0.1" -the result will be : "2.0.1" -:rtype: str

-
- -Expand source code - -
def find_schema_version_in_element(tree: ETREE.ElementTree) -> str:
-    """Find the "SchemaVersion" inside an xml content of a energyml file
-
-    :param tree: An energyml xml file content.
-    :type tree: bytes
-
-    :returns: The SchemaVersion that contains only the version number. For example, if the xml
-        file contains : SchemaVersion="Resqml 2.0.1"
-            the result will be : "2.0.1"
-    :rtype: str
-    """
-    _schema_version = tree.xpath("@schemaVersion")
-    if _schema_version is None:
-        _schema_version = tree.xpath("@SchemaVersion")
-
-    if _schema_version is not None:
-        match_version = re.search(r"\d+(\.\d+)*", _schema_version[0])
-        if match_version is not None:
-            return match_version.group(0)
-    return ""
-
-
-
-def get_class_name_from_xml(tree: ) ‑> str -
-
-
-
- -Expand source code - -
def get_class_name_from_xml(tree: ETREE.Element) -> str:
-    root_namespace = get_root_namespace(tree)
-    pkg = get_pkg_from_namespace(root_namespace)
-    if pkg is None:
-        print(f"No pkg found for elt {tree}")
-    else:
-        if pkg == "opc":
-            return "energyml.opc.opc." + get_root_type(tree)
-        else:
-            schema_version = find_schema_version_in_element(tree).replace(".", "_").replace("-", "_")
-            if pkg == "resqml" and schema_version == "2_0":
-                schema_version = "2_0_1"
-            return ("energyml." + pkg
-                    + ".v" + schema_version
-                    + "."
-                    + root_namespace[root_namespace.rindex("/") + 1:]
-                    + "." + get_root_type(tree)
-                    )
-
-
-
-def get_pkg_from_namespace(namespace: str) ‑> Optional[str] -
-
-
-
- -Expand source code - -
def get_pkg_from_namespace(namespace: str) -> Optional[str]:
-    for (k, v) in ENERGYML_NAMESPACES_PACKAGE.items():
-        if namespace in v:
-            return k
-    return None
-
-
-
-def get_root_namespace(tree: ) ‑> str -
-
-
-
- -Expand source code - -
def get_root_namespace(tree: ETREE.Element) -> str:
-    return tree.nsmap[tree.prefix]
-
-
-
-def get_root_type(tree: ) ‑> str -
-
-

Returns the type (xml tag) of the element without the namespace

-
- -Expand source code - -
def get_root_type(tree: ETREE.Element) -> str:
-    """ Returns the type (xml tag) of the element without the namespace """
-    return tree.xpath("local-name()")
-
-
-
-def get_tree(xml_content: Union[bytes, str]) ‑>  -
-
-
-
- -Expand source code - -
def get_tree(xml_content: Union[bytes, str]) -> ETREE.Element:
-    xml_bytes = xml_content
-    if isinstance(xml_bytes, str):
-        xml_bytes = xml_content.encode(encoding=get_xml_encoding(xml_content).strip().lower())
-
-    return ETREE.parse(BytesIO(xml_bytes)).getroot()
-
-
-
-def get_uuid(tree: ) ‑> str -
-
-
-
- -Expand source code - -
def get_uuid(tree: ETREE.Element) -> str:
-    _uuids = tree.xpath("@uuid")
-    if len(_uuids) <= 0:
-        _uuids = tree.xpath("@UUID")
-    if len(_uuids) <= 0:
-        _uuids = tree.xpath("@uid")
-    if len(_uuids) <= 0:
-        _uuids = tree.xpath("@UID")
-    return _uuids[0]
-
-
-
-def get_xml_encoding(xml_content: str) ‑> Optional[str] -
-
-
-
- -Expand source code - -
def get_xml_encoding(xml_content: str) -> Optional[str]:
-    try:
-        m = re.search(RGX_XML_HEADER, xml_content)
-        return m.group("encoding")
-    except AttributeError:
-        return "utf-8"
-
-
-
-def is_energyml_content_type(content_type: str) ‑> bool -
-
-
-
- -Expand source code - -
def is_energyml_content_type(content_type: str) -> bool:
-    ct = parse_content_type(content_type)
-    return ct.group("domain") is not None
-
-
-
-def parse_content_type(ct: str) -
-
-
-
- -Expand source code - -
def parse_content_type(ct: str):
-    return re.search(RGX_CONTENT_TYPE, ct)
-
-
-
-def search_element_has_child_xpath(tree: , child_name: str) ‑> list -
-
-

Search elements that has a child named (xml tag) as 'child_name'. -Warning : child_name must contain the namespace (see. ENERGYML_NAMESPACES)

-
- -Expand source code - -
def search_element_has_child_xpath(tree: ETREE.Element, child_name: str) -> list:
-    """
-    Search elements that has a child named (xml tag) as 'child_name'.
-    Warning : child_name must contain the namespace (see. ENERGYML_NAMESPACES)
-    """
-    return list(x for x in energyml_xpath(tree, f"//{child_name}/.."))
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/energyml-utils/docs/src/index.html b/energyml-utils/docs/src/index.html deleted file mode 100644 index 50b221a..0000000 --- a/energyml-utils/docs/src/index.html +++ /dev/null @@ -1,60 +0,0 @@ - - - - - - -src API documentation - - - - - - - - - - - -
-
-
-

Package src

-
-
-
-
-

Sub-modules

-
-
src.energyml
-
-
-
-
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/energyml-utils/example/attic/__init__.py b/energyml-utils/example/attic/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/energyml-utils/example/attic/arrays_test.py b/energyml-utils/example/attic/arrays_test.py new file mode 100644 index 0000000..ff06a67 --- /dev/null +++ b/energyml-utils/example/attic/arrays_test.py @@ -0,0 +1,498 @@ +import logging +import traceback +from typing import List, Optional +from energyml.utils.data.datasets_io import get_handler_registry +import numpy as np +from energyml.utils.data.helper import _ARRAY_NAMES_, read_array +from energyml.utils.data.mesh import ( + AbstractMesh, + SurfaceMesh, + PolylineSetMesh, + read_column_based_table, + read_mesh_object, + read_property_interpreted_with_cbt, + read_property, + read_time_series, +) +from energyml.utils.storage_interface import EnergymlStorageInterface +from energyml.utils.epc import Epc +from energyml.utils.epc_stream import EpcStreamReader, RelsUpdateMode +from energyml.utils.introspection import ( + get_obj_title, + search_attribute_matching_name, + get_object_attribute, + search_attribute_matching_name_with_path, +) +from energyml.resqml.v2_2.resqmlv2 import VerticalCRS +from energyml.resqml.v2_0_1.resqmlv2 import DiscreteProperty as DiscreteProperty201, ContinuousProperty as ContinuousProperty201 +from energyml.eml.v2_3.commonv2 import TimeSeries, ColumnBasedTable +from energyml.eml.v2_1.commonv2 import TimeSeries as TimeSeries21 + +from energyml.utils.serialization import read_energyml_xml_str, serialize_json + + +xml_Point3DLatticeArray = """ + + + 0.0 + 0.0 + 0.0 + + + + 0.0 + 1.0 + 0.0 + + + 1.0 + 99 + + + + + 1.0 + 0.0 + 0.0 + + + 1.0 + 49 + + + +""" + + +grid_2D = """ + + + 100x10 grid 2d for continuous color map + phili + 2026-02-13T16:55:42Z + F2I-CONSULTING:FESAPI Example:2.14.1.0 + + + 34b69c81-6cfa-4531-be5b-f6bd9b74802f + resqml22.HorizonInterpretation + Horizon interpretation for continuous color map + 
+ map + 50 + 100 + + + 5c0703c5-3806-424e-86cf-8f59c8bb39fa + eml23.LocalEngineeringCompoundCrs + Default local CRS + + + + 0.0 + 0.0 + 0.0 + + + + 0.0 + 1.0 + 0.0 + + + 1.0 + 99 + + + + + 1.0 + 0.0 + 0.0 + + + 1.0 + 49 + + + + + +""" + +polyline_rep = """ + + + Horizon1 Interp1 SinglePolylineRep + phili + 2026-02-13T16:55:39Z + F2I-CONSULTING:FESAPI Example:2.14.1.0 + + + ac12dc12-4951-459b-b585-90f48aa88a5a + resqml22.HorizonInterpretation + Horizon1 Interp1 + + false + + + 5c0703c5-3806-424e-86cf-8f59c8bb39fa + eml23.LocalEngineeringCompoundCrs + Default local CRS + + + + + 12 + /resqml22/47f86668-27c4-4b28-a19e-bd0355321ecc/points_patch0 + 0 + testingPackageCpp22.h5 + application/x-hdf5 + + + + + + 5a371b9e-7202-42de-83a0-1b996d20586b + resqml22.PolylineRepresentation + Seismic line Rep + + + arrayOfFloat32LE + 1 + + + 4 + /resqml22/47f86668-27c4-4b28-a19e-bd0355321ecc/lineAbscissa_patch0 + 0 + testingPackageCpp22.h5 + application/x-hdf5 + + + + + + +""" + + +def read_grid() -> List[AbstractMesh]: + point3d_lattice_array = read_energyml_xml_str(xml_Point3DLatticeArray) + # print(point3d_lattice_array) + # point3d_lattice_array.value + if "DerivedElement" in str(type(point3d_lattice_array)): + point3d_lattice_array = point3d_lattice_array.value + print(serialize_json(point3d_lattice_array, check_obj_prefixed_classes=False)) + + print(np.array(read_array(point3d_lattice_array, None))) + + grid_2d = read_energyml_xml_str(grid_2D) + if "DerivedElement" in str(type(grid_2d)): + grid_2d = grid_2d.value + + meshes = read_mesh_object(grid_2d) + return meshes + + +def read_polyline() -> List[AbstractMesh]: + # polyline_representation = read_energyml_xml_str(polyline_rep) + # if "DerivedElement" in str(type(polyline_representation)): + # polyline_representation = polyline_representation.value + + # meshes = read_mesh_object(polyline_representation) + # return meshes + + epc = Epc.read_file("rc/epc/testingPackageCpp22.epc", read_rels_from_files=False, 
recompute_rels=False) + + polyline0 = epc.get_object_by_uuid("a54b8399-d3ba-4d4b-b215-8d4f8f537e66")[0] + # polyline0 = epc.get_object_by_uuid("65c59595-bf48-451e-94aa-120ebdf28d8b")[0] + # polyline0 = epc.get_object_by_uuid("47f86668-27c4-4b28-a19e-bd0355321ecc")[0] + print(polyline0) + print(epc.get_h5_file_paths(polyline0)) + + meshes = read_mesh_object(energyml_object=polyline0, workspace=epc) + + return meshes + + +def read_wellbore_frame_repr( + epc_path: str = "rc/epc/testingPackageCpp22.epc", + well_uuid: str = "d873e243-d893-41ab-9a3e-d20b851c099f", +) -> List[AbstractMesh]: + epc = Epc.read_file(f"{epc_path}", read_rels_from_files=False, recompute_rels=False) + + frame_repr = epc.get_object_by_uuid(well_uuid)[0] + # print(frame_repr) + # print(epc.get_h5_file_paths(frame_repr)) + + meshes = read_mesh_object(energyml_object=frame_repr, workspace=epc) + + # Previous result : + # points: + # [[ 0. 0. 0.] + # [ 0. 0. 250.] + # [ 0. 0. 500.] + # [ 0. 0. 750.] + # [ 0. 0. 1000.]] + # line indices: + # [[0 1] + # [1 2] + # [2 3] + # [3 4]] + + return meshes + + +def read_representation_set_representation() -> List[AbstractMesh]: + epc = Epc.read_file("rc/epc/testingPackageCpp22.epc", read_rels_from_files=False, recompute_rels=False) + + rep_set_rep = epc.get_object_by_uuid("6b992199-5b47-4624-a62c-b70857133cda")[0] + # print(rep_set_rep) + print(epc.get_h5_file_paths(rep_set_rep)) + + return read_mesh_object(energyml_object=rep_set_rep, workspace=epc) + + +def read_props_and_cbt( + epc_path: List[str] = [ + "rc/epc/testingPackageCpp22.epc", + "D:/Geosiris/Clients/BRGM/git/csv-to-energyml/rc/output/full-local/attic/result-out-EpcStream-egis-full.epc", + ], + p_or_cbt_uuids: List = [ + "1c5a3e99-e997-4bd7-a94d-c45d7b7405ce", + "be17c053-9189-4bc0-9db1-75aa51a026cd", + "da73937c-2c60-4e10-8917-5154fde4ded5", + "6561b499-82ed-4233-8a83-ea5d5aaf56a9", + "0d6aba60-b37e-498c-aedc-334561eb0749", + "d64d0ed0-72fa-4495-8e3a-a01175194e25", + 
"5abecfe6-b951-4802-9002-e597169a9923", + "49207072-563b-404a-9707-9a9b70168d33", + ], +) -> None: + + epcs = [] + for path in epc_path: + epc = EpcStreamReader( + epc_file_path=path, + rels_update_mode=RelsUpdateMode.MANUAL, + ) + # epc = Epc.read_file(f"{path}", read_rels_from_files=False, recompute_rels=False) + epcs.append(epc) + + for uuid in p_or_cbt_uuids: + read = False + prop_or_cbt = None + for epc in epcs: + try: + prop_or_cbt_lst = epc.get_object_by_uuid(uuid) + if not prop_or_cbt_lst: + continue + prop_or_cbt = prop_or_cbt_lst[0] + array = None + reshaped_array = None + if "column" in str(type(prop_or_cbt)).lower(): + array = read_column_based_table(prop_or_cbt, workspace=epc) + elif "time" in str(type(prop_or_cbt)).lower(): + array = read_time_series(prop_or_cbt, workspace=epc) + else: + array = read_property( + prop_or_cbt, + workspace=epc, + ) + reshaped_array = read_property_interpreted_with_cbt( + prop_or_cbt, + workspace=epc, + _cache_property_arrays=array, + _return_none_if_no_category_lookup=True, + ) + print("=" * 40) + # print("TS: ", search_attribute_matching_name(prop_or_cbt, "\\w*.time_series")) + # print(f"\t {get_object_attribute(prop_or_cbt, 'time_or_interval_series.time_series')}") + print(f"{type(prop_or_cbt)} : {get_obj_title(prop_or_cbt)} - uuid: {uuid}") + print(array) + + if reshaped_array is not None: + print(" # => interpreted array:") + print(reshaped_array) + + print("\n") + read = True + break + # except NotSupportedError as e: + # print(f"Object with uuid {uuid} found but not supported: {e}") + except Exception as e: + traceback.print_exc() + print(f"Error reading object with uuid {uuid}: {e}") + pass + if not read: + print("[E]" + "=" * 40) + if prop_or_cbt is not None: + print(f"Object with uuid {get_obj_title(prop_or_cbt)} found but could not be read.") + else: + print(f"Object with uuid {uuid} not found in any EPC file.") + print("\n") + + +def read_trset( + epc_path: str = "rc/epc/testingPackageCpp22.epc", trset_uuid: 
str = "6e678338-3b53-49b6-8801-faee493e0c42" +) -> List[AbstractMesh]: + epc = Epc.read_file(f"{epc_path}", read_rels_from_files=False, recompute_rels=False) + + trset = epc.get_object_by_uuid(trset_uuid)[0] + # print(trset) + # print(epc.get_h5_file_paths(trset)) + + meshes = read_mesh_object(energyml_object=trset, workspace=epc) + + return meshes + + +def print_tuple_list(tuple_list: List[tuple]) -> None: + for t in tuple_list: + print(t) + + +def read_pointset( + epc_path: str = "rc/epc/testingPackageCpp22.epc", pointset_uuid: str = "fbc5466c-94cd-46ab-8b48-2ae2162b372f" +) -> List[AbstractMesh]: + # epc = Epc.read_file(f"{epc_path}", read_rels_from_files=False, recompute_rels=False) + epc = EpcStreamReader( + epc_file_path=epc_path, + rels_update_mode=RelsUpdateMode.MANUAL, + ) + + pointset = epc.get_object_by_uuid(pointset_uuid)[0] + # print(pointset) + # print(epc.get_h5_file_paths(pointset)) + # meshes = [] + meshes = read_mesh_object(energyml_object=pointset, workspace=epc) + + print(epc.get_obj_rels(pointset)) + + # logging.debug("=" * 40) + # print_tuple_list(search_attribute_matching_name_with_path(pointset, r"NodePatch.[\d]+.Geometry.Points")) + # logging.debug("=" * 40) + # print_tuple_list( + # search_attribute_matching_name_with_path(pointset, r"NodePatchGeometry.[\d]+.Points") + # ) # resqml 2.0.1 + # logging.debug("=" * 40) + + return meshes + + +def read_wellbore_frame_repr_demo_jfr_02_26( + epc_path: str = r"rc/epc/out-galaxy-12-pts.epc", + well_uuid: str = "cfad9cb6-99fe-4172-b560-d2feca75dd9f", +) -> List[AbstractMesh]: + # epc = Epc.read_file(f"{epc_path}", read_rels_from_files=False, recompute_rels=False) + epc = EpcStreamReader(f"{epc_path}", rels_update_mode=RelsUpdateMode.MANUAL) + + frame_repr = epc.get_object_by_uuid(well_uuid)[0] + # print(frame_repr) + # print(epc.get_h5_file_paths(frame_repr)) + + print(epc.get_h5_file_paths()) + + print(epc.get_h5_file_paths(frame_repr)) + + print("Object type: ", type(frame_repr)) + + meshes = 
read_mesh_object(energyml_object=frame_repr, workspace=epc) + + # Previous result : + # points: + # [[ 0. 0. 0.] + # [ 0. 0. 250.] + # [ 0. 0. 500.] + # [ 0. 0. 750.] + # [ 0. 0. 1000.]] + # line indices: + # [[0 1] + # [1 2] + # [2 3] + # [3 4]] + + return meshes + + +def test_read_write_array(h5_path): + + handler_registry = get_handler_registry() + + h5_handler = handler_registry.get_handler_for_file(h5_path) + if h5_handler is None: + print(f"No handler found for file {h5_path}") + return + h5_handler.write_array( + array=np.array([[1, 2, 3], [4, 5, 6]]), + target=h5_path, + path_in_external_file="/test_array", + ) + + h5_handler.file_cache.close_all() + + print( + h5_handler.read_array( + source=h5_path, + path_in_external_file="/test_array", + ) + ) + + success = h5_handler.write_array( + array=np.array([[7, 8, 9], [10, 11, 12]]), + target=h5_path, + path_in_external_file="/test_array2", + ) + print(f"Write success: {success}") + + cached = h5_handler.file_cache.get_or_open(h5_path, h5_handler, "a") + # print if file is still opened : + print(f"File still opened after write: {cached} is open: {hasattr(cached, 'id') and cached.id.valid}") + + success = h5_handler.write_array( + array=np.array([[13, 14, 15], [16, 17, 18]]), + target=h5_path, + path_in_external_file="/test_array3", + ) + print(f"Write success: {success}") + + print( + h5_handler.read_array( + source=h5_path, + path_in_external_file="/test_array2", + ) + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + meshes = [] + # meshes = read_grid() + # meshes = read_polyline() + # meshes = read_wellbore_frame_repr() + # meshes = read_representation_set_representation() + # meshes = read_trset() + # meshes = read_pointset() + meshes = read_wellbore_frame_repr_demo_jfr_02_26() + + print(f"Number of meshes read: {len(meshes)}") + + if meshes: + for m in meshes: + print("=" * 40) + print(f"Mesh identifier: {m.identifier}") + print("points:") + print(np.array(m.point_list)) + + if 
isinstance(m, SurfaceMesh): + print("face indices:") + print(np.array(m.faces_indices)) + elif isinstance(m, PolylineSetMesh): + print("line indices:") + try: + print(np.array(m.line_indices)) + except Exception as e: + print(m.line_indices) + raise e + + # read_props_and_cbt() + # test_read_write_array("test_array_rw.h5") diff --git a/energyml-utils/example/attic/arrays_test_fast.py b/energyml-utils/example/attic/arrays_test_fast.py new file mode 100644 index 0000000..3d0c58d --- /dev/null +++ b/energyml-utils/example/attic/arrays_test_fast.py @@ -0,0 +1,437 @@ +""" +arrays_test_fast.py +=================== +Companion to arrays_test.py — but using the mesh_numpy module for zero-copy, +numpy-native geometry reading. + +Every function mirrors its counterpart in arrays_test.py and returns +``List[NumpyMesh]``. The ``__main__`` block at the bottom shows how to +toggle between the different readers and optionally render with PyVista. + +Key differences vs. arrays_test.py: +* No list-of-lists — everything is already an ``np.ndarray``. +* VTK flat format for faces / lines — passable directly to PyVista. +* ``use_crs_displacement=True`` applies the CRS offset/scale in-place (no + extra allocation). +* Optional ``numpy_mesh_to_pyvista()`` helper at the end of each function. 
def print_mesh(mesh: NumpyMesh, *, max_rows: int = 8) -> None:
    """Write a compact, human-readable description of *mesh* to stdout.

    Always prints the concrete type, the identifier and the shape/dtype of
    the point array followed by at most ``max_rows`` points.  Depending on
    the mesh class it then prints the head of the faces array, the head of
    the lines array, or the lengths/dtypes of the cells and cell-type arrays.

    Args:
        mesh: The mesh to describe.
        max_rows: Number of point rows shown; the faces and lines previews
            are capped at ``max_rows * 4`` and ``max_rows * 3`` entries.
    """
    points = mesh.points

    print("=" * 50)
    print(f"Type : {type(mesh).__name__}")
    print(f"Identifier : {mesh.identifier!r}")
    print(f"Points : shape={points.shape} dtype={points.dtype}")

    # Only the leading rows are shown so the output stays readable.
    print(points[:max_rows])
    hidden_rows = len(points) - max_rows
    if hidden_rows > 0:
        print(f" ... ({hidden_rows} more rows)")

    if isinstance(mesh, NumpySurfaceMesh):
        faces = mesh.faces
        print(f"Faces (VTK flat) : len={len(faces)} dtype={faces.dtype}")
        # Slicing already clamps to the array length.
        print(faces[: max_rows * 4])
    elif isinstance(mesh, NumpyPolylineMesh):
        lines = mesh.lines
        print(f"Lines (VTK flat) : len={len(lines)} dtype={lines.dtype}")
        print(lines[: max_rows * 3])
    elif isinstance(mesh, NumpyVolumeMesh):
        print(f"Cells (VTK flat) : len={len(mesh.cells)} dtype={mesh.cells.dtype}")
        print(f"Cell types : len={len(mesh.cell_types)} dtype={mesh.cell_types.dtype}")

    print()
def read_numpy_pointset(
    epc_path: str = "rc/epc/testingPackageCpp22.epc",
    pointset_uuid: str = "fbc5466c-94cd-46ab-8b48-2ae2162b372f",
    use_crs_displacement: bool = True,
) -> List[NumpyMesh]:
    """Read a PointSetRepresentation by UUID.

    Uses EpcStreamReader to exercise the streaming path (same as arrays_test.py).
    The reader is opened as a context manager so it is released when the
    function returns or raises, instead of being left open.

    Args:
        epc_path: Path of the EPC file to open.
        pointset_uuid: UUID of the representation to read; the first object
            returned for that UUID is used.
        use_crs_displacement: Forwarded to ``read_numpy_mesh_object``.

    Returns:
        The meshes produced by ``read_numpy_mesh_object``.

    Raises:
        IndexError: If no object is found for ``pointset_uuid``.
    """
    with EpcStreamReader(
        epc_file_path=epc_path,
        rels_update_mode=RelsUpdateMode.MANUAL,
    ) as epc:
        pointset = epc.get_object_by_uuid(pointset_uuid)[0]
        print(f"Object: {type(pointset).__name__} uuid={pointset_uuid}")

        # NOTE(review): assumes the returned meshes own their numpy data and
        # do not need the reader to stay open after this call — confirm.
        meshes = read_numpy_mesh_object(
            energyml_object=pointset,
            workspace=epc,
            use_crs_displacement=use_crs_displacement,
        )
    return meshes
def demo_zero_copy(h5_path: str = "rc/epc/testingPackageCpp22.h5") -> None:
    """Read one dataset through ``read_array`` and ``read_array_view`` and compare.

    Prints shape, dtype and ``id()`` of both results, whether they are the
    same Python object, and what ``np.shares_memory`` reports for the pair.
    Nothing is asserted: the function only reports what was observed.
    If no handler is registered for *h5_path* a notice is printed and the
    function returns.
    """
    h5_handler = get_handler_registry().get_handler_for_file(h5_path)
    if h5_handler is None:
        print(f"[demo_zero_copy] No handler found for {h5_path!r}")
        return

    # Dataset expected to be present in the standard test package.
    request = {
        "source": h5_path,
        "path_in_external_file": "/resqml22/6e678338-3b53-49b6-8801-faee493e0c42/points_patch0",
    }
    eager = h5_handler.read_array(**request)
    view = h5_handler.read_array_view(**request)

    print("-" * 50)
    print("demo_zero_copy")
    for tag, arr in (("Eager copy :", eager), ("View/array :", view)):
        print(f" {tag} shape={arr.shape} dtype={arr.dtype} id={id(arr)}")
    print(f" Same object : {eager is view}")
    # Whether memory is actually shared may vary; report it rather than assert.
    print(f" Shares memory: {np.shares_memory(eager, view)}")
    print()
def main() -> None:
    """Run every enabled reader, print each mesh it returns, and report a total.

    Readers are executed one after another; a failure in one reader is
    reported (message plus full traceback on stderr) and the remaining
    readers still run.  The optional demos at the bottom are left commented
    out and can be enabled by hand.
    """
    logging.basicConfig(level=logging.DEBUG)

    print("=" * 60)
    print("arrays_test_fast.py — NumpyMesh reader demo")
    print("=" * 60)

    # ------------------------------------------------------------------
    # Define which readers to run.
    # Each entry is (label, callable).
    # Comment / uncomment to control what gets exercised.
    # ------------------------------------------------------------------
    readers = [
        ("Grid2dRepresentation (embedded XML)", read_numpy_grid),
        ("PolylineRepresentation", read_numpy_polyline),
        ("TriangulatedSetRepresentation", read_numpy_trset),
        ("PointSetRepresentation", read_numpy_pointset),
        ("WellboreFrameRepresentation", read_numpy_wellbore_frame_repr),
        ("RepresentationSetRepresentation", read_numpy_representation_set),
        # ("WellboreFrame (galaxy EPC)", read_numpy_wellbore_frame_repr_demo_jfr_02_26),
    ]

    all_meshes: List[NumpyMesh] = []

    for label, reader in readers:
        print(f"\n{'─' * 60}")
        print(f"Running: {label}")
        print(f"{'─' * 60}")
        try:
            result = reader()
            print(f" → {len(result)} mesh(es) returned")
            all_meshes.extend(result)
            for m in result:
                print_mesh(m)
        except Exception as exc:
            # Best-effort demo: keep going, but do not lose the stack trace —
            # the one-line message alone is rarely enough to diagnose a reader.
            print(f" [ERROR] {type(exc).__name__}: {exc}")
            traceback.print_exc()

    # ------------------------------------------------------------------
    # Zero-copy comparison demo (reads directly from the HDF5 file):
    # ------------------------------------------------------------------
    # demo_zero_copy()

    # ------------------------------------------------------------------
    # Round-trip write + read-back test:
    # ------------------------------------------------------------------
    # test_read_write_array_view()

    print(f"\n{'=' * 60}")
    print(f"Total meshes collected: {len(all_meshes)}")
    print(f"{'=' * 60}\n")

    # ------------------------------------------------------------------
    # Optional PyVista render (only if pyvista is installed):
    # ------------------------------------------------------------------
    # render_meshes_pyvista(all_meshes)
def time_comparison(
    filepath: str,
    output_folder: Optional[str] = None,
    skip_sequential_stream: bool = True,
    skip_parallel_stream: bool = True,
):
    """Compare performance of different EPC reexport methods.

    Each enabled strategy is run once on *filepath* and timed with
    ``time.perf_counter``; a ranking table and a short summary are printed.

    The two parallel-read runs are selected through the ``EPC_FAST_V2``
    environment variable (``"0"`` then ``"1"``).  The variable is restored
    to its previous state before the function returns, so the benchmark
    does not leak configuration into the rest of the process.

    Args:
        filepath: EPC file to re-export.
        output_folder: Forwarded unchanged to every ``reexport_*`` function.
        skip_sequential_stream: When true, the sequential streaming run is skipped.
        skip_parallel_stream: When true, the parallel streaming run is skipped.
    """
    print(f"\n{'=' * 70}")
    # basename copes with both "/" and the platform separator.
    print(f"Performance Comparison: {os.path.basename(filepath)}")
    print(f"{'=' * 70}\n")

    results = []

    def _run(label, announcement, func):
        # Time one strategy and record (label, seconds).
        print(announcement)
        start = time.perf_counter()
        func(filepath, output_folder)
        elapsed = time.perf_counter() - start
        results.append((label, elapsed))
        print(f" ✓ Completed in {elapsed:.3f}s\n")

    # Test 1: In-Memory
    _run("In-Memory (Epc)", "⏳ Testing In-Memory EPC processing...", reexport_in_memory)

    # Tests 1b / 1c: In-Memory with Parallel Read, implementation chosen by env var.
    previous_flag = os.environ.get("EPC_FAST_V2")
    try:
        os.environ["EPC_FAST_V2"] = "0"
        _run(
            "In-Memory (Epc) Parallel Read",
            "⏳ Testing In-Memory EPC processing with Parallel Read...",
            reexport_in_memory_par_read,
        )

        os.environ["EPC_FAST_V2"] = "1"
        _run(
            "In-Memory (Epc) Parallel Read v2",
            "⏳ Testing In-Memory EPC processing with Parallel Read v2...",
            reexport_in_memory_par_read,
        )
    finally:
        if previous_flag is None:
            os.environ.pop("EPC_FAST_V2", None)
        else:
            os.environ["EPC_FAST_V2"] = previous_flag

    # Test 2: Streaming Sequential
    if not skip_sequential_stream:
        _run("Stream Sequential", "⏳ Testing Streaming Sequential processing...", reexport_stream_seq)

    # Test 3: Streaming Parallel
    if not skip_parallel_stream:
        _run("Stream Parallel", "⏳ Testing Streaming Parallel processing...", reexport_stream_parallel)

    # Rank by elapsed time, fastest first.
    results_sorted = sorted(results, key=lambda x: x[1])
    fastest_method, fastest_time = results_sorted[0]
    slowest_method, slowest_time = results_sorted[-1]

    # Size the first column to the longest label so rows stay aligned.
    name_width = max(25, *(len(name) for name, _ in results_sorted))
    table_width = max(70, name_width + 43)

    print(f"\n{'=' * table_width}")
    print(f"{'PERFORMANCE RESULTS':^{table_width}}")
    print(f"{'=' * table_width}")
    print(f"{'Method':<{name_width}} {'Time (s)':>12} {'Speedup':>12} {'Status':>15}")
    print(f"{'-' * table_width}")

    for method, elapsed in results_sorted:
        # Relative speed versus the best run (1.00x for the best itself).
        speedup = fastest_time / elapsed if elapsed > 0 else 1.0
        if speedup >= 0.95:  # within 5% of the best run
            status = "🏆 FASTEST"
        elif speedup >= 0.8:
            status = "✓ Good"
        else:
            status = "○ Slower"

        print(f"{method:<{name_width}} {elapsed:>12.3f} {speedup:>12.2f}x {status:>15}")

    print(f"{'=' * table_width}")

    # Summary
    speedup_factor = slowest_time / fastest_time if fastest_time > 0 else float("inf")

    print("\n📊 Summary:")
    print(f" • Fastest: {fastest_method} ({fastest_time:.3f}s)")
    print(f" • Slowest: {slowest_method} ({slowest_time:.3f}s)")
    print(f" • Overall speedup: {speedup_factor:.2f}x faster\n")
+# SPDX-License-Identifier: Apache-2.0 +""" +Integration examples for :mod:`energyml.utils.data.crs`. + +Reads real EPC files from ``rc/epc/`` and exercises :func:`extract_crs_info` +against every CRS object they contain. Also shows how to walk from a +``Grid2DRepresentation`` to its CRS and call ``extract_crs_info`` on the +resolved object. + +Run from the workspace root:: + + poetry run python example/attic/crs_info_from_epc.py + +Expected output: all test cases show ``[PASS]``. +""" +from __future__ import annotations + +import logging +import sys +from pathlib import Path +from typing import Any, List, Optional + + +# Run $env:PYTHONPATH="src" if it fails to be executed from the project root. + +# ── make the local ``src/`` take precedence when running directly ────────── +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) + +from energyml.utils.epc import Epc +from energyml.utils.introspection import get_obj_uuid, get_object_attribute_rgx +from energyml.utils.data.crs import CrsInfo, extract_crs_info + +# suppress noise from EPC loading +logging.basicConfig(level=logging.ERROR) + +# ── EPC file paths (relative to workspace root) ──────────────────────────── +_ROOT = Path(__file__).parent.parent.parent +EPC20_PATH = str(_ROOT / "rc" / "epc" / "testingPackageCpp.epc") +EPC22_PATH = str(_ROOT / "rc" / "epc" / "testingPackageCpp22.epc") + +# ── Simple test harness ──────────────────────────────────────────────────── +_passed = 0 +_failed = 0 + + +def check(label: str, expected: Any, actual: Any, *, approx: bool = False) -> None: + """Print PASS / FAIL and update counters.""" + global _passed, _failed + if approx: + import math + ok = (expected is None and actual is None) or ( + isinstance(expected, (int, float)) + and isinstance(actual, (int, float)) + and math.isclose(float(expected), float(actual), rel_tol=1e-6) + ) + else: + ok = expected == actual + if ok: + _passed += 1 + print(f" [PASS] {label}") + else: + _failed += 1 + print(f" [FAIL] 
def resolve_crs_from_triangulated_set(triangulated_obj: Any, epc: Epc) -> List[Optional[Any]]:
    """
    Walk from a TriangulatedSetRepresentation to the CRS document(s) of its patches.

    The ``local_crs`` references found under
    ``triangle_patch.<index>.geometry`` are collected and each one is looked
    up in *epc* by UUID.

    Args:
        triangulated_obj: The representation whose patches are inspected.
        epc: Workspace used to resolve each referenced UUID.

    Returns:
        One entry per reference that carries a UUID: the first object found
        for that UUID, or ``None`` when the workspace has no match.
        References without a UUID are skipped.  An empty list is returned
        when no reference is found at all.  The result is always a list,
        including when the lookup helper yields a single reference.
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    # NOTE(review): the dots are left unescaped on purpose — they appear to
    # act as path delimiters for get_object_attribute_rgx; confirm.
    dors = get_object_attribute_rgx(triangulated_obj, r"triangle_patch.\d+.geometry.local_crs")
    # print(f" Found DOR for TriangulatedSetRepresentation patch CRS: {dors}")
    if dors is None:
        return []

    # A single match may come back bare; normalise so it is not dropped.
    if not isinstance(dors, list):
        dors = [dors]

    candidates: List[Optional[Any]] = []
    for d in dors:
        uuid = get_obj_uuid(d)
        if uuid:
            obj_candidates = epc.get_object_by_uuid(uuid)
            print(f" Found DOR for TriangulatedSetRepresentation patch CRS: {d} → candidates: {len(obj_candidates)}")
            candidates.append(obj_candidates[0] if obj_candidates else None)
    return candidates
+check("z_increasing_downward", True, info.z_increasing_downward) +check("areal_rotation_value", 0.0, info.areal_rotation_value, approx=True) +check("projected_epsg_code", None, info.projected_epsg_code) +check("vertical_epsg_code", None, info.vertical_epsg_code) +check("azimuth_reference", None, info.azimuth_reference) + +# ── LocalDepth3DCrs ──────────────────────────────────────────────────────── + +section("v2.0.1 · LocalDepth3DCrs (uuid 0ae56ef3…)") + +local_depth_crs = epc20.get_object_by_uuid("0ae56ef3-fc79-405b-8deb-6942e0f2e77c")[0] +info = extract_crs_info(local_depth_crs) + +check("source_type", "LocalDepth3DCrs", info.source_type) +check("projected_epsg_code", 23031, info.projected_epsg_code) +check("projected_uom", "M", info.projected_uom) +check("vertical_uom", "M", info.vertical_uom) +# ZIncreasingDownward=true in the raw file; the linked VerticalUnknownCrs +# carries no direction field, so the sentinel correctly preserves the value. +check("z_increasing_downward", True, info.z_increasing_downward) +check("x_offset", 0.0, info.x_offset, approx=True) +check("y_offset", 0.0, info.y_offset, approx=True) +check("z_offset", 0.0, info.z_offset, approx=True) + +# ── LocalEngineeringCompoundCrs (inside v2.0.1 EPC) ─────────────────────── +# This file mixes v2.0.1 and v2.3/v2.2 objects; the compound CRS is v2.3. 
+ +section("v2.0.1 EPC · LocalEngineeringCompoundCrs (uuid 95330cec…)") + +compound_crs_20 = epc20.get_object_by_uuid("95330cec-164c-4165-9fb9-c56477ae7f8a")[0] + +# Without workspace: only inline z-axis info (no DOR resolution) +info_no_ws = extract_crs_info(compound_crs_20, workspace=None) +check("z_increasing_downward (inline VerticalAxis)", True, info_no_ws.z_increasing_downward) + +# With workspace: DORs resolved → full CRS info +info = extract_crs_info(compound_crs_20, workspace=epc20) +check("projected_epsg_code (resolved via DOR)", 23031, info.projected_epsg_code) +check("projected_uom", "M", info.projected_uom) +check("vertical_uom", "M", info.vertical_uom) +check("z_increasing_downward", True, info.z_increasing_downward) +check("azimuth_reference", "grid north", info.azimuth_reference) + +# ── LocalEngineering2DCrs (inside v2.0.1 EPC) ───────────────────────────── + +section("v2.0.1 EPC · LocalEngineering2DCrs (uuid 811f8e68…)") + +eng2d_crs_20 = epc20.get_object_by_uuid("811f8e68-c0e4-5f90-b9cf-03f7e3d53ca4")[0] +info = extract_crs_info(eng2d_crs_20) + +check("projected_epsg_code", 23031, info.projected_epsg_code) +check("projected_uom", "M", info.projected_uom) +check("vertical_uom", None, info.vertical_uom) # (none — 2D CRS has no Z) +check("z_increasing_downward", False, info.z_increasing_downward) +check("azimuth_reference", "grid north", info.azimuth_reference) + +# ── VerticalCrs (inside v2.0.1 EPC) ─────────────────────────────────────── + +section("v2.0.1 EPC · VerticalCrs (uuid 1f6cf904…)") + +vert_crs_20 = epc20.get_object_by_uuid("1f6cf904-336c-5202-a13d-7c9b142cd406")[0] +info = extract_crs_info(vert_crs_20) + +check("vertical_uom", "M", info.vertical_uom) +check("z_increasing_downward", True, info.z_increasing_downward) +check("projected_epsg_code", None, info.projected_epsg_code) # (vertical has none) +check("projected_uom", None, info.projected_uom) # (vertical has none) + +# ── Grid2DRepresentation → CRS (v2.0.1 approach) 
───────────────────────── + +section("v2.0.1 · Grid2DRepresentation → CRS via geometry.local_crs DOR") + +# Grid 030a82f6 → LocalTime3DCrs (dbd637d5) +grid_time = epc20.get_object_by_uuid("030a82f6-10a7-4ecf-af03-54749e098624")[0] +resolved_crs = _resolve_crs_from_grid(grid_time, epc20) +check("resolved CRS type", "LocalTime3DCrs", type(resolved_crs).__name__ if resolved_crs else None) +if resolved_crs: + info = extract_crs_info(resolved_crs, workspace=epc20) + check(" x_offset", 1.0, info.x_offset, approx=True) + check(" y_offset", 0.1, info.y_offset, approx=True) + check(" z_offset", 15.0, info.z_offset, approx=True) + check(" projected_uom", "M", info.projected_uom) + +# Grid aa5b90f1 → LocalDepth3DCrs (0ae56ef3) +grid_depth = epc20.get_object_by_uuid("aa5b90f1-2eab-4fa6-8720-69dd4fd51a4d")[0] +resolved_crs = _resolve_crs_from_grid(grid_depth, epc20) +check("resolved CRS type", "LocalDepth3DCrs", type(resolved_crs).__name__ if resolved_crs else None) +if resolved_crs: + info = extract_crs_info(resolved_crs, workspace=epc20) + check(" projected_epsg_code", 23031, info.projected_epsg_code) + check(" projected_uom", "M", info.projected_uom) + # Same LocalDepth3DCrs — ZIncreasingDownward=true in the raw file. 
+ check(" z_increasing_downward", True, info.z_increasing_downward) + +# Grid 4e56b0e4 → also LocalDepth3DCrs (same uuid) +grid_depth2 = epc20.get_object_by_uuid("4e56b0e4-2cd1-4efa-97dd-95f72bcf9f80")[0] +resolved_crs = _resolve_crs_from_grid(grid_depth2, epc20) +check("Grid 4e56b0e4 resolved CRS uuid", "0ae56ef3-fc79-405b-8deb-6942e0f2e77c", + getattr(resolved_crs, "uuid", None)) + + + +# =========================================================================== +# RESQML v2.2 / EML v2.3 — testingPackageCpp22.epc +# =========================================================================== + +section("Loading testingPackageCpp22.epc (RESQML v2.2 / EML v2.3)") +epc22 = Epc.read_file(EPC22_PATH) +print(f" Loaded {len(epc22.energyml_objects)} objects.") + +# ── LocalEngineering2DCrs (no EPSG, has offsets) ───────────────────────── + +section("v2.2 · LocalEngineering2DCrs (uuid 997796f5…) — offsets, no EPSG") + +eng2d_no_epsg = epc22.get_object_by_uuid("997796f5-da9d-5175-9fb7-e592957b73fb")[0] +info = extract_crs_info(eng2d_no_epsg) + +check("x_offset", 1.0, info.x_offset, approx=True) +check("y_offset", 0.1, info.y_offset, approx=True) +check("projected_uom", "M", info.projected_uom) +check("projected_epsg_code", None, info.projected_epsg_code) +check("azimuth_reference", "grid north", info.azimuth_reference) +check("z_increasing_downward", False, info.z_increasing_downward) + +# ── LocalEngineering2DCrs (with EPSG 23031) ────────────────────────────── + +section("v2.2 · LocalEngineering2DCrs (uuid 671ffdeb…) — EPSG 23031") + +eng2d_epsg = epc22.get_object_by_uuid("671ffdeb-f25c-513a-a4a2-1774d3ac20c6")[0] +info = extract_crs_info(eng2d_epsg) + +check("projected_epsg_code", 23031, info.projected_epsg_code) +check("projected_uom", "M", info.projected_uom) +check("azimuth_reference", "grid north", info.azimuth_reference) +check("z_increasing_downward", False, info.z_increasing_downward) + +# ── LocalEngineeringCompoundCrs (no EPSG, has offsets + z) ────────────── + 
+section("v2.2 · LocalEngineeringCompoundCrs (uuid f0e9f421…) — offsets + z offset") + +compound_no_epsg = epc22.get_object_by_uuid("f0e9f421-b902-4392-87d8-6495c02f2fbe")[0] + +# Without workspace: only inline VerticalAxis info available +info_no_ws = extract_crs_info(compound_no_epsg, workspace=None) +check("z_offset (inline origin_vertical_coordinate)", 15.0, info_no_ws.z_offset, approx=True) +check("z_increasing_downward (inline VerticalAxis)", True, info_no_ws.z_increasing_downward) +# This particular compound CRS mixes a time-domain vertical axis (uom='S') +# with a depth-domain resolved VerticalCrs (uom='M') — inline returns 'S' +check("vertical_uom (inline VerticalAxis — time domain)", "S", info_no_ws.vertical_uom) +check("x_offset without workspace", 0.0, info_no_ws.x_offset, approx=True) + +# With workspace: DORs resolved → horizontal CRS merged in +info = extract_crs_info(compound_no_epsg, workspace=epc22) +check("x_offset (from resolved LocalEngineering2DCrs)", 1.0, info.x_offset, approx=True) +check("y_offset (from resolved LocalEngineering2DCrs)", 0.1, info.y_offset, approx=True) +check("z_offset (inline)", 15.0, info.z_offset, approx=True) +check("projected_uom (from 2D CRS)", "M", info.projected_uom) +check("projected_epsg_code (2D CRS has none)", None, info.projected_epsg_code) +check("vertical_uom", "M", info.vertical_uom) +check("z_increasing_downward", True, info.z_increasing_downward) +check("azimuth_reference", "grid north", info.azimuth_reference) + +# ── LocalEngineeringCompoundCrs (EPSG 23031) ───────────────────────────── + +section("v2.2 · LocalEngineeringCompoundCrs (uuid 6a18c177…) — EPSG 23031") + +compound_epsg = epc22.get_object_by_uuid("6a18c177-93be-41ac-9084-f84bbb31f46d")[0] +info = extract_crs_info(compound_epsg, workspace=epc22) + +check("projected_epsg_code (resolved)", 23031, info.projected_epsg_code) +check("projected_uom", "M", info.projected_uom) +check("vertical_uom", "M", info.vertical_uom) 
+check("z_increasing_downward", True, info.z_increasing_downward) +check("x_offset", 0.0, info.x_offset, approx=True) +check("y_offset", 0.0, info.y_offset, approx=True) +check("z_offset", 0.0, info.z_offset, approx=True) +check("azimuth_reference", "grid north", info.azimuth_reference) + +# ── VerticalCrs (uuid 65cd199f) ────────────────────────────────────────── + +section("v2.2 · VerticalCrs (uuid 65cd199f…)") + +vert_crs_22a = epc22.get_object_by_uuid("65cd199f-156b-5112-ad3e-b4f54a2aa77b")[0] +info = extract_crs_info(vert_crs_22a) + +check("vertical_uom", "M", info.vertical_uom) +check("z_increasing_downward", True, info.z_increasing_downward) +check("projected_epsg_code (none for vertical)", None, info.projected_epsg_code) + +# ── VerticalCrs (uuid 355174db) ────────────────────────────────────────── + +section("v2.2 · VerticalCrs (uuid 355174db…)") + +vert_crs_22b = epc22.get_object_by_uuid("355174db-6226-57ae-a5a6-92f33825fed4")[0] +info = extract_crs_info(vert_crs_22b) + +check("vertical_uom", "M", info.vertical_uom) +check("z_increasing_downward", True, info.z_increasing_downward) + +# ── Grid2D v2.2 — CRS note ──────────────────────────────────────────────── +section("v2.2 · Grid2DRepresentation — CRS resolution") +print(""" + In RESQML v2.2, Grid2DRepresentation DOES embed a LocalCrs DOR, but at + a shallower path than v2.0.1: + + v2.2 : Geometry.LocalCrs (PointGeometry sits directly on the object) + v2.0.1: Grid2dPatch.Geometry.LocalCrs (geometry is wrapped in a patch sub-object) + + Both paths are resolved by trying the shallower v2.2 path first with + ``get_object_attribute_rgx``, then falling back to the deeper v2.0.1 path. + No indirect lookup through framework associations is needed. 
+ + All LocalEngineeringCompoundCrs objects in this EPC: +""") + +for obj in epc22.energyml_objects: + if "localengineeringcompoundcrs" in type(obj).__name__.lower(): + info = extract_crs_info(obj, workspace=epc22) + print(f" CompoundCrs {obj.uuid}") + print(f" projected_epsg={info.projected_epsg_code} projected_uom={info.projected_uom}") + print(f" vertical_uom={info.vertical_uom} z_down={info.z_increasing_downward}") + print(f" offsets: x={info.x_offset} y={info.y_offset} z={info.z_offset}") + +# ── Grid2DRepresentation v2.2 → CRS via Geometry.LocalCrs ───────────────── + +section("v2.2 · Grid2DRepresentation (uuid 4e56b0e4) → CRS via Geometry.LocalCrs") + +grid22 = epc22.get_object_by_uuid("4e56b0e4-2cd1-4efa-97dd-95f72bcf9f80") +if grid22: + grid22 = grid22[0] + resolved_crs22 = _resolve_crs_from_grid(grid22, epc22) + check("resolved CRS type", "LocalEngineeringCompoundCrs", + type(resolved_crs22).__name__ if resolved_crs22 else None) + check("resolved CRS uuid", "6a18c177-93be-41ac-9084-f84bbb31f46d", + getattr(resolved_crs22, "uuid", None)) + if resolved_crs22: + info = extract_crs_info(resolved_crs22, workspace=epc22) + check(" projected_epsg_code", 23031, info.projected_epsg_code) + check(" projected_uom", "M", info.projected_uom) + check(" vertical_uom", "M", info.vertical_uom) + check(" z_increasing_downward", True, info.z_increasing_downward) + check(" x_offset", 0.0, info.x_offset, approx=True) + check(" y_offset", 0.0, info.y_offset, approx=True) + check(" z_offset", 0.0, info.z_offset, approx=True) +else: + print(" [SKIP] Grid 4e56b0e4 not found in testingPackageCpp22.epc") + + +# TriangulatedSetRepresentation 1a4112fa → LocalEngineeringCompoundCrs (6a18c177) +triangulated_set = epc22.get_object_by_uuid("1a4112fa-c4ef-4c8d-aed0-47d9273bebc5")[0] +resolved_crs_list = resolve_crs_from_triangulated_set(triangulated_set, epc22) +check("TriangulatedSetRepresentation resolved CRS uuid", 5, + len(resolved_crs_list)) + +for i, resolved_crs in 
enumerate(resolved_crs_list): + check(f"{i}) patch {i} resolved CRS type", "LocalEngineeringCompoundCrs", + type(resolved_crs).__name__ if resolved_crs else None) + if resolved_crs: + info = extract_crs_info(resolved_crs, workspace=epc22) + check(" projected_epsg_code (resolved)", 23031, info.projected_epsg_code) + check(" projected_uom", "M", info.projected_uom) + check(" vertical_uom", "M", info.vertical_uom) + check(" z_increasing_downward", True, info.z_increasing_downward) + check(" x_offset", 0.0, info.x_offset, approx=True) + check(" y_offset", 0.0, info.y_offset, approx=True) + check(" z_offset", 0.0, info.z_offset, approx=True) + check(" azimuth_reference", "grid north", info.azimuth_reference) + +# =========================================================================== +# Convenience helpers (delegates in helper.py) +# =========================================================================== + +section("Legacy helper delegates still work correctly") + +from energyml.utils.data.helper import ( + is_z_reversed, + get_projected_epsg_code, + get_projected_uom, + get_vertical_epsg_code, + get_crs_offsets_and_angle, +) + +depth_crs = epc20.get_object_by_uuid("0ae56ef3-fc79-405b-8deb-6942e0f2e77c")[0] +# ZIncreasingDownward=true in the raw file → is_z_reversed returns True. 
+check("is_z_reversed(LocalDepth3DCrs)", True, is_z_reversed(depth_crs)) +check("get_projected_epsg_code", 23031, get_projected_epsg_code(depth_crs)) +check("get_projected_uom", "M", get_projected_uom(depth_crs)) + +time_crs = epc20.get_object_by_uuid("dbd637d5-4528-4145-908b-5f7136824f6d")[0] +x, y, z, (angle, uom) = get_crs_offsets_and_angle(time_crs) +check("get_crs_offsets_and_angle x", 1.0, x, approx=True) +check("get_crs_offsets_and_angle y", 0.1, y, approx=True) +check("get_crs_offsets_and_angle z", 15.0, z, approx=True) + +# =========================================================================== +# Summary +# =========================================================================== + +section("Summary") +total = _passed + _failed +print(f" {_passed}/{total} checks passed.") +if _failed: + print(f" {_failed} checks FAILED — see [FAIL] lines above.") + sys.exit(1) +else: + print(" All checks passed!") diff --git a/energyml-utils/example/attic/dump_crs_objects.py b/energyml-utils/example/attic/dump_crs_objects.py new file mode 100644 index 0000000..7d956a6 --- /dev/null +++ b/energyml-utils/example/attic/dump_crs_objects.py @@ -0,0 +1,95 @@ +""" +Dump the raw JSON for every CRS (and Grid2D) object referenced by +``crs_info_from_epc.py``, so you can cross-check the expected values +in the integration script against what is actually stored in the EPC files. 
+ +Run from the workspace root:: + + poetry run python example/attic/dump_crs_objects.py +""" +from __future__ import annotations + +import json +import logging +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) + +from energyml.utils.epc import Epc +from energyml.utils.serialization import serialize_json + +logging.basicConfig(level=logging.ERROR) + +_ROOT = Path(__file__).parent.parent.parent +EPC20_PATH = str(_ROOT / "rc" / "epc" / "testingPackageCpp.epc") +EPC22_PATH = str(_ROOT / "rc" / "epc" / "testingPackageCpp22.epc") + +# --------------------------------------------------------------------------- +# Objects to dump +# (epc_key, uuid, label) +# --------------------------------------------------------------------------- +OBJECTS_EPC20 = [ + ("dbd637d5-4528-4145-908b-5f7136824f6d", "LocalTime3DCrs"), + ("0ae56ef3-fc79-405b-8deb-6942e0f2e77c", "LocalDepth3DCrs"), + ("95330cec-164c-4165-9fb9-c56477ae7f8a", "LocalEngineeringCompoundCrs (v2.0.1 EPC)"), + ("811f8e68-c0e4-5f90-b9cf-03f7e3d53ca4", "LocalEngineering2DCrs (v2.0.1 EPC)"), + ("1f6cf904-336c-5202-a13d-7c9b142cd406", "VerticalCrs (v2.0.1 EPC)"), + ("030a82f6-10a7-4ecf-af03-54749e098624", "Grid2DRepresentation → LocalTime3DCrs"), + ("aa5b90f1-2eab-4fa6-8720-69dd4fd51a4d", "Grid2DRepresentation → LocalDepth3DCrs"), + ("4e56b0e4-2cd1-4efa-97dd-95f72bcf9f80", "Grid2DRepresentation (v2.0.1)"), +] + +OBJECTS_EPC22 = [ + ("997796f5-da9d-5175-9fb7-e592957b73fb", "LocalEngineering2DCrs (no EPSG)"), + ("671ffdeb-f25c-513a-a4a2-1774d3ac20c6", "LocalEngineering2DCrs (EPSG 23031)"), + ("f0e9f421-b902-4392-87d8-6495c02f2fbe", "LocalEngineeringCompoundCrs (no EPSG)"), + ("6a18c177-93be-41ac-9084-f84bbb31f46d", "LocalEngineeringCompoundCrs (EPSG 23031)"), + ("65cd199f-156b-5112-ad3e-b4f54a2aa77b", "VerticalCrs-A — Direction=down → z_down=True"), + ("355174db-6226-57ae-a5a6-92f33825fed4", "VerticalCrs-B — Direction=down → z_down=True"), + 
("4e56b0e4-2cd1-4efa-97dd-95f72bcf9f80", "Grid2DRepresentation (v2.2)"), + ("1a4112fa-c4ef-4c8d-aed0-47d9273bebc5", "TriangulatedSetRepresentation (v2.2)"), +] + +# --------------------------------------------------------------------------- + +def _sep(title: str) -> None: + print(f"\n{'═' * 70}") + print(f" {title}") + print(f"{'═' * 70}") + + +def _dump(epc: Epc, uuid: str, label: str) -> None: + print(f"\n── {label} [{uuid}]") + candidates = epc.get_object_by_uuid(uuid) + if not candidates: + print(" *** NOT FOUND ***") + return + obj = candidates[0] + print(f" type : {type(obj).__module__}.{type(obj).__name__}") + try: + raw = json.loads(serialize_json(obj)) + # Pretty-print, indented 4 spaces relative to the bullet + text = json.dumps(raw, indent=2, ensure_ascii=False) + for line in text.splitlines(): + print(f" {line}") + except Exception as exc: + print(f" *** serialization error: {exc} ***") + + +def main() -> None: + _sep(f"EPC 2.0.1 — {EPC20_PATH}") + epc20 = Epc.read_file(EPC20_PATH) + print(f" Loaded {len(epc20.energyml_objects)} objects.") + for uuid, label in OBJECTS_EPC20: + _dump(epc20, uuid, label) + + _sep(f"EPC 2.2 — {EPC22_PATH}") + epc22 = Epc.read_file(EPC22_PATH) + print(f" Loaded {len(epc22.energyml_objects)} objects.") + for uuid, label in OBJECTS_EPC22: + _dump(epc22, uuid, label) + + +if __name__ == "__main__": + main() diff --git a/energyml-utils/example/epc_rels_management_example.py b/energyml-utils/example/attic/epc_rels_management_example.py similarity index 100% rename from energyml-utils/example/epc_rels_management_example.py rename to energyml-utils/example/attic/epc_rels_management_example.py diff --git a/energyml-utils/example/epc_stream_keep_open_example.py b/energyml-utils/example/attic/epc_stream_keep_open_example.py similarity index 100% rename from energyml-utils/example/epc_stream_keep_open_example.py rename to energyml-utils/example/attic/epc_stream_keep_open_example.py diff --git a/energyml-utils/example/main.py 
b/energyml-utils/example/attic/main.py similarity index 99% rename from energyml-utils/example/main.py rename to energyml-utils/example/attic/main.py index 4313ed5..d379d40 100644 --- a/energyml-utils/example/main.py +++ b/energyml-utils/example/attic/main.py @@ -91,7 +91,7 @@ validate_epc, correct_dor, ) -from energyml.utils.xml import ( +from energyml.utils.xml_utils import ( find_schema_version_in_element, get_class_name_from_xml, get_root_namespace, diff --git a/energyml-utils/example/main201.py b/energyml-utils/example/attic/main201.py similarity index 100% rename from energyml-utils/example/main201.py rename to energyml-utils/example/attic/main201.py diff --git a/energyml-utils/example/main_data.py b/energyml-utils/example/attic/main_data.py similarity index 95% rename from energyml-utils/example/main_data.py rename to energyml-utils/example/attic/main_data.py index 52ff8ee..10ad492 100644 --- a/energyml-utils/example/main_data.py +++ b/energyml-utils/example/attic/main_data.py @@ -11,11 +11,11 @@ ) from energyml.utils.data.export import export_obj -from src.energyml.utils.data.helper import ( +from energyml.utils.data.helper import ( get_array_reader_function, read_array, ) -from src.energyml.utils.data.mesh import ( +from energyml.utils.data.mesh import ( GeoJsonGeometryType, MeshFileFormat, _create_shape, @@ -24,29 +24,29 @@ export_off, read_mesh_object, ) -from src.energyml.utils.epc import gen_energyml_object_path -from src.energyml.utils.introspection import ( +from energyml.utils.epc import gen_energyml_object_path +from energyml.utils.introspection import ( get_object_attribute, is_abstract, get_obj_uuid, search_attribute_matching_name_with_path, ) -from src.energyml.utils.manager import get_sub_classes -from src.energyml.utils.serialization import ( +from energyml.utils.manager import get_sub_classes +from energyml.utils.serialization import ( read_energyml_xml_file, read_energyml_xml_str, read_energyml_xml_bytes, read_energyml_xml_tree, ) -from 
src.energyml.utils.validation import validate_epc -from src.energyml.utils.xml import get_tree -from src.energyml.utils.data.datasets_io import ( +from energyml.utils.validation import validate_epc +from energyml.utils.xml_utils import get_tree +from energyml.utils.data.datasets_io import ( HDF5FileReader, get_path_in_external_with_path, get_external_file_path_from_external_path, ) from energyml.utils.epc import Epc -from src.energyml.utils.data.mesh import ( +from energyml.utils.data.mesh import ( read_polyline_representation, read_point_representation, read_grid2d_representation, @@ -165,7 +165,8 @@ def read_h5_polyline(): def read_h5_grid2d_bis(): - path = "../rc/obj_Grid2dRepresentation_7c43bad9-4cad-4ab0-bb50-9afb24a4b883.xml" + path = "rc/obj_Grid2dRepresentation_7c43bad9-4cad-4ab0-bb50-9afb24a4b883.xml" + # path = "../rc/obj_Grid2dRepresentation_7c43bad9-4cad-4ab0-bb50-9afb24a4b883.xml" xml_content = "" with open(path, "r") as f: @@ -179,12 +180,12 @@ def read_h5_grid2d_bis(): ) uuid = get_obj_uuid(grid) print("Exporting") - with open(f"result/grid2d_{uuid}.obj", "wb") as f: + with open(f"rc/result/grid2d_{uuid}.obj", "wb") as f: export_obj( mesh_list=grid_list, out=f, ) - with open(f"result/grid2d_{uuid}_bis.off", "wb") as f: + with open(f"rc/result/grid2d_{uuid}_bis.off", "wb") as f: export_off( mesh_list=grid_list, out=f, @@ -206,12 +207,12 @@ def read_h5_grid2d_ter(): ) uuid = get_obj_uuid(grid) print("Exporting") - with open(f"result/grid2d_{uuid}.obj", "wb") as f: + with open(f"rc/result/grid2d_{uuid}.obj", "wb") as f: export_obj( mesh_list=grid_list, out=f, ) - with open(f"result/grid2d_{uuid}_bis.off", "wb") as f: + with open(f"rc/result/grid2d_{uuid}_bis.off", "wb") as f: export_off( mesh_list=grid_list, out=f, @@ -248,12 +249,12 @@ def read_h5_grid2d(): # keep_holes=False ) print("Exporting") - with open(f"result/grid2d_{uuid}.obj", "wb") as f: + with open(f"rc/result/grid2d_{uuid}.obj", "wb") as f: export_obj( mesh_list=grid_list, out=f, ) - with 
open(f"result/grid2d_{uuid}.off", "wb") as f: + with open(f"rc/result/grid2d_{uuid}.off", "wb") as f: export_off( mesh_list=grid_list, out=f, @@ -272,12 +273,12 @@ def read_meshes(): workspace=epc22, ) print("Exporting") - with open(f"result/{gen_energyml_object_path(energyml_obj)}.obj", "wb") as f: + with open(f"rc/result/{gen_energyml_object_path(energyml_obj)}.obj", "wb") as f: export_obj( mesh_list=mesh_list, out=f, ) - with open(f"result/{gen_energyml_object_path(energyml_obj)}.off", "wb") as f: + with open(f"rc/result/{gen_energyml_object_path(energyml_obj)}.off", "wb") as f: export_off( mesh_list=mesh_list, out=f, diff --git a/energyml-utils/example/main_datasets.py b/energyml-utils/example/attic/main_datasets.py similarity index 100% rename from energyml-utils/example/main_datasets.py rename to energyml-utils/example/attic/main_datasets.py diff --git a/energyml-utils/example/main_stream.py b/energyml-utils/example/attic/main_stream.py similarity index 99% rename from energyml-utils/example/main_stream.py rename to energyml-utils/example/attic/main_stream.py index 87f529a..db354d0 100644 --- a/energyml-utils/example/main_stream.py +++ b/energyml-utils/example/attic/main_stream.py @@ -13,7 +13,7 @@ from energyml.utils.introspection import get_obj_uri from energyml.utils.constants import EpcExportVersion -from energyml.utils.epc_stream import read_epc_stream +from energyml.utils.epc_stream_old import read_epc_stream from energyml.utils.epc import ( Epc, create_energyml_object, diff --git a/energyml-utils/example/attic/main_stream_sample.py b/energyml-utils/example/attic/main_stream_sample.py new file mode 100644 index 0000000..7afc13f --- /dev/null +++ b/energyml-utils/example/attic/main_stream_sample.py @@ -0,0 +1,691 @@ +import os +import shutil +import sys +import logging +from energyml.utils.epc_stream import EpcStreamReader, RelsUpdateMode +from energyml.eml.v2_3.commonv2 import Citation, ExternalDataArrayPart +from energyml.resqml.v2_2.resqmlv2 import ( 
+ TriangulatedSetRepresentation, + BoundaryFeatureInterpretation, + BoundaryFeature, + HorizonInterpretation, + TrianglePatch, + IntegerExternalArray, + ExternalDataArray, + PointGeometry, + Point3DExternalArray, +) + +from energyml.resqml.v2_0_1.resqmlv2 import TrianglePatch as TrianglePatchV2_0_1 +from energyml.utils.introspection import epoch_to_date, epoch +from energyml.utils.epc import as_dor, gen_uuid, get_obj_identifier +from energyml.utils.constants import EPCRelsRelationshipType, MimeType + +from energyml.opc.opc import Relationship +from energyml.utils.data.datasets_io import get_handler_registry +import numpy as np + + +CONST_H5_PATH = "external_data.h5" +CONST_CSV_PATH = "external_data.csv" +CONST_PARQUET_PATH = "external_data.parquet" +CONST_LAS_PATH = "external_data.las" +CONST_SEGY_PATH = "external_data.sgy" + + +def sample_objects(): + """Create sample EnergyML objects for testing.""" + # Create a BoundaryFeature + bf = BoundaryFeature( + citation=Citation( + title="Test Boundary Feature", + originator="Test", + creation=epoch_to_date(epoch()), + ), + uuid="25773477-ffee-4cc2-867d-000000000001", + object_version="1.0", + ) + + # Create a BoundaryFeatureInterpretation + bfi = BoundaryFeatureInterpretation( + citation=Citation( + title="Test Boundary Feature Interpretation", + originator="Test", + creation=epoch_to_date(epoch()), + ), + uuid="25773477-ffee-4cc2-867d-000000000002", + object_version="1.0", + interpreted_feature=as_dor(bf), + ) + + # Create a HorizonInterpretation (independent object) + horizon_interp = HorizonInterpretation( + citation=Citation( + title="Test HorizonInterpretation", + originator="Test", + creation=epoch_to_date(epoch()), + ), + interpreted_feature=as_dor(bf), + uuid="25773477-ffee-4cc2-867d-000000000003", + object_version="1.0", + domain="depth", + ) + + # Create a TriangulatedSetRepresentation + trset_uuid = "25773477-ffee-4cc2-867d-000000000004" + trset = TriangulatedSetRepresentation( + citation=Citation( + 
title="Test TriangulatedSetRepresentation", + originator="Test", + creation=epoch_to_date(epoch()), + ), + uuid="25773477-ffee-4cc2-867d-000000000004", + object_version="1.0", + represented_object=as_dor(horizon_interp), + triangle_patch=[ + TrianglePatch( + node_count=3, + triangles=IntegerExternalArray( + values=ExternalDataArray( + external_data_array_part=[ + ExternalDataArrayPart( + count=[6], + path_in_external_file=f"/RESQML/{trset_uuid}/triangles", + uri=CONST_H5_PATH, + mime_type=str(MimeType.HDF5), + ) + ] + ) + ), + geometry=PointGeometry( + points=Point3DExternalArray( + coordinates=ExternalDataArray( + external_data_array_part=[ + ExternalDataArrayPart( + count=[9], + path_in_external_file=f"/RESQML/{trset_uuid}/points", + uri=CONST_CSV_PATH, + mime_type=str(MimeType.CSV), + ) + ] + ) + ), + ), + ) + ], + ) + + return { + "bf": bf, + "bfi": bfi, + "trset": trset, + "horizon_interp": horizon_interp, + } + + +def main(epc_file_path: str): + epc = EpcStreamReader( + epc_file_path=epc_file_path, enable_parallel_rels=True, rels_update_mode=RelsUpdateMode.UPDATE_AT_MODIFICATION + ) + + # logging.info(epc.get_statistics()) + + for obj in epc.list_objects(): + logging.info(f"Object: {obj}") + + +def test_create_epc(path: str): + # delete file if exists + if os.path.exists(path): + os.remove(path) + + # Calculate the EPC directory for cleanup + epc_dir = os.path.dirname(path) if os.path.dirname(path) else "." 
+ + # Clean up old external files if they exist (to avoid stale data) + for old_file in [ + os.path.join(epc_dir, CONST_H5_PATH), + os.path.join(epc_dir, CONST_CSV_PATH), + os.path.join(epc_dir, CONST_PARQUET_PATH), + os.path.join(epc_dir, CONST_LAS_PATH), + os.path.join(epc_dir, CONST_SEGY_PATH), + ]: + if os.path.exists(old_file): + os.remove(old_file) + logging.info(f"Cleaned up old external file: {old_file}") + + logging.info(f"==> Creating new EPC at {path}...") + epc = EpcStreamReader(epc_file_path=path, rels_update_mode=RelsUpdateMode.UPDATE_AT_MODIFICATION) + + data = sample_objects() + + logging.info("==> Creating sample objects and adding to EPC...") + + logging.info("==> Adding horizon interpretation") + epc.add_object(data["horizon_interp"]) + logging.info(f"horizon rels : {epc.get_obj_rels(data['horizon_interp'])}") + + logging.info("==> Adding boundary feature") + epc.add_object(data["bf"]) + logging.info(f"boundary feature rels : {epc.get_obj_rels(data['bf'])}") + + logging.info("==> Adding boundary feature interpretation") + epc.add_object(data["bfi"]) + logging.info("==> Adding triangulated set representation") + epc.add_object(data["trset"]) + + # Debug: Print all metadata identifiers + logging.info(f"==> All metadata identifiers: {list(epc._metadata_mgr._metadata.keys())}") + + logging.info("==> All objects added. 
Closing EPC to write to disk.") + + horizon_id = get_obj_identifier(data["horizon_interp"]) + logging.info(f"==> Horizon identifier: {horizon_id}") + logging.info(f"==> Horizon in metadata: {horizon_id in epc._metadata_mgr._metadata}") + + # Debug: Test _id_from_uri_or_identifier + resolved_id = epc._id_from_uri_or_identifier(data["horizon_interp"]) + logging.info(f"==> Resolved ID from object: {resolved_id}") + logging.info( + f"==> Resolved ID in metadata: {resolved_id in epc._metadata_mgr._metadata if resolved_id else 'ID is None'}" + ) + + horizon_rels = epc.get_obj_rels(data["horizon_interp"]) + assert ( + len(horizon_rels) == 2 + ), f"Expected 2 relationships in horizon rels since both bfi and trset should refer to horizon as interpreted feature {horizon_rels}" + epc.close() + + epc_reopen = EpcStreamReader(epc_file_path=path, rels_update_mode=RelsUpdateMode.UPDATE_AT_MODIFICATION) + + horizon_rels = epc_reopen.get_obj_rels(data["horizon_interp"]) + assert ( + len(horizon_rels) == 2 + ), f"Expected 2 relationships in horizon rels since both bfi and trset should refer to horizon as interpreted feature {horizon_rels}" + + logging.info("==> Reopened EPC, listing objects:") + for obj in epc_reopen.list_objects(): + logging.info(f"Object: {obj}") + obj_rels = epc_reopen.get_obj_rels(obj) + logging.info(f"\tObject rels: {obj_rels}") + dest_rels = [r for r in obj_rels if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT)] + logging.info(f"\tObject DESTINATION rels: {dest_rels}") + + # remove trset to check if horizon has no more source rels + epc_reopen.remove_object(data["trset"]) + + horizon_rels_after_removal = epc_reopen.get_obj_rels(data["horizon_interp"]) + logging.info(f"Horizon interpretation rels after removing trset: {horizon_rels_after_removal}") + source_rels_after_removal = [ + r for r in horizon_rels_after_removal if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT) + ] + logging.info(f"Horizon interpretation SOURCE rels after 
removing trset: {source_rels_after_removal}") + assert ( + len(source_rels_after_removal) == 0 + ), "Expected no SOURCE relationships in horizon rels after removing trset since trset was the only destination referring to horizon" + + assert ( + len(horizon_rels_after_removal) == 1 + ), "Expected 1 relationship in horizon rels after removing trset since bfi should still refer to horizon as interpreted feature" + + epc_reopen.close() + + +def test_create_epc_v2(path: str): + + if os.path.exists(path): + os.remove(path) + logging.info(f"==> Creating new EPC at {path}...") + epc = EpcStreamReader(epc_file_path=path, rels_update_mode=RelsUpdateMode.UPDATE_AT_MODIFICATION) + + data = sample_objects() + + epc.add_object(data["bf"]) + # epc.add_object(data["bfi"]) + epc.add_object(data["horizon_interp"]) + epc.add_object(data["trset"]) + + hi_rels = epc.get_obj_rels(data["horizon_interp"]) + + logging.info(f"Horizon interpretation rels: {hi_rels}") + + +def test_create_epc_v3_with_different_external_files(path: str): + # Define interesting test arrays with edge cases: 2D arrays with null values, zeros, negatives, special values + # HDF5 test array: Integer triangles with zeros and varied values (2D: 3 triangles x 3 vertices) + h5_triangles = np.array([[0, 1, 2], [2, 3, 0], [-1, 4, 5]], dtype=np.int32) # Including negative value and zero + + # CSV test array: 3D coordinates with NaN values (2D: 5 points x 3 coords) + csv_points = np.array( + [ + [0.0, 0.0, 0.0], + [1.0, np.nan, 0.0], # NaN value + [1.0, 1.0, 0.0], + [0.0, 1.0, np.nan], # Another NaN + [0.5, 0.5, -1.5], # Negative value + ], + dtype=np.float32, + ) + + # Parquet test array: Normals with special float values (2D: 4 points x 3 components) + parquet_normals = np.array( + [ + [0.0, 0.0, 1.0], + [np.inf, 0.0, 0.0], # Positive infinity + [-np.inf, 0.0, 0.0], # Negative infinity + [0.0, np.nan, 1.0], # NaN value + ], + dtype=np.float32, + ) + + # LAS test array: Well log data with null values (2D: 10 depth points x 
3 curves) + las_well_log = np.array( + [ + [1000.0, 75.5, 2.35], + [1001.0, 80.2, 2.40], + [1002.0, np.nan, 2.38], # Missing GR value + [1003.0, 85.1, np.nan], # Missing RHOB value + [1004.0, 90.0, 2.42], + [1005.0, 0.0, 2.45], # Zero GR (valid but unusual) + [1006.0, 95.5, 2.50], + [1007.0, np.nan, np.nan], # Multiple nulls + [1008.0, 100.0, 2.55], + [1009.0, -10.5, 2.60], # Negative value (calibration artifact) + ], + dtype=np.float32, + ) + + # SEG-Y test array: Seismic traces with various edge cases (2D: 5 traces x 8 samples) + segy_seismic = np.array( + [ + [0.0, 0.5, 1.0, 0.5, 0.0, -0.5, -1.0, -0.5], + [1.0, 0.8, 0.6, 0.4, 0.2, 0.0, -0.2, -0.4], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # Silent trace (all zeros) + [-1.0, -0.8, -0.6, -0.4, -0.2, 0.0, 0.2, 0.4], + [np.nan, 0.1, 0.2, 0.3, np.nan, 0.5, 0.6, np.nan], # Traces with NaN (dead traces) + ], + dtype=np.float32, + ) + + if os.path.exists(path): + os.remove(path) + + # Calculate the EPC directory + epc_dir = os.path.dirname(path) if os.path.dirname(path) else "." 
+ + # Clean up old external files if they exist (to avoid stale data) + for old_file in [ + os.path.join(epc_dir, CONST_H5_PATH), + os.path.join(epc_dir, CONST_CSV_PATH), + os.path.join(epc_dir, CONST_PARQUET_PATH), + os.path.join(epc_dir, CONST_LAS_PATH), + os.path.join(epc_dir, CONST_SEGY_PATH), + ]: + if os.path.exists(old_file): + os.remove(old_file) + logging.info(f"Cleaned up old external file: {old_file}") + + logging.info(f"==> Creating new EPC at {path}...") + epc = EpcStreamReader(epc_file_path=path, rels_update_mode=RelsUpdateMode.UPDATE_AT_MODIFICATION) + + data = sample_objects() + + epc.add_object(data["bf"]) + epc.add_object(data["horizon_interp"]) + tr_set_id = epc.add_object(data["trset"]) + + hi_rels = epc.get_obj_rels(data["horizon_interp"]) + logging.info(f"Horizon interpretation rels: {hi_rels}") + + # ========== HDF5 Test ========== + logging.info("\n" + "=" * 60) + logging.info("==> Testing HDF5 format...") + h5_file_path = "wip/notARealFile.h5" + h5_path_in_external = f"/RESQML/{tr_set_id}/triangles" + epc.add_rels_for_object( + tr_set_id, + relationships=[Relationship(type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), target=h5_file_path)], + ) + epc.write_array( + proxy=tr_set_id, + path_in_external=h5_path_in_external, + array=h5_triangles, + external_uri=CONST_H5_PATH, + ) + logging.info(f"Written HDF5 array shape: {h5_triangles.shape}, dtype: {h5_triangles.dtype}") + logging.info(f"HDF5 test array content:\n{h5_triangles}") + + # ========== CSV Test ========== + logging.info("\n" + "=" * 60) + logging.info("==> Testing CSV format...") + csv_path_in_external = f"/RESQML/{tr_set_id}/points" + epc.write_array( + proxy=tr_set_id, + path_in_external=csv_path_in_external, + array=csv_points, + external_uri=CONST_CSV_PATH, + ) + logging.info(f"Written CSV array shape: {csv_points.shape}, dtype: {csv_points.dtype}") + logging.info(f"CSV test array content:\n{csv_points}") + + # ========== Parquet Test ========== + logging.info("\n" + 
"=" * 60) + logging.info("==> Testing Parquet format...") + parquet_file_path = "wip/test_data.parquet" + parquet_path_in_external = f"/RESQML/{tr_set_id}/normals" + epc.add_rels_for_object( + tr_set_id, + relationships=[ + Relationship(type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), target=parquet_file_path) + ], + ) + epc.write_array( + proxy=tr_set_id, + path_in_external=parquet_path_in_external, + array=parquet_normals, + external_uri=CONST_PARQUET_PATH, + ) + logging.info(f"Written Parquet array shape: {parquet_normals.shape}, dtype: {parquet_normals.dtype}") + logging.info(f"Parquet test array content:\n{parquet_normals}") + + # ========== LAS Test ========== + logging.info("\n" + "=" * 60) + logging.info("==> Testing LAS format...") + las_file_path = "wip/test_well_log.las" + las_path_in_external = "DEPTH,GR,RHOB" # LAS mnemonics + epc.add_rels_for_object( + tr_set_id, + relationships=[Relationship(type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), target=las_file_path)], + ) + epc.write_array( + proxy=tr_set_id, + path_in_external=las_path_in_external, + array=las_well_log, + external_uri=CONST_LAS_PATH, + ) + logging.info(f"Written LAS array shape: {las_well_log.shape}, dtype: {las_well_log.dtype}") + logging.info(f"LAS test array content:\n{las_well_log}") + + # ========== SEG-Y Test ========== + logging.info("\n" + "=" * 60) + logging.info("==> Testing SEG-Y format...") + segy_file_path = "wip/test_seismic.sgy" + segy_path_in_external = "traces" # SEG-Y standard path + epc.add_rels_for_object( + tr_set_id, + relationships=[Relationship(type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), target=segy_file_path)], + ) + epc.write_array( + proxy=tr_set_id, + path_in_external=segy_path_in_external, + array=segy_seismic, + external_uri=CONST_SEGY_PATH, + ) + logging.info(f"Written SEG-Y array shape: {segy_seismic.shape}, dtype: {segy_seismic.dtype}") + logging.info(f"SEG-Y test array content:\n{segy_seismic}") + + logging.info("\n" + 
"=" * 60) + logging.info("==> Successfully wrote data to all supported file formats:") + logging.info(f" - HDF5: {CONST_H5_PATH}") + logging.info(f" - CSV: {CONST_CSV_PATH}") + logging.info(f" - Parquet: {CONST_PARQUET_PATH}") + logging.info(f" - LAS: {CONST_LAS_PATH}") + logging.info(f" - SEG-Y: {CONST_SEGY_PATH}") + + # ========== Read Back and Verify ========== + logging.info("\n" + "#" * 60) + logging.info("### VERIFICATION: Reading back arrays and comparing ###") + logging.info("#" * 60) + + registry = get_handler_registry() + verification_passed = True + + # Construct full paths to external files (relative to EPC location) + h5_full_path = os.path.join(epc_dir, CONST_H5_PATH) + csv_full_path = os.path.join(epc_dir, CONST_CSV_PATH) + parquet_full_path = os.path.join(epc_dir, CONST_PARQUET_PATH) + las_full_path = os.path.join(epc_dir, CONST_LAS_PATH) + segy_full_path = os.path.join(epc_dir, CONST_SEGY_PATH) + + logging.info(f"Reading files from EPC directory: {epc_dir}") + logging.info(f" - HDF5: {h5_full_path}") + logging.info(f" - CSV: {csv_full_path}") + logging.info(f" - Parquet: {parquet_full_path}") + logging.info(f" - LAS: {las_full_path}") + logging.info(f" - SEG-Y: {segy_full_path}") + + def arrays_equal(arr1, arr2, name): + """Compare two arrays handling NaN, inf, and other special values.""" + try: + # Check shapes first + if arr1.shape != arr2.shape: + logging.error(f"[{name}] Shape mismatch: {arr1.shape} != {arr2.shape}") + # Try to reshape if total size matches + if arr1.size == arr2.size: + logging.warning(f"[{name}] Arrays have same total size ({arr1.size}), attempting reshape...") + try: + arr2_reshaped = arr2.reshape(arr1.shape) + logging.info(f"[{name}] Reshape successful, comparing reshaped arrays...") + return arrays_equal(arr1, arr2_reshaped, name + " (reshaped)") + except Exception as reshape_err: + logging.error(f"[{name}] Reshape failed: {reshape_err}") + return False + + # Check dtypes + if arr1.dtype != arr2.dtype: + logging.warning( 
+ f"[{name}] Dtype difference: {arr1.dtype} != {arr2.dtype} (attempting comparison anyway)" + ) + + # Use numpy's array_equal which handles NaN properly with equal_nan=True + are_equal = np.array_equal(arr1, arr2, equal_nan=True) + + if not are_equal: + # Provide detailed difference information + try: + diff_mask = ~np.isclose(arr1, arr2, equal_nan=True, rtol=1e-5, atol=1e-8) + n_diff = np.sum(diff_mask) + + if n_diff == 0: + # Arrays are actually equal (dtype conversion issue) + logging.info(f"[{name}] Arrays are equal (dtype conversion handled)") + return True + + logging.error(f"[{name}] Arrays differ in {n_diff} elements") + if n_diff < 20: # Only show details if not too many differences + logging.error( + f"[{name}] Differences:\nExpected:\n{arr1[diff_mask]}\nActual:\n{arr2[diff_mask]}" + ) + except Exception as diff_err: + logging.error(f"[{name}] Could not compute differences: {diff_err}") + return False + + return True + except Exception as e: + logging.error(f"[{name}] Comparison error: {e}") + return False + + # --- HDF5 Verification --- + logging.info("\n" + "=" * 60) + logging.info("==> Verifying HDF5 format...") + h5_handler = registry.get_handler_for_file(h5_full_path) + if h5_handler: + # Get metadata + h5_metadata = h5_handler.get_array_metadata(h5_full_path, h5_path_in_external) + logging.info(f"HDF5 Metadata: {h5_metadata}") + + # Read back + h5_read_back = h5_handler.read_array(h5_full_path, h5_path_in_external) + if h5_read_back is not None: + logging.info(f"Read back HDF5 array shape: {h5_read_back.shape}, dtype: {h5_read_back.dtype}") + logging.info(f"Read back HDF5 content:\n{h5_read_back}") + + # Verify + if arrays_equal(h5_triangles, h5_read_back, "HDF5"): + logging.info("✓ HDF5 verification PASSED") + else: + logging.error("✗ HDF5 verification FAILED") + verification_passed = False + else: + logging.error("✗ HDF5 read returned None") + verification_passed = False + else: + logging.error("✗ HDF5 handler not available") + verification_passed 
= False + + # --- CSV Verification --- + logging.info("\n" + "=" * 60) + logging.info("==> Verifying CSV format...") + csv_handler = registry.get_handler_for_file(csv_full_path) + if csv_handler: + # Get metadata + csv_metadata = csv_handler.get_array_metadata(csv_full_path) + logging.info(f"CSV Metadata: {csv_metadata}") + + # Read back + csv_read_back = csv_handler.read_array(csv_full_path) + if csv_read_back is not None: + logging.info(f"Read back CSV array shape: {csv_read_back.shape}, dtype: {csv_read_back.dtype}") + logging.info(f"Read back CSV content:\n{csv_read_back}") + + # Verify + if arrays_equal(csv_points, csv_read_back, "CSV"): + logging.info("✓ CSV verification PASSED") + else: + logging.error("✗ CSV verification FAILED") + verification_passed = False + else: + logging.error("✗ CSV read returned None") + verification_passed = False + else: + logging.error("✗ CSV handler not available") + verification_passed = False + + # --- Parquet Verification --- + logging.info("\n" + "=" * 60) + logging.info("==> Verifying Parquet format...") + parquet_handler = registry.get_handler_for_file(parquet_full_path) + if parquet_handler: + # Get metadata + parquet_metadata = parquet_handler.get_array_metadata(parquet_full_path) + logging.info(f"Parquet Metadata: {parquet_metadata}") + + # Read back + parquet_read_back = parquet_handler.read_array(parquet_full_path) + if parquet_read_back is not None: + logging.info(f"Read back Parquet array shape: {parquet_read_back.shape}, dtype: {parquet_read_back.dtype}") + logging.info(f"Read back Parquet content:\n{parquet_read_back}") + + # Verify + if arrays_equal(parquet_normals, parquet_read_back, "Parquet"): + logging.info("✓ Parquet verification PASSED") + else: + logging.error("✗ Parquet verification FAILED") + verification_passed = False + else: + logging.error("✗ Parquet read returned None") + verification_passed = False + else: + logging.error("✗ Parquet handler not available") + verification_passed = False + + # --- 
LAS Verification --- + logging.info("\n" + "=" * 60) + logging.info("==> Verifying LAS format...") + las_handler = registry.get_handler_for_file(las_full_path) + if las_handler: + # Get metadata + las_metadata = las_handler.get_array_metadata(las_full_path) + logging.info(f"LAS Metadata: {las_metadata}") + + # Read back + las_read_back = las_handler.read_array(las_full_path, las_path_in_external) + if las_read_back is not None: + logging.info(f"Read back LAS array shape: {las_read_back.shape}, dtype: {las_read_back.dtype}") + logging.info(f"Read back LAS content:\n{las_read_back}") + + # Verify + if arrays_equal(las_well_log, las_read_back, "LAS"): + logging.info("✓ LAS verification PASSED") + else: + logging.error("✗ LAS verification FAILED") + verification_passed = False + else: + logging.error("✗ LAS read returned None") + verification_passed = False + else: + logging.error("✗ LAS handler not available") + verification_passed = False + + # --- SEG-Y Verification --- + logging.info("\n" + "=" * 60) + logging.info("==> Verifying SEG-Y format...") + segy_handler = registry.get_handler_for_file(segy_full_path) + if segy_handler: + # Get metadata + segy_metadata = segy_handler.get_array_metadata(segy_full_path) + logging.info(f"SEG-Y Metadata: {segy_metadata}") + + # Read back + segy_read_back = segy_handler.read_array(segy_full_path, segy_path_in_external) + if segy_read_back is not None: + logging.info(f"Read back SEG-Y array shape: {segy_read_back.shape}, dtype: {segy_read_back.dtype}") + logging.info(f"Read back SEG-Y content:\n{segy_read_back}") + + # Verify + if arrays_equal(segy_seismic, segy_read_back, "SEG-Y"): + logging.info("✓ SEG-Y verification PASSED") + else: + logging.error("✗ SEG-Y verification FAILED") + verification_passed = False + else: + logging.error("✗ SEG-Y read returned None") + verification_passed = False + else: + logging.error("✗ SEG-Y handler not available") + verification_passed = False + + # Final summary + logging.info("\n" + "#" * 60) 
+ if verification_passed: + logging.info("### ✓✓✓ ALL VERIFICATIONS PASSED ✓✓✓ ###") + else: + logging.error("### ✗✗✗ SOME VERIFICATIONS FAILED ✗✗✗ ###") + logging.info("#" * 60) + + # Close and verify + epc.close() + logging.info("==> EPC file closed successfully") + + +def recompute_rels(path: str): + EpcStreamReader(epc_file_path=path, enable_parallel_rels=True, rels_update_mode=RelsUpdateMode.UPDATE_ON_CLOSE) + + +def recompute_rels_change_name(path: str): + path_reshaped = path.replace(".epc", "_reshaped.epc") + path_reshaped_seq = path.replace(".epc", "_reshaped_seq.epc") + shutil.copy(path, path_reshaped) + shutil.copy(path, path_reshaped_seq) + EpcStreamReader( + epc_file_path=path_reshaped, enable_parallel_rels=True, rels_update_mode=RelsUpdateMode.UPDATE_ON_CLOSE + ) + EpcStreamReader( + epc_file_path=path_reshaped_seq, enable_parallel_rels=False, rels_update_mode=RelsUpdateMode.UPDATE_ON_CLOSE + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + + # main((sys.argv[1] if len(sys.argv) > 1 else None) or "wip/80wells_surf.epc") + + # test_create_epc("wip/test_create.epc") + # test_create_epc_v2("wip/test_create.epc") + # test_create_epc_v3_with_different_external_files("wip/test_create_v3.epc") + + # recompute_rels_change_name(sys.argv[1] if len(sys.argv) > 1 else "wip/failingData/fix/S-PASS-1-EARTHMODEL_ONLY.epc") + # recompute_rels_change_name(sys.argv[1] if len(sys.argv) > 1 else "wip/failingData/fix/S-PASS-1-GEOMODEL.epc") + recompute_rels_change_name( + sys.argv[1] if len(sys.argv) > 1 else "wip/failingData/fix/sample_mini_firp_201_norels_with_media.epc" + ) diff --git a/energyml-utils/example/main_test_3D.py b/energyml-utils/example/attic/main_test_3D.py similarity index 83% rename from energyml-utils/example/main_test_3D.py rename to energyml-utils/example/attic/main_test_3D.py index 0657bdf..e6a90cb 100644 --- a/energyml-utils/example/main_test_3D.py +++ b/energyml-utils/example/attic/main_test_3D.py @@ -44,12 +44,12 @@ 
def export_all_representation(epc_path: str, output_dir: str, regex_type_filter: mesh_list=mesh_list, out=f, ) - export_stl_path = path.with_suffix(".stl") - with export_stl_path.open("wb") as stl_f: - export_stl( - mesh_list=mesh_list, - out=stl_f, - ) + # export_stl_path = path.with_suffix(".stl") + # with export_stl_path.open("wb") as stl_f: + # export_stl( + # mesh_list=mesh_list, + # out=stl_f, + # ) export_vtk_path = path.with_suffix(".vtk") with export_vtk_path.open("wb") as vtk_f: export_vtk( @@ -103,18 +103,18 @@ def export_all_representation_in_memory(epc_path: str, output_dir: str, regex_ty mesh_list=mesh_list, out=f, ) - export_stl_path = path.with_suffix(".stl") - with export_stl_path.open("wb") as stl_f: - export_stl( - mesh_list=mesh_list, - out=stl_f, - ) - export_vtk_path = path.with_suffix(".vtk") - with export_vtk_path.open("wb") as vtk_f: - export_vtk( - mesh_list=mesh_list, - out=vtk_f, - ) + # export_stl_path = path.with_suffix(".stl") + # with export_stl_path.open("wb") as stl_f: + # export_stl( + # mesh_list=mesh_list, + # out=stl_f, + # ) + # export_vtk_path = path.with_suffix(".vtk") + # with export_vtk_path.open("wb") as vtk_f: + # export_vtk( + # mesh_list=mesh_list, + # out=vtk_f, + # ) logging.info(f" ✓ Exported to {path.name}") except NotSupportedError: @@ -135,8 +135,9 @@ def export_all_representation_in_memory(epc_path: str, output_dir: str, regex_ty import logging logging.basicConfig(level=logging.DEBUG) + epc_file = "rc/epc/testingPackageCpp22.epc" # epc_file = "rc/epc/testingPackageCpp.epc" - epc_file = "rc/epc/output-val.epc" + # epc_file = "rc/epc/output-val.epc" # epc_file = "rc/epc/Volve_Horizons_and_Faults_Depth_originEQN.epc" output_directory = Path("exported_meshes") / Path(epc_file).name.replace(".epc", "_3D_export") # export_all_representation(epc_file, output_directory) diff --git a/energyml-utils/example/mainjson.py b/energyml-utils/example/attic/mainjson.py similarity index 100% rename from 
energyml-utils/example/mainjson.py rename to energyml-utils/example/attic/mainjson.py diff --git a/energyml-utils/example/attic/misc_test.py b/energyml-utils/example/attic/misc_test.py new file mode 100644 index 0000000..2a3988c --- /dev/null +++ b/energyml-utils/example/attic/misc_test.py @@ -0,0 +1,32 @@ +from energyml.utils.epc_utils import get_dor_uris_from_obj +from energyml.utils.introspection import get_obj_uri, search_attribute_matching_type_with_path +from energyml.utils.serialization import ( + serialize_xml, + read_energyml_xml_str, + read_energyml_xml_file, + read_energyml_xml_bytes, + read_energyml_json_str, + read_energyml_json_bytes, + JSON_VERSION, +) + + +def test_as_uri(xml_path: str): + obj = read_energyml_xml_file(xml_path) + + # print(obj) + + for uri in get_dor_uris_from_obj(obj): + print(uri) + print("=" * 40) + print(obj.category_lookup) + print(get_obj_uri(obj.category_lookup)) + + print("=" * 40) + for p, o in search_attribute_matching_type_with_path(obj, "DataObjectreference"): + print(f"{p}: {o} ({get_obj_uri(o)})\n") + + +if __name__ == "__main__": + # test_as_uri("rc/ContinuousProperty_1d34249c-4c4f-4705-870e-b5dea9c0d78e.xml") + test_as_uri("rc/DiscreteProperty.xml") diff --git a/energyml-utils/example/attic/parsing_improvement_test.py b/energyml-utils/example/attic/parsing_improvement_test.py new file mode 100644 index 0000000..a728fb1 --- /dev/null +++ b/energyml-utils/example/attic/parsing_improvement_test.py @@ -0,0 +1,153 @@ +""" +Test for parsing. 
+ +To test : edit _read_energyml_xml_bytes_as_class in serialization.py : + +__ENV__IMPROVEMENT__ = "__ENV__IMPROVEMENT__" +__ENV__IMPROVEMENT_LXML__ = "__ENV__IMPROVEMENT_LXML__" + + if os.environ.get(__ENV__IMPROVEMENT__, "0") == "0": + if os.environ.get(__ENV__IMPROVEMENT_LXML__, "0") == "1": + parser = XmlParser(config=config, handler=LxmlEventHandler) + else: + parser = XmlParser(config=config) + else: + if os.environ.get(__ENV__IMPROVEMENT_LXML__, "0") == "1": + parser = XmlParser(config=config, context=GLOBAL_XML_CONTEXT, handler=LxmlEventHandler) + else: + parser = XmlParser(config=config, context=GLOBAL_XML_CONTEXT) + +""" + +import logging +import operator +import os +import sys +import time +from typing import Optional + +from energyml.utils.epc import Epc +from energyml.utils.introspection import ( + search_class_in_module_from_partial_name, +) +from energyml.utils.manager import get_related_energyml_modules_name +from energyml.utils.serialization import read_energyml_xml_file, serialize_json + + +def reexport_in_memory_par_read(filepath: str, output_folder: Optional[str] = None): + is_opti = os.environ.get("__ENV__IMPROVEMENT__", "0") == "1" + + suffix = "opti" if is_opti else "std" + if os.environ.get("__ENV__IMPROVEMENT_LXML__", "0") == "1": + suffix += "_lxml" + if os.environ.get("__ENV__IMPROVEMENT__GET_MEMBER__", "0") == "1": + suffix += "_get_member" + + path_in_memory = filepath.replace(".epc", f"_parsing_imp_xml_{suffix}.epc") + if output_folder: + os.makedirs(output_folder, exist_ok=True) + path_in_memory = f"{output_folder}/{path_in_memory.split('/')[-1]}" + epc = Epc.read_file(epc_file_path=filepath, read_rels_from_files=False, read_parallel=True, recompute_rels=False) + + if os.path.exists(path_in_memory): + os.remove(path_in_memory) + epc.export_file(path_in_memory, parallel=True) + + +# =================================== + + +def time_test(f: callable, **kwargs): + print(f" Testing {f.__name__}...") + start = time.perf_counter() 
+ f(**kwargs) + elapsed_inmem = time.perf_counter() - start + # results.append(("In-Memory (Epc)", elapsed_inmem)) + print(f" Completed in {elapsed_inmem:.3f}s\n") + return ("In-Memory (Epc)", elapsed_inmem) + + +if __name__ == "__main__xmlcontext__": + logging.basicConfig(level=logging.DEBUG) + + os.environ["__ENV__IMPROVEMENT__"] = "0" + os.environ["__ENV__IMPROVEMENT_LXML__"] = "0" + + time_test( + reexport_in_memory_par_read, + filepath=sys.argv[1] if len(sys.argv) > 1 else "rc/epc/80wells_surf.epc", + output_folder="results", + ) + + os.environ["__ENV__IMPROVEMENT__"] = "1" + time_test( + reexport_in_memory_par_read, + filepath=sys.argv[1] if len(sys.argv) > 1 else "rc/epc/80wells_surf.epc", + output_folder="results", + ) + + os.environ["__ENV__IMPROVEMENT__"] = "1" + os.environ["__ENV__IMPROVEMENT_LXML__"] = "1" + time_test( + reexport_in_memory_par_read, + filepath=sys.argv[1] if len(sys.argv) > 1 else "rc/epc/80wells_surf.epc", + output_folder="results", + ) + +if __name__ == "__main__": + from energyml.resqml.v2_2.resqmlv2 import TriangulatedSetRepresentation + + print(TriangulatedSetRepresentation.__class__.__module__) + print(TriangulatedSetRepresentation.__dataclass_fields__.keys()) + + # logging.basicConfig(level=logging.DEBUG) + + # os.environ["__ENV__IMPROVEMENT__GET_MEMBER__"] = "0" + + time_test( + reexport_in_memory_par_read, + filepath=sys.argv[1] if len(sys.argv) > 1 else "rc/epc/80wells_surf.epc", + output_folder="results", + ) + + # os.environ["__ENV__IMPROVEMENT__GET_MEMBER__"] = "1" + # time_test( + # reexport_in_memory_par_read, + # filepath=sys.argv[1] if len(sys.argv) > 1 else "rc/epc/80wells_surf.epc", + # output_folder="results", + # ) + # class Test: + # def __init__(self): + # self.geometry = 1 + + # def hello(self): + # print("Hello") + + +if __name__ == "__main__2": + + grid = read_energyml_xml_file("rc/Grid2dRepresentation_78bf01c0-d5bb-46d3-aa70-9cc4ee5c8230.xml") + + print(serialize_json(grid)) + + # 
print(operator.attrgetter("geometry.points.zvalues.values.external_data_array_part.0")(grid)) + + test_dict = {"geometry": {"points": {"zvalues": {"values": {"external_data_array_part": ["test"]}}}}} + + print(operator.attrgetter("geometry.points.zvalues.values.external_data_array_part.0")(test_dict)) + + +if __name__ == "__main__": + + # print(is_abstract(Test)) + + # print(len(get_module_classes("energyml.resqml.v2_2.resqmlv2"))) + # print(get_module_classes_old("energyml.resqml.v2_2.resqmlv2")) + + # tr = TriangulatedSetRepresentation() + # print(get_class_methods(Epc))* + + # print(RELATED_MODULES_MAP) + # print(get_related_energyml_modules_name("energyml.resqml.v2_2.resqmlv2")) + + print(len(search_class_in_module_from_partial_name("energyml.resqml.v2_2.resqmlv2", "Representation"))) diff --git a/energyml-utils/example/attic/perf_tests.py b/energyml-utils/example/attic/perf_tests.py new file mode 100644 index 0000000..645be1a --- /dev/null +++ b/energyml-utils/example/attic/perf_tests.py @@ -0,0 +1,79 @@ +# Benchmark de performance pour get_obj_uuid +import time +import re + +UUID_RGX: re.Pattern = re.compile(r"[Uu]u?id|UUID") + + +# Version dot +def get_obj_uuid_pointe(obj): + try: + return obj.uuid + except AttributeError: + try: + return obj.uid + except AttributeError: + if isinstance(obj, dict): + for k in obj.keys(): + if UUID_RGX.match(k): + return obj[k] + return None + + +# Version originale +def get_obj_uuid_original(obj): + try: + return getattr(obj, "uuid", None) or getattr(obj, "uid") + except AttributeError: + if isinstance(obj, dict): + for k in obj.keys(): + if UUID_RGX.match(k): + return obj[k] + return None + + +# Version optimisée +def get_obj_uuid_fast(obj): + for attr in dir(obj): + if UUID_RGX.match(attr): + value = getattr(obj, attr, None) + if value is not None: + return value + if isinstance(obj, dict): + for k, v in obj.items(): + if UUID_RGX.match(k): + if v is not None: + return v + return None + + +# Simulation d'une classe 
TriangulatedSetRepresentation +class TriangulatedSetRepresentation: + def __init__(self, uuid): + self.uuid = uuid + + +N = 10000 +objs = [TriangulatedSetRepresentation(f"uuid-{i}") for i in range(N)] + +# Test version originale +start = time.perf_counter() +for obj in objs: + assert get_obj_uuid_original(obj) == obj.uuid +elapsed_original = time.perf_counter() - start + +# Test version optimisée +start = time.perf_counter() +for obj in objs: + assert get_obj_uuid_fast(obj) == obj.uuid +elapsed_fast = time.perf_counter() - start + +# Test version pointe +start = time.perf_counter() +for obj in objs: + assert get_obj_uuid_pointe(obj) == obj.uuid +elapsed_point = time.perf_counter() - start + +print(f"Original version: {elapsed_original:.6f} s for {N} calls") +print(f"Optimized version: {elapsed_fast:.6f} s for {N} calls") +print(f"Point version: {elapsed_point:.6f} s for {N} calls") diff --git a/energyml-utils/example/attic/test_list_object.py b/energyml-utils/example/attic/test_list_object.py new file mode 100644 index 0000000..7fa4934 --- /dev/null +++ b/energyml-utils/example/attic/test_list_object.py @@ -0,0 +1,71 @@ +from energyml.utils.epc_stream import EpcStreamReader, RelsUpdateMode +from datetime import datetime + + +def list_epc_classical(epc_file): + """List contents of an EPC file.""" + + if not isinstance(epc_file, list): + epc_file = [epc_file] + + for f in epc_file: + print(f"Processing EPC file: {f}") + epc = EpcStreamReader(f, rels_update_mode=RelsUpdateMode.MANUAL) + + time_start = datetime.now() + # for obj in epc.list_objects(): + # print(f"Object: {obj}") + print(len(epc.list_objects(object_type="resqml22.BoundaryFeature"))) + + for obj in sorted(epc.list_objects(object_type="resqml22.BoundaryFeature"), key=lambda o: o.title): + print(f"BoundaryFeature: {obj}") + for obj in sorted(epc.list_objects(object_type="resqml22.RockVolumeFeature"), key=lambda o: o.title): + print(f"RockVolumeFeature: {obj}") + time_end = datetime.now() + print(f"Time 
taken: {time_end - time_start}") + + +# def list_epc_fast(epc_file): +# """List contents of an EPC file using fast method.""" +# epc = EpcStreamReader( +# epc_file, +# rels_update_mode=RelsUpdateMode.MANUAL, +# ) + +# time_start = datetime.now() +# # for obj in epc.list_objects_parallel(): +# # print(f"Object: {obj}") +# print(len(epc.list_objects_parallel())) +# time_end = datetime.now() +# print(f"Time taken: {time_end - time_start}") + + +# def list_epc_seq(epc_file): +# """List contents of an EPC file using sequential method.""" +# epc = EpcStreamReader( +# epc_file, +# rels_update_mode=RelsUpdateMode.MANUAL, +# ) + +# time_start = datetime.now() +# # for obj in epc.list_objects_seq(): +# # print(f"Object: {obj}") +# print(len(epc.list_objects_seq())) +# time_end = datetime.now() +# print(f"Time taken: {time_end - time_start}") + + +if __name__ == "__main__": + epc_file = [ + "D:/Geosiris/Clients/BRGM/git/pointset-extraction/rc/output/full-local/full-local.epc", + "D:/Geosiris/Clients/BRGM/git/csv-to-energyml/rc/output/full-local/result-out-local-egis-full.epc", + ] + # epc_file = "D:/Geosiris/Clients/BRGM/git/pointset-extraction/rc/output/full-local/full-local.epc" + print("Listing EPC contents (classical method):") + list_epc_classical(epc_file) + + # print("Listing EPC contents (fast method):") + # list_epc_fast(epc_file) + + # print("Listing EPC contents (sequential method):") + # list_epc_seq(epc_file) diff --git a/energyml-utils/tests/test_parallel_rels_performance.py b/energyml-utils/example/attic/test_parallel_rels_performance.py similarity index 100% rename from energyml-utils/tests/test_parallel_rels_performance.py rename to energyml-utils/example/attic/test_parallel_rels_performance.py diff --git a/energyml-utils/example/attic/validate_epc_example.py b/energyml-utils/example/attic/validate_epc_example.py new file mode 100644 index 0000000..8a5f5e2 --- /dev/null +++ b/energyml-utils/example/attic/validate_epc_example.py @@ -0,0 +1,99 @@ 
+#!/usr/bin/env python3 +# Copyright (c) 2023-2024 Geosiris. +# SPDX-License-Identifier: Apache-2.0 +""" +Example script demonstrating EPC validation. + +This script shows how to validate EPC files and generate reports. +""" + +import sys +from pathlib import Path + +from energyml.utils.epc_validator import validate_epc_file + + +def validate_single_file(epc_path: str) -> None: + """Validate a single EPC file and print results.""" + print(f"\n{'=' * 70}") + print(f"Validating: {epc_path}") + print(f"{'=' * 70}\n") + + try: + result = validate_epc_file(epc_path, strict=True, check_relationships=True) + + print(result) + + if result.is_valid: + print("\n✓ Validation PASSED!") + else: + print("\n✗ Validation FAILED!") + sys.exit(1) + + except Exception as e: + print(f"\n✗ Error during validation: {e}") + sys.exit(1) + + +def validate_directory(directory: str) -> None: + """Validate all EPC files in a directory.""" + print(f"\n{'=' * 70}") + print(f"Validating all EPC files in: {directory}") + print(f"{'=' * 70}\n") + + epc_files = list(Path(directory).glob("**/*.epc")) + + if not epc_files: + print(f"No EPC files found in {directory}") + return + + print(f"Found {len(epc_files)} EPC file(s)\n") + + results = {} + for epc_file in epc_files: + print(f"Validating {epc_file.name}...", end=" ") + result = validate_epc_file(str(epc_file)) + + if result.is_valid: + print("✓ PASSED") + else: + print("✗ FAILED") + for error in result.errors[:3]: # Show first 3 errors + print(f" - {error}") + if len(result.errors) > 3: + print(f" ... 
and {len(result.errors) - 3} more errors") + + results[epc_file.name] = result + + # Summary + print(f"\n{'=' * 70}") + print("SUMMARY") + print(f"{'=' * 70}") + passed = sum(1 for r in results.values() if r.is_valid) + failed = len(results) - passed + print(f"Total files: {len(results)}") + print(f"Passed: {passed}") + print(f"Failed: {failed}") + + +def main(): + """Main entry point.""" + if len(sys.argv) < 2: + print("Usage:") + print(f" {sys.argv[0]} # Validate a single file") + print(f" {sys.argv[0]} # Validate all EPC files in directory") + sys.exit(1) + + path = sys.argv[1] + + if Path(path).is_file(): + validate_single_file(path) + elif Path(path).is_dir(): + validate_directory(path) + else: + print(f"Error: '{path}' is neither a file nor a directory") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/energyml-utils/example/main_test_numpy_export.py b/energyml-utils/example/main_test_numpy_export.py new file mode 100644 index 0000000..def7d64 --- /dev/null +++ b/energyml-utils/example/main_test_numpy_export.py @@ -0,0 +1,219 @@ +# Copyright (c) 2023-2024 Geosiris. +# SPDX-License-Identifier: Apache-2.0 +""" +Example: export NumpyMultiMesh objects from an EPC file to all supported formats. 
+ +Demonstrates: + - Reading meshes via read_numpy_mesh_object (NumpyMultiMesh) + - Building RepresentationContext per object for colour metadata + - Exporting to OBJ (+.mtl), GeoJSON, VTK Legacy ASCII, VTK Legacy Binary, + VTK XML UnstructuredGrid (.vtu), VTK XML PolyData (.vtp), STL + - Two passes: with and without CRS displacement + +Usage:: + + # from the workspace root + poetry run python example/main_test_numpy_export.py [epc_path] [output_dir] + + # defaults (uses bundled test EPC files when no args are given) + poetry run python example/main_test_numpy_export.py +""" + +import datetime +import logging +import os +import re +import sys +import traceback +from pathlib import Path +from typing import Dict, Optional + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)-7s %(message)s", + stream=sys.stdout, +) +log = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Lazy import guards — pyvista is strictly optional +# --------------------------------------------------------------------------- +try: + from energyml.utils.data.mesh_numpy import read_numpy_mesh_object + from energyml.utils.data.representation_context import RepresentationContext + from energyml.utils.data.export import ( + ExportFormat, + VTKExportOptions, + VTKFormat, + STLExportOptions, + GeoJSONExportOptions, + export_mesh, + ) + from energyml.utils.epc_stream import EpcStreamReader + from energyml.utils.epc import Epc + from energyml.utils.exception import NotSupportedError + from energyml.utils.introspection import get_obj_uuid +except ImportError as exc: + log.error("Could not import energyml-utils modules: %s", exc) + sys.exit(1) + + +# --------------------------------------------------------------------------- +# Core export routine +# --------------------------------------------------------------------------- + + +def export_all_numpy( + epc_path: str, + output_dir: str, + regex_type_filter: Optional[str] = None, + 
use_crs_displacement: bool = True, +) -> None: + """Read every Representation in *epc_path* via the numpy pipeline and + export it to all supported formats. + + :param epc_path: Path to the ``.epc`` file. + :param output_dir: Directory where output files are written (created if absent). + :param regex_type_filter: Optional regex; only objects whose type name matches + are exported (case-insensitive). + :param use_crs_displacement: When True, CRS origin/axis offsets are applied to + the exported coordinates. Two passes are run by the top-level script: one + with True and one with False. + """ + tag = "crs" if use_crs_displacement else "nocrs" + # storage = EpcStreamReader(epc_path, keep_open=True) + storage = Epc.read_file(epc_path) + dt = datetime.datetime.now().strftime("%Hh%M_%d-%m-%Y") + + not_supported_types: set = set() + exported_count = 0 + + for mdata in storage.list_objects(): + if "Representation" not in mdata.object_type: + continue + if regex_type_filter and not re.search(regex_type_filter, mdata.object_type, flags=re.IGNORECASE): + continue + + log.info("Processing %s (%s)", mdata.object_type, mdata.uuid) + energyml_obj = storage.get_object_by_uuid(mdata.uuid)[0] + + try: + # ---- 1. Read as NumpyMultiMesh -------------------------------- + multi_mesh = read_numpy_mesh_object( + energyml_object=energyml_obj, + workspace=storage, + # Read with displacement=False so the exporter controls it. + use_crs_displacement=False, + ) + + if multi_mesh is None or multi_mesh.patch_count() == 0: + log.info(" → no patches, skipping.") + continue + + # ---- 2. 
Build RepresentationContext for colour metadata -------- + ctx = RepresentationContext(energyml_obj, storage) + source_uuid = get_obj_uuid(energyml_obj) + contexts: Dict[str, RepresentationContext] = {source_uuid: ctx} + + # Also index children by their source_uuid for colour lookup + for patch in multi_mesh.flat_patches(): + patch_uuid = patch.source_uuid + if patch_uuid and patch_uuid not in contexts: + patch_obj = storage.get_object_by_uuid(patch_uuid) + if patch_obj: + contexts[patch_uuid] = RepresentationContext(patch_obj[0], storage) + + # ---- 3. Prepare output directory / base filename --------------- + os.makedirs(output_dir, exist_ok=True) + stem = f"{dt}-{mdata.object_type}_{mdata.uuid}_{tag}" + base = Path(output_dir) / stem + + # ---- 4. Export to every format --------------------------------- + formats_to_export = [ + (f"{base}.obj", ExportFormat.OBJ, None), + (f"{base}.geojson", ExportFormat.GEOJSON, GeoJSONExportOptions(indent=None)), + (f"{base}.vtk", ExportFormat.VTK, VTKExportOptions(vtk_format=VTKFormat.LEGACY_ASCII)), + (f"{base}_binary.vtk", ExportFormat.VTK, VTKExportOptions(vtk_format=VTKFormat.LEGACY_BINARY)), + (f"{base}.vtu", ExportFormat.VTU, VTKExportOptions(vtk_format=VTKFormat.VTU)), + (f"{base}.vtp", ExportFormat.VTP, VTKExportOptions(vtk_format=VTKFormat.VTP)), + (f"{base}_binary.stl", ExportFormat.STL, STLExportOptions(binary=True)), + (f"{base}_ascii.stl", ExportFormat.STL, STLExportOptions(binary=False)), + ] + + for path_str, fmt, opts in formats_to_export: + try: + export_mesh( + mesh_list=multi_mesh, + output_path=path_str, + format=fmt, + options=opts, + contexts=contexts, + use_crs_displacement=use_crs_displacement, + ) + log.info(" ✓ %s", Path(path_str).name) + except Exception: # noqa: BLE001 + log.warning(" ✗ %s — export failed:", Path(path_str).name) + traceback.print_exc() + + exported_count += 1 + + except NotSupportedError as e: + not_supported_types.add(mdata.object_type) + log.debug(" Not supported: %s", e) + except 
Exception: + traceback.print_exc() + + log.info("") + log.info("Done. Exported %d objects -> %s", exported_count, output_dir) + if not_supported_types: + log.info("Unsupported representation types skipped:") + for t in sorted(not_supported_types): + log.info(" - %s", t) + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + # Allow: main_test_numpy_export.py [epc_path] [output_dir] + args = sys.argv[1:] + + if len(args) >= 1: + epc_file = args[0] + else: + # Fall back to the bundled test EPC in the workspace + candidates = [ + "rc/epc/testingPackageCpp22.epc", + "rc/epc/testingPackageCpp.epc", + ] + epc_file = next((p for p in candidates if Path(p).exists()), None) + if epc_file is None: + log.error( + "No EPC file found. Pass a path as the first argument or place a " + ".epc file at rc/epc/testingPackageCpp22.epc" + ) + sys.exit(1) + + base_output = args[1] if len(args) >= 2 else "exported_meshes/numpy_export" + + log.info("=" * 60) + log.info("EPC : %s", epc_file) + log.info("OUT : %s", base_output) + log.info("=" * 60) + + # Pass 1 — with CRS displacement + log.info("\n--- Pass 1: use_crs_displacement=True ---\n") + export_all_numpy( + epc_path=epc_file, + output_dir=f"{base_output}/with_crs", + use_crs_displacement=True, + ) + + # Pass 2 — raw coordinates (no CRS displacement) + log.info("\n--- Pass 2: use_crs_displacement=False ---\n") + export_all_numpy( + epc_path=epc_file, + output_dir=f"{base_output}/no_crs", + use_crs_displacement=False, + ) diff --git a/energyml-utils/example/tools.py b/energyml-utils/example/tools.py index 20dfe69..938a058 100644 --- a/energyml-utils/example/tools.py +++ b/energyml-utils/example/tools.py @@ -4,6 +4,7 @@ import json import os import pathlib +import traceback from typing import Optional, List, Dict, Any import sys from pathlib import Path @@ -12,7 +13,7 @@ src_path = 
Path(__file__).parent.parent / "src" sys.path.insert(0, str(src_path)) -from energyml.utils.validation import validate_epc +from energyml.utils.validation import ErrorType, validate_epc from energyml.utils.constants import get_property_kind_dict_path_as_xml from energyml.utils.data.datasets_io import CSVFileReader, HDF5FileWriter, ParquetFileWriter, DATFileReader @@ -20,6 +21,7 @@ from energyml.utils.epc import Epc, gen_energyml_object_path from energyml.utils.introspection import ( get_class_from_simple_name, + get_enum_values, get_module_name_and_type_from_content_or_qualified_type, random_value_from_class, search_class_in_module_from_partial_name, @@ -548,6 +550,26 @@ def validate_files(): parser = argparse.ArgumentParser() # parser.add_argument("--folder", type=str, help="Input folder") parser.add_argument("--file", "-f", type=str, help="Input file (json or xml or epc)") + parser.add_argument( + "--ignore-err-type", + "-i", + type=str, + help=f"Error types to ignore. Possible values {get_enum_values(ErrorType)}", + nargs="*", + ) + + parser.add_argument( + "--ignore-prodml-version-errs", + action="store_false", + dest="ignore_prodml_version_errs", + help="Disable ignoring errors related to Prodml version (by default, these errors are ignored)", + ) + + parser.add_argument( + "--group-by-err-class", + action="store_true", + help="Group errors by their class (e.g. 
all validation errors together, all parsing errors together, etc.)", + ) args = parser.parse_args() @@ -615,14 +637,36 @@ def validate_files(): else: print(f"File {filename} is NOT a valid EnergyML EPC file: Empty EPC") except Exception as e: + traceback.print_exc() print(f"File {filename} is NOT a valid EnergyML EPC file: {e}") epc = Epc() epc.energyml_objects = objects - err_json = [err.toJson() for err in validate_epc(epc)] + err_json = [ + err.toJson() + for err in validate_epc(epc) + if str(err.error_type).lower() not in (et.lower() for et in (args.ignore_err_type or [])) + ] - print(json.dumps(err_json, indent=4)) + err_json_sorted = sorted( + err_json, key=lambda x: (x["err_class"], x["error_type"], x["object_uuid"] if "object_uuid" in x else "") + ) + + if args.ignore_prodml_version_errs: + err_json_sorted = [err for err in err_json_sorted if not ("prodml23" in err.get("msg", ""))] + + if args.group_by_err_class: + err_json_grouped = {} + for err in err_json_sorted: + err_class = err.get("err_class", "UnknownErrorClass") + if err_class not in err_json_grouped: + err_json_grouped[err_class] = [] + err_json_grouped[err_class].append(err) + print(json.dumps(err_json_grouped, indent=4)) + else: + # print(json.dumps(err_json, indent=4)) + print(json.dumps(err_json_sorted, indent=4)) # def export_wavefront(): diff --git a/energyml-utils/pyproject.toml b/energyml-utils/pyproject.toml index 4ce977f..e3f4825 100644 --- a/energyml-utils/pyproject.toml +++ b/energyml-utils/pyproject.toml @@ -60,18 +60,21 @@ python_functions = [ "test_*" ] [tool.poetry.extras] parquet = ["pyarrow", "numpy", "pandas"] hdf5 = ["h5py"] +las = ["lasio"] +segy = ["segyio"] [tool.poetry.dependencies] python = "^3.9" xsdata = {version = "^24.0", extras = ["cli", "lxml"]} energyml-opc = "^1.12.0" -h5py = { version = "^3.7.0", optional = false } -pyarrow = { version = "^14.0.1", optional = false } +h5py = { version = "^3.11.0", optional = false } numpy = { version = "^1.16.6", optional = false 
} -flake8 = "^7.3.0" +pyarrow = { version = "^14.0.1", optional = true } +pandas = { version = "^1.1.0", optional = true } +lasio = { version = "^0.31", optional = true } +segyio = { version = "^1.9", optional = true } [tool.poetry.group.dev.dependencies] -pandas = { version = "^1.1.0", optional = false } coverage = {extras = ["toml"], version = "^6.2"} pytest = "^8.1.1" pytest-cov = "^4.1.0" @@ -80,7 +83,8 @@ black = "^22.3.0" pylint = "^2.7.2" click = ">=8.1.3, <=8.1.3" # upper version than 8.0.2 fail with black pdoc3 = "^0.10.0" -pydantic = { version = "^2.0", optional = true } +snakeviz = "^2.1.0" # code perf tests +pydantic = { version = "^2.0"} energyml-common2-0 = "^1.12.0" energyml-common2-1 = "^1.12.0" energyml-common2-2 = "^1.12.0" diff --git a/energyml-utils/rc/epc/README.md b/energyml-utils/rc/epc/README.md new file mode 100644 index 0000000..1411d95 --- /dev/null +++ b/energyml-utils/rc/epc/README.md @@ -0,0 +1 @@ +TestingPackage epc + h5 files come from the FESAPI library: https://github.com/F2I-Consulting/fesapi \ No newline at end of file diff --git a/energyml-utils/rc/epc/testingPackageCpp.epc b/energyml-utils/rc/epc/testingPackageCpp.epc new file mode 100644 index 0000000..0987e95 Binary files /dev/null and b/energyml-utils/rc/epc/testingPackageCpp.epc differ diff --git a/energyml-utils/rc/epc/testingPackageCpp.h5 b/energyml-utils/rc/epc/testingPackageCpp.h5 index 21035b0..996966d 100644 Binary files a/energyml-utils/rc/epc/testingPackageCpp.h5 and b/energyml-utils/rc/epc/testingPackageCpp.h5 differ diff --git a/energyml-utils/rc/epc/testingPackageCpp22.epc b/energyml-utils/rc/epc/testingPackageCpp22.epc new file mode 100644 index 0000000..855625a Binary files /dev/null and b/energyml-utils/rc/epc/testingPackageCpp22.epc differ diff --git a/energyml-utils/rc/epc/testingPackageCpp22.h5 b/energyml-utils/rc/epc/testingPackageCpp22.h5 new file mode 100644 index 0000000..2cbd0be Binary files /dev/null and b/energyml-utils/rc/epc/testingPackageCpp22.h5 differ diff 
--git a/energyml-utils/src/energyml/utils/constants.py b/energyml-utils/src/energyml/utils/constants.py index 5735660..4c9b3d2 100644 --- a/energyml-utils/src/energyml/utils/constants.py +++ b/energyml-utils/src/energyml/utils/constants.py @@ -49,7 +49,7 @@ ENERGYML_MODULES_NAMES = ["eml", "prodml", "witsml", "resqml"] -RELATED_MODULES = [ +_RELATED_MODULES = [ ["energyml.eml.v2_0.commonv2", "energyml.resqml.v2_0_1.resqmlv2"], [ "energyml.eml.v2_1.commonv2", @@ -65,6 +65,11 @@ ], ] +RELATED_MODULES_MAP = {} +for group in _RELATED_MODULES: + for module in group: + RELATED_MODULES_MAP[module] = group + # =================================== # REGEX PATTERN STRINGS (for reference) # =================================== @@ -113,7 +118,7 @@ RGX_XML_HEADER = r"^\s*<\?xml(\s+(encoding\s*=\s*\"(?P[^\"]+)\"|version\s*=\s*\"(?P[^\"]+)\"|standalone\s*=\s*\"(?P[^\"]+)\"))+" -RGX_IDENTIFIER = rf"{RGX_UUID}(.(?P\w+)?)?" +RGX_IDENTIFIER = rf"{RGX_UUID}.((?P\w+)?)?" # URI regex components URI_RGX_GRP_DOMAIN = "domain" @@ -208,10 +213,13 @@ class OptimizedRegex: # CONSTANTS AND ENUMS # =================================== +# TODO: RELS_CONTENT_TYPE may be incorrect or not well named, needs review RELS_CONTENT_TYPE = "application/vnd.openxmlformats-package.core-properties+xml" RELS_FOLDER_NAME = "_rels" +CORE_PROPERTIES_FOLDER_NAME = "docProps" -primitives = (bool, str, int, float, type(None)) +# primitives = (bool, str, int, float, type(None)) +primitives = {bool, str, int, float, bytes, type(None)} class MimeType(Enum): @@ -222,6 +230,22 @@ class MimeType(Enum): PARQUET = "application/x-parquet" PDF = "application/pdf" RELS = "application/vnd.openxmlformats-package.relationships+xml" + CORE_PROPERTIES = "application/vnd.openxmlformats-package.core-properties+xml" + EXTENDED_CORE_PROPERTIES = "application/x-extended-core-properties+xml" + JPEG = "image/jpeg" + PNG = "image/png" + TIFF = "image/tiff" + GIF = "image/gif" + SVG = "image/svg+xml" + DOC = "application/msword" + DOCX = 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" + XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + XML = "application/xml" + JSON = "application/json" + TXT = "text/plain" + MARKDOWN = "text/markdown" + HTML = "text/html" + ZIP = "application/zip" def __str__(self): return self.value @@ -237,17 +261,26 @@ class EpcExportVersion(Enum): class EPCRelsRelationshipType(Enum): """EPC relationships types with proper URL generation""" - # Standard relationship types DESTINATION_OBJECT = "destinationObject" + """The object in Target is the destination of the relationship.""" SOURCE_OBJECT = "sourceObject" + """The current object is the source in the relationship with the target object.""" ML_TO_EXTERNAL_PART_PROXY = "mlToExternalPartProxy" + """The target object is a proxy object for an external file.""" EXTERNAL_PART_PROXY_TO_ML = "externalPartProxyToMl" + """The current object is used as a proxy object by the target object.""" EXTERNAL_RESOURCE = "externalResource" + """The target is a resource outside of the EPC package. Note that TargetMode should be "External" for this relationship.""" DestinationMedia = "destinationMedia" + """The object in Target is a media representation for the current object. 
@dataclass
class RawFile:
    """Holder for a non-energyml file that is stored inside an EPC file.

    Attributes:
        path: Path of the file; defaults to ``"_"``.
        content: Byte stream with the file content, or ``None`` (the default).
    """

    path: str = "_"
    content: Optional[BytesIO] = None
def file_extension_to_mime_type(extension: str) -> Optional[str]:
    """
    Convert file extension to MIME type using the MimeType enum.

    Several ``MimeType`` members share one extension in
    ``MIME_TYPE_TO_EXTENSION`` (``CORE_PROPERTIES``,
    ``EXTENDED_CORE_PROPERTIES`` and ``XML`` all map to ``"xml"``).  When that
    happens, the member *named after the extension* (e.g. ``MimeType.XML``)
    is preferred, so that a plain ``.xml`` file is reported as
    ``application/xml`` rather than as the OPC core-properties content type.

    Args:
        extension: File extension with or without leading dot (case-insensitive)

    Returns:
        MIME type string, or None if not found

    Examples:
        >>> file_extension_to_mime_type("csv")
        'text/csv'
        >>> file_extension_to_mime_type(".json")
        'application/json'
        >>> file_extension_to_mime_type("xml")
        'application/xml'
    """
    if not extension:
        return None

    # Remove leading dot(s) if present, then normalise case and aliases
    ext_lower = extension.lstrip(".").lower()
    ext_normalized = EXTENSION_ALIASES.get(ext_lower, ext_lower)

    # Prefer the generic member whose name is the extension itself; this
    # resolves the ambiguity between members that share an extension.
    generic = MimeType.__members__.get(ext_normalized.upper())
    if generic is not None and MIME_TYPE_TO_EXTENSION.get(generic) == ext_normalized:
        return generic.value

    # Otherwise fall back to the first mapping entry with this extension
    # (covers e.g. "h5" -> HDF5, "jpg" -> JPEG, "md" -> MARKDOWN).
    for mime_enum, ext in MIME_TYPE_TO_EXTENSION.items():
        if ext == ext_normalized:
            return mime_enum.value

    return None
def date_to_datetime(date: str) -> datetime.datetime:
    """Convert an energyml date string to a ``datetime`` object.

    Args:
        date: Date string in ISO 8601 format.  A trailing ``"Z"`` (UTC) is
            accepted.

    Returns:
        The parsed ``datetime``; it is timezone-aware when *date* carries an
        offset or a trailing ``"Z"``, naive otherwise.

    Raises:
        ValueError: If *date* is not a string or cannot be parsed.  The
            original error is chained as ``__cause__``.
    """
    try:
        # Python 3.10 doesn't support the 'Z' suffix in fromisoformat():
        # replace only the trailing 'Z' with '+00:00' for compatibility.
        date_normalized = date[:-1] + "+00:00" if date.endswith("Z") else date
        return datetime.datetime.fromisoformat(date_normalized)
    except (ValueError, TypeError, AttributeError) as exc:
        # AttributeError covers non-string input (e.g. None has no .endswith)
        raise ValueError(f"Invalid date format: {date}") from exc
def path_parent_attribute(dot_path: str) -> Optional[str]:
    """Return the dot path of the parent of the attribute designated by *dot_path*.

    The path is split with :func:`path_iter`, the last segment is dropped and
    the remaining segments are joined back with ``"."``.

    Returns:
        ``None`` when *dot_path* is empty or ``None``; otherwise the joined
        parent path (an empty string when there is no segment left).
    """
    if not dot_path:
        return None
    segments = path_iter(dot_path)
    parent_segments = segments[:-1]
    return ".".join(parent_segments)
@dataclass
class CrsInfo:
    """
    Version-neutral DTO holding all extractable CRS metadata.

    All fields are optional / defaulted so that a ``CrsInfo`` can be returned
    even when only partial information could be retrieved (e.g. when
    ``workspace`` is ``None`` for a v2.2 compound CRS).
    """

    # ------------------------------------------------------------------
    # Origin offsets (local → project translation)
    # ------------------------------------------------------------------
    x_offset: float = 0.0
    """X translation of the local origin in the projected CRS units."""

    y_offset: float = 0.0
    """Y translation of the local origin in the projected CRS units."""

    z_offset: float = 0.0
    """Z translation of the local origin in the vertical CRS units."""

    # ------------------------------------------------------------------
    # Horizontal / projected CRS
    # ------------------------------------------------------------------
    projected_epsg_code: Optional[int] = None
    """EPSG code of the projected horizontal CRS, if any."""

    projected_uom: Optional[str] = None
    """Unit of measure for XY coordinates (e.g. ``"m"``, ``"ft"``)."""

    projected_axis_order: Optional[str] = None
    """Axis order of the projected CRS (e.g. ``"easting northing"``)."""

    projected_wkt: Optional[str] = None
    """Well-Known Text representation of the projected CRS, if provided."""

    projected_unknown: Optional[str] = None
    """Free-text CRS descriptor when no authority code / WKT is available."""

    # ------------------------------------------------------------------
    # Vertical CRS
    # ------------------------------------------------------------------
    vertical_epsg_code: Optional[int] = None
    """EPSG code of the vertical CRS, if any."""

    vertical_uom: Optional[str] = None
    """Unit of measure for Z coordinates (e.g. ``"m"``, ``"ft"``, ``"s"``)."""

    z_increasing_downward: bool = False
    """
    ``True`` when the Z axis increases *downward* (i.e. depth convention).
    ``False`` means Z increases *upward* (elevation convention).
    """

    vertical_wkt: Optional[str] = None
    """Well-Known Text representation of the vertical CRS, if provided."""

    vertical_unknown: Optional[str] = None
    """Free-text vertical CRS descriptor."""

    # ------------------------------------------------------------------
    # Rotation / azimuth
    # ------------------------------------------------------------------
    areal_rotation_value: float = 0.0
    """
    Rotation angle of the local grid relative to the projected CRS.
    Corresponds to ``ArealRotation`` (v2.0.1) or ``Azimuth`` (v2.2).
    """

    areal_rotation_uom: str = "rad"
    """
    Unit of the rotation angle: ``"rad"`` or a degree symbol
    (``"degr"``, ``"dega"``, ``"deg"``; matched case-insensitively by
    :meth:`areal_rotation_rad`).
    """

    azimuth_reference: Optional[str] = None
    """
    (v2.2 only) Reference for the azimuth, e.g. ``"true north"``,
    ``"grid north"``, ``"magnetic north"`` (from ``NorthReferenceKind``).
    """

    # ------------------------------------------------------------------
    # Traceability
    # ------------------------------------------------------------------
    source_type: Optional[str] = None
    """
    Simple type name of the energyml object this info was extracted from.
    Useful for debugging and logging.
    """

    # ------------------------------------------------------------------
    # Convenience helpers
    # ------------------------------------------------------------------

    def areal_rotation_rad(self) -> float:
        """
        Return ``areal_rotation_value`` converted to **radians**.

        The value is converted from degrees when ``areal_rotation_uom`` is one
        of ``"degr"``, ``"dega"`` (the Energistics symbol for degree of angle)
        or ``"deg"``, compared case-insensitively so that an enum *name* such
        as ``"DEGA"`` is recognised too.  Any other unit is returned
        unchanged, i.e. assumed to already be radians.
        """
        uom = str(self.areal_rotation_uom).strip().lower()
        if uom in ("degr", "dega", "deg"):
            return math.radians(self.areal_rotation_value)
        return self.areal_rotation_value

    def as_transform_args(self) -> dict:
        """
        Return a kwargs dict ready to be unpacked into
        :func:`energyml.utils.data.helper.apply_crs_transform`.

        ``z_is_up=True`` tells ``apply_crs_transform`` to negate Z (converting
        from RESQML's depth-positive / z-down convention to the z-up convention
        used by most 3-D viewers). This negation is required when the CRS stores
        depth as positive Z (``z_increasing_downward=True``), which is why the
        ``z_is_up`` entry carries the value of ``z_increasing_downward``.

        The rotation is passed through as stored (``areal_rotation_value`` with
        its ``areal_rotation_uom``); no unit conversion is done here.
        """
        return {
            "x_offset": self.x_offset,
            "y_offset": self.y_offset,
            "z_offset": self.z_offset,
            "areal_rotation": self.areal_rotation_value,
            "rotation_uom": self.areal_rotation_uom,
            "z_is_up": self.z_increasing_downward,
        }
+ """ + result: dict = {"epsg_code": None, "wkt": None, "unknown": None} + if abstract_projected_crs is None: + return result + + type_name = type(abstract_projected_crs).__name__.lower() + + if "epsg" in type_name: + result["epsg_code"] = getattr(abstract_projected_crs, "epsg_code", None) + elif "wkt" in type_name: + result["wkt"] = getattr(abstract_projected_crs, "well_known_text", None) + elif "unknown" in type_name: + result["unknown"] = getattr(abstract_projected_crs, "unknown", None) + + # Fallback: generic attribute search + if result["epsg_code"] is None: + result["epsg_code"] = get_object_attribute_rgx(abstract_projected_crs, "[Ee]psg[_]?[Cc]ode") + + return result + + +def _extract_projected_crs_details(projected_crs_obj: Any) -> dict: + """ + Extract details from a ``ProjectedCrs`` (v2.2 EML) or from an + ``AbstractProjectedCrs`` inline object (v2.0.1). + + Returns a dict with keys: ``epsg_code``, ``wkt``, ``unknown``, ``uom``, + ``axis_order``. + """ + result: dict = { + "epsg_code": None, + "wkt": None, + "unknown": None, + "uom": None, + "axis_order": None, + } + if projected_crs_obj is None: + return result + + # UOM — may be an XML attribute on ProjectedCrs (v2.2 only; absent on v2.0.1 abstract subtypes) + result["uom"] = _uom_to_str(getattr(projected_crs_obj, "uom", None)) + + # Axis order (v2.2 only) + axis_order_raw = getattr(projected_crs_obj, "axis_order", None) + if axis_order_raw is not None: + ao = str(axis_order_raw) + if "." 
in ao: + ao = ao.split(".")[-1] + result["axis_order"] = ao.replace("_", " ").lower() + + # EPSG from direct attribute + epsg = getattr(projected_crs_obj, "epsg_code", None) + if epsg is not None: + result["epsg_code"] = epsg + return result + + # Navigate into AbstractProjectedCrs choice (v2.2 encapsulation pattern) + abstract_crs = getattr(projected_crs_obj, "abstract_projected_crs", None) + if abstract_crs is not None: + details = _extract_abstract_projected_crs(abstract_crs) + result.update({k: v for k, v in details.items() if v is not None}) + + return result + + +def _extract_abstract_vertical_crs(abstract_vertical_crs: Any) -> dict: + """ + Extract details from an ``AbstractVerticalCrs`` concrete instance. + + Returns a dict with keys: ``epsg_code``, ``wkt``, ``unknown``. + """ + result: dict = {"epsg_code": None, "wkt": None, "unknown": None} + if abstract_vertical_crs is None: + return result + + type_name = type(abstract_vertical_crs).__name__.lower() + + if "epsg" in type_name: + result["epsg_code"] = getattr(abstract_vertical_crs, "epsg_code", None) + elif "wkt" in type_name: + result["wkt"] = getattr(abstract_vertical_crs, "well_known_text", None) + elif "unknown" in type_name: + result["unknown"] = getattr(abstract_vertical_crs, "unknown", None) + + if result["epsg_code"] is None: + result["epsg_code"] = get_object_attribute_rgx(abstract_vertical_crs, "[Ee]psg[_]?[Cc]ode") + + return result + + +def _extract_vertical_crs_details(vertical_crs_obj: Any) -> dict: + """ + Extract details from a ``VerticalCrs`` (v2.2 EML) or from an + ``AbstractVerticalCrs`` inline object (v2.0.1). + + Returns a dict with keys: ``epsg_code``, ``wkt``, ``unknown``, ``uom``, + ``z_increasing_downward``. + + ``z_increasing_downward`` is ``None`` when the sub-object carries no + explicit direction information (e.g. ``VerticalUnknownCrs``). Callers + **must not** override a parent-level ``ZIncreasingDownward`` when this + value is ``None``. 
+ """ + logging.debug( + f"Extracting vertical CRS details from object of type {type(vertical_crs_obj).__name__} with URI {get_obj_uri(vertical_crs_obj)}" + ) + result: dict = { + "epsg_code": None, + "wkt": None, + "unknown": None, + "uom": None, + "z_increasing_downward": None, # None = not explicitly set by this sub-object + } + if vertical_crs_obj is None: + return result + + # UOM (field exists on VerticalCrs v2.2; absent on v2.0.1 abstract subtypes) + result["uom"] = _uom_to_str(getattr(vertical_crs_obj, "uom", None)) + + # Direction (VerticalCrs v2.2 has a top-level direction field) + direction = getattr(vertical_crs_obj, "direction", None) + if direction is not None: + d = str(direction) + if "." in d: + d = d.split(".")[-1] + result["z_increasing_downward"] = d.lower() == "down" + + # EPSG from direct attribute + epsg = getattr(vertical_crs_obj, "epsg_code", None) + if epsg is not None: + result["epsg_code"] = epsg + return result + + # Navigate into AbstractVerticalCrs choice + abstract_crs = getattr(vertical_crs_obj, "abstract_vertical_crs", None) + if abstract_crs is not None: + details = _extract_abstract_vertical_crs(abstract_crs) + result.update({k: v for k, v in details.items() if v is not None}) + + return result + + +def _extract_rotation(crs_obj: Any) -> tuple[float, str]: + """ + Extract the areal rotation / azimuth (value, uom) from *any* CRS object. + + Handles both v2.0.1 ``ArealRotation.value/uom`` and v2.2 + ``Azimuth.value/uom`` styles. + + Returns ``(0.0, "rad")`` if no rotation field is found. 
def _from_abstract_local3dcrs(
    crs_obj: Any,
    workspace: Optional[EnergymlStorageInterface] = None,
) -> CrsInfo:
    """
    Handle ``AbstractLocal3dCrs`` and its concrete subclasses
    (``ObjLocalDepth3DCrs``, ``ObjLocalTime3DCrs``) — **RESQML v2.0.1**.

    Although the RESQML v2.0.1 schema embeds most data inline, the
    ``ProjectedCrs`` and ``VerticalCrs`` child elements can be
    ``DataObjectReference`` values.  *workspace* is used to resolve those
    DORs when provided.

    Values found on the resolved projected / vertical CRS (UOM, axis order,
    explicit Z direction) take precedence over the ones read on *crs_obj*.
    """
    type_name = type(crs_obj).__name__
    # All diagnostics go through the module logger with lazy %-formatting;
    # the URI is only computed when DEBUG is actually enabled.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(
            "@_from_abstract_local3dcrs Extracting CRS info from %s with URI %s",
            type_name,
            get_obj_uri(crs_obj),
        )

    # --- Offsets -----------------------------------------------------------
    x_offset = 0.0
    y_offset = 0.0
    z_offset = 0.0
    try:
        _x = get_object_attribute_no_verif(crs_obj, "xoffset")
        _y = get_object_attribute_no_verif(crs_obj, "yoffset")
        _z = get_object_attribute_no_verif(crs_obj, "zoffset")
        x_offset = float(_x) if _x is not None else 0.0
        y_offset = float(_y) if _y is not None else 0.0
        z_offset = float(_z) if _z is not None else 0.0
    except (ValueError, TypeError, AttributeError) as exc:
        # Best effort: offsets converted before the failure are kept
        logger.debug("v2.0.1 offset read error: %s", exc)

    # --- Rotation ----------------------------------------------------------
    areal_rotation_value, areal_rotation_uom = _extract_rotation(crs_obj)

    # --- Z direction -------------------------------------------------------
    z_increasing_downward: bool = False
    zid_raw = get_object_attribute_no_verif(crs_obj, "zincreasing_downward")
    logger.debug("v2.0.1 ZIncreasingDownward raw value: %s", zid_raw)
    if zid_raw is not None:
        if isinstance(zid_raw, bool):
            z_increasing_downward = zid_raw
        else:
            # Non-bool values are interpreted from their textual form
            z_increasing_downward = str(zid_raw).lower() in ("true", "1", "yes")

    # --- Projected UOM -----------------------------------------------------
    projected_uom: Optional[str] = _uom_to_str(get_object_attribute_no_verif(crs_obj, "projected_uom"))

    # --- Vertical UOM (length or time) ------------------------------------
    vertical_uom: Optional[str] = _uom_to_str(get_object_attribute_no_verif(crs_obj, "vertical_uom"))
    if vertical_uom is None:
        # time_uom only present on LocalTime3dCrs
        vertical_uom = _uom_to_str(getattr(crs_obj, "time_uom", None))

    # --- Axis order --------------------------------------------------------
    axis_order_raw = get_object_attribute_no_verif(crs_obj, "projected_axis_order")
    projected_axis_order: Optional[str] = None
    if axis_order_raw is not None:
        ao = str(axis_order_raw)
        if "." in ao:
            ao = ao.split(".")[-1]
        projected_axis_order = ao.replace("_", " ").lower()

    # --- Projected CRS -----------------------------------------------------
    projected_crs_obj = _resolve_dor(get_object_attribute_no_verif(crs_obj, "projected_crs"), workspace)
    projected_details = _extract_projected_crs_details(projected_crs_obj)

    # Projected UOM from inline ProjectedCrs takes precedence if present
    if projected_details.get("uom"):
        projected_uom = projected_details["uom"]
    if projected_details.get("axis_order"):
        projected_axis_order = projected_details["axis_order"]

    # --- Vertical CRS ------------------------------------------------------
    vertical_crs_obj = _resolve_dor(get_object_attribute_no_verif(crs_obj, "vertical_crs"), workspace)
    vertical_details = _extract_vertical_crs_details(vertical_crs_obj)

    # Direction from VerticalCrs overrides the top-level ZIncreasingDownward
    # only when explicitly set.
    logger.debug("z_increasing_downward before vertical CRS details: %s", z_increasing_downward)
    logger.debug(
        "Vertical CRS details: %s -- vertical_crs_obj type: %s",
        vertical_details,
        type(vertical_crs_obj).__name__ if vertical_crs_obj else "None",
    )
    if vertical_crs_obj is not None and vertical_details.get("z_increasing_downward") is not None:
        z_increasing_downward = vertical_details["z_increasing_downward"]
    if vertical_details.get("uom"):
        vertical_uom = vertical_details["uom"]

    logger.debug("z_increasing_downward after vertical CRS details: %s", z_increasing_downward)

    return CrsInfo(
        x_offset=x_offset,
        y_offset=y_offset,
        z_offset=z_offset,
        projected_epsg_code=projected_details.get("epsg_code"),
        projected_uom=projected_uom,
        projected_axis_order=projected_axis_order,
        projected_wkt=projected_details.get("wkt"),
        projected_unknown=projected_details.get("unknown"),
        vertical_epsg_code=vertical_details.get("epsg_code"),
        vertical_uom=vertical_uom,
        z_increasing_downward=z_increasing_downward,
        vertical_wkt=vertical_details.get("wkt"),
        vertical_unknown=vertical_details.get("unknown"),
        areal_rotation_value=areal_rotation_value,
        areal_rotation_uom=areal_rotation_uom,
        source_type=type_name,
    )
+ """ + type_name = type(crs_obj).__name__ + + # --- XY offsets -------------------------------------------------------- + x_offset = 0.0 + y_offset = 0.0 + try: + _x = get_object_attribute_no_verif(crs_obj, "origin_projected_coordinate1") + _y = get_object_attribute_no_verif(crs_obj, "origin_projected_coordinate2") + x_offset = float(_x) if _x is not None else 0.0 + y_offset = float(_y) if _y is not None else 0.0 + except (ValueError, TypeError, AttributeError) as exc: + logger.debug("LocalEngineering2dCrs offset read error: %s", exc) + + # --- Azimuth ----------------------------------------------------------- + areal_rotation_value, areal_rotation_uom = _extract_rotation(crs_obj) + + # --- Azimuth reference ------------------------------------------------- + azimuth_ref_raw = get_object_attribute_no_verif(crs_obj, "azimuth_reference") + azimuth_reference: Optional[str] = None + if azimuth_ref_raw is not None: + ar = str(azimuth_ref_raw) + if "." in ar: + ar = ar.split(".")[-1] + azimuth_reference = ar.replace("_", " ").lower() + + # --- Horizontal UOM (HorizontalAxes.projected_uom or uom on ProjectedCrs) --- + projected_uom: Optional[str] = _uom_to_str(get_object_attribute(crs_obj, "horizontal_axes.projected_uom")) + + # --- ProjectedCrs — may be an inline object OR a DOR ------------------ + projected_crs_raw = get_object_attribute_no_verif(crs_obj, "origin_projected_crs") + projected_crs_obj = _resolve_dor(projected_crs_raw, workspace) + projected_details = _extract_projected_crs_details(projected_crs_obj) + + if projected_details.get("uom") and projected_uom is None: + projected_uom = projected_details["uom"] + + return CrsInfo( + x_offset=x_offset, + y_offset=y_offset, + z_offset=0.0, # Z lives in the compound CRS + projected_epsg_code=projected_details.get("epsg_code"), + projected_uom=projected_uom, + projected_axis_order=projected_details.get("axis_order"), + projected_wkt=projected_details.get("wkt"), + projected_unknown=projected_details.get("unknown"), 
+ areal_rotation_value=areal_rotation_value, + areal_rotation_uom=areal_rotation_uom, + azimuth_reference=azimuth_reference, + source_type=type_name, + ) + + +def _from_vertical_crs(crs_obj: Any) -> CrsInfo: + """ + Handle a standalone ``VerticalCrs`` document object — **EML v2.3 / RESQML v2.2**. + + When the object carries no explicit direction (e.g. ``VerticalUnknownCrs``), + ``z_increasing_downward`` defaults to ``False``; the caller is responsible + for not blindly overriding a parent-level value in that case. + """ + type_name = type(crs_obj).__name__ + details = _extract_vertical_crs_details(crs_obj) + # Sentinel None means direction was not explicit — default to False for the standalone CrsInfo. + z_idc: bool = details["z_increasing_downward"] if details["z_increasing_downward"] is not None else False + return CrsInfo( + vertical_epsg_code=details.get("epsg_code"), + vertical_uom=details.get("uom"), + z_increasing_downward=z_idc, + vertical_wkt=details.get("wkt"), + vertical_unknown=details.get("unknown"), + source_type=type_name, + ) + + +def _from_local_engineering_compound_crs( + crs_obj: Any, + workspace: Optional[EnergymlStorageInterface], +) -> CrsInfo: + """ + Handle ``LocalEngineeringCompoundCrs`` — **EML v2.3 / RESQML v2.2**. + + Resolves: + - ``local_engineering2d_crs`` → DOR → ``LocalEngineering2dCrs`` + - ``vertical_crs`` → DOR (inherited from ``AbstractCompoundCrs``) → ``VerticalCrs`` + + When ``workspace`` is ``None``, only inline data (z offset, vertical axis + from the compound itself) can be populated. 
+ """ + type_name = type(crs_obj).__name__ + + # --- Z offset (origin_vertical_coordinate) -------------------------------- + z_offset = 0.0 + try: + _z = get_object_attribute_no_verif(crs_obj, "origin_vertical_coordinate") + z_offset = float(_z) if _z is not None else 0.0 + except (ValueError, TypeError, AttributeError) as exc: + logger.debug("LocalEngineeringCompoundCrs z-offset read error: %s", exc) + + # --- Vertical axis (inline — gives direction + uom without workspace) -- + vert_axis_direction: Optional[str] = None + vert_axis_uom: Optional[str] = None + vert_axis_uom_raw = get_object_attribute(crs_obj, "vertical_axis.uom") + if vert_axis_uom_raw is not None: + vert_axis_uom = _uom_to_str(vert_axis_uom_raw) + vert_axis_dir_raw = get_object_attribute(crs_obj, "vertical_axis.direction") + if vert_axis_dir_raw is not None: + d = str(vert_axis_dir_raw) + if "." in d: + d = d.split(".")[-1] + vert_axis_direction = d.lower() + + z_increasing_downward: bool = vert_axis_direction == "down" if vert_axis_direction else False + + # --- Resolve LocalEngineering2dCrs via DOR ---------------------------- + horiz_info: Optional[CrsInfo] = None + horiz_dor = get_object_attribute_no_verif(crs_obj, "local_engineering2d_crs") + if horiz_dor is not None and workspace is not None: + horiz_uuid = get_obj_uuid(horiz_dor) + if horiz_uuid: + candidates = workspace.get_object_by_uuid(horiz_uuid) + if candidates: + horiz_info = _from_local_engineering2d_crs(candidates[0], workspace) + if horiz_info is None: + horiz_uri = get_obj_uri(horiz_dor) + if horiz_uri: + horiz_obj = workspace.get_object(horiz_uri) + if horiz_obj is not None: + horiz_info = _from_local_engineering2d_crs(horiz_obj, workspace) + elif horiz_dor is not None: + logger.warning( + "LocalEngineeringCompoundCrs: workspace is None — cannot resolve " + "LocalEngineering2dCrs DOR; horizontal info (offsets, rotation) will be missing." 
+ ) + + # --- Resolve VerticalCrs via DOR (inherited AbstractCompoundCrs.vertical_crs) --- + vert_details_raw: Optional[dict] = None # raw dict, preserving None sentinel + vert_info: Optional[CrsInfo] = None + vert_dor = get_object_attribute_no_verif(crs_obj, "vertical_crs") + if vert_dor is not None and workspace is not None: + vert_uuid = get_obj_uuid(vert_dor) + if vert_uuid: + candidates = workspace.get_object_by_uuid(vert_uuid) + if candidates: + vert_details_raw = _extract_vertical_crs_details(candidates[0]) + vert_info = _from_vertical_crs(candidates[0]) + if vert_info is None: + vert_uri = get_obj_uri(vert_dor) + if vert_uri: + vert_obj = workspace.get_object(vert_uri) + if vert_obj is not None: + vert_details_raw = _extract_vertical_crs_details(vert_obj) + vert_info = _from_vertical_crs(vert_obj) + elif vert_dor is not None: + logger.warning( + "LocalEngineeringCompoundCrs: workspace is None — cannot resolve " + "VerticalCrs DOR; vertical info (EPSG, UOM) will be missing." + ) + + # --- Merge results ----------------------------------------------------- + return CrsInfo( + # XY offsets and rotation come from the 2D CRS + x_offset=horiz_info.x_offset if horiz_info else 0.0, + y_offset=horiz_info.y_offset if horiz_info else 0.0, + z_offset=z_offset, + projected_epsg_code=horiz_info.projected_epsg_code if horiz_info else None, + projected_uom=horiz_info.projected_uom if horiz_info else None, + projected_axis_order=horiz_info.projected_axis_order if horiz_info else None, + projected_wkt=horiz_info.projected_wkt if horiz_info else None, + projected_unknown=horiz_info.projected_unknown if horiz_info else None, + areal_rotation_value=horiz_info.areal_rotation_value if horiz_info else 0.0, + areal_rotation_uom=horiz_info.areal_rotation_uom if horiz_info else "rad", + azimuth_reference=horiz_info.azimuth_reference if horiz_info else None, + # Vertical info: prefer resolved VerticalCrs, but only override direction + # when the resolved CRS carries an explicit 
# ---------------------------------------------------------------------------
# Geometry helpers
# ---------------------------------------------------------------------------


# Lower-case, space-separated axis-order prefixes whose first axis is
# northing / latitude.  Kept as a tuple so it can be passed straight to
# ``str.startswith``.
_NORTHING_FIRST_PATTERNS = (
    "northing easting",
    "north east",
    "north easting",
    "northing east",
    "latitude longitude",
    "lat lon",
    "lat long",
)


def apply_axis_order_swap(
    points: np.ndarray,
    axis_order: Optional[str],
) -> np.ndarray:
    """
    Swap the X and Y columns of *points* when the axis order is northing-first.

    When ``axis_order`` starts with one of the northing-first / latitude-first
    prefixes, columns 0 and 1 of *points* are exchanged so that column 0 is
    easting and column 1 is northing afterwards (e.g. for a geographic CRS
    such as EPSG:4326, whose first axis is latitude).

    The string is normalised before matching (lower-cased, underscores turned
    into spaces, surrounding / repeated whitespace collapsed), so both
    ``"northing easting"`` and ``"Northing_Easting"`` trigger the swap.

    Parameters
    ----------
    points:
        2-D array with at least two columns (typically ``(N, 3)``),
        **modified in-place**.
    axis_order:
        Axis-order string, e.g. ``"northing easting"``.
        ``None`` means "no swap needed".

    Returns
    -------
    np.ndarray
        The same array object that was passed in.
    """
    if axis_order is None:
        return points

    # Defensive normalisation: callers are expected to pass the normalised
    # form, but an un-normalised value must not silently skip the swap.
    normalised = " ".join(axis_order.lower().replace("_", " ").split())

    if normalised.startswith(_NORTHING_FIRST_PATTERNS):
        # Advanced indexing on the right-hand side produces a copy, so both
        # columns can be exchanged safely in a single assignment.
        points[:, [0, 1]] = points[:, [1, 0]]
    return points
" + "#TODO: implement once a correction source is available.", + crs_info.azimuth_reference, + ) + + # --- 1. Areal rotation (RESQML: clockwise) ---------------------------- + angle_rad = crs_info.areal_rotation_rad() + if angle_rad != 0.0: + cos_t = math.cos(angle_rad) + sin_t = math.sin(angle_rad) + x_orig = pts[:, 0].copy() + y_orig = pts[:, 1].copy() + # CW rotation: x' = x·cos + y·sin, y' = -x·sin + y·cos + pts[:, 0] = x_orig * cos_t + y_orig * sin_t + pts[:, 1] = -x_orig * sin_t + y_orig * cos_t + + # --- 2. Translation --------------------------------------------------- + pts[:, 0] += crs_info.x_offset + pts[:, 1] += crs_info.y_offset + pts[:, 2] += crs_info.z_offset + + # --- 3. Z-axis flip --------------------------------------------------- + # When z-increasing-downward the local CRS stores depth as positive Z + # (down = positive). Negate so the output uses the z-up (elevation- + # positive) convention expected by most 3-D viewers. + if crs_info.z_increasing_downward: + pts[:, 2] = -pts[:, 2] + + # --- 4. Axis-order swap ----------------------------------------------- + apply_axis_order_swap(pts, crs_info.projected_axis_order) + + if inplace: + points[:] = pts + return points + return pts + + +# --------------------------------------------------------------------------- +# Public factory +# --------------------------------------------------------------------------- + + +def extract_crs_info( + crs_obj: Any, + workspace: Optional[EnergymlStorageInterface] = None, +) -> CrsInfo: + """ + Extract all available CRS metadata from *any* energyml CRS object into a + version-neutral :class:`CrsInfo` DTO. 
+ + Supported types (matched case-insensitively on the class name): + + **RESQML v2.0.1** + + - ``ObjLocalDepth3DCrs`` / ``LocalDepth3dCrs`` + - ``ObjLocalTime3DCrs`` / ``LocalTime3dCrs`` + - Any subclass of ``AbstractLocal3dCrs`` + + **EML v2.3 / RESQML v2.2** + + - ``LocalEngineeringCompoundCrs`` + - ``LocalEngineering2dCrs`` (also handled standalone) + - ``VerticalCrs`` (also handled standalone) + + Parameters + ---------- + crs_obj: + An energyml CRS data object. May be ``None`` — in that case a default + ``CrsInfo()`` is returned (all zeros / None). + workspace: + Optional storage interface used to resolve + ``DataObjectReference`` links in v2.2 compound CRS objects. + When ``None``, only inline data is extracted (partial result). + + Returns + ------- + CrsInfo + Populated DTO. Never raises — errors are logged at DEBUG / WARNING + level and graceful defaults are returned. + """ + if crs_obj is None: + return CrsInfo() + + # Transparently resolve DataObjectReference inputs (e.g. from get_datum_information) + # so callers do not have to resolve DORs before calling this function. 
+ if workspace is not None: + crs_obj = _resolve_dor(crs_obj, workspace) + if crs_obj is None: + return CrsInfo() + + type_name_lower = type(crs_obj).__name__.lower() + + # ------------------------------------------------------------------ + # v2.2 / EML v2.3 types + # ------------------------------------------------------------------ + if "localengineeringcompoundcrs" in type_name_lower: + return _from_local_engineering_compound_crs(crs_obj, workspace) + + if "localengineering2dcrs" in type_name_lower or "localengineering2" in type_name_lower: + return _from_local_engineering2d_crs(crs_obj, workspace) + + if type_name_lower == "verticalcrs": + return _from_vertical_crs(crs_obj) + + # ------------------------------------------------------------------ + # v2.0.1 types (LocalDepth3dCrs, LocalTime3dCrs, AbstractLocal3dCrs) + # ------------------------------------------------------------------ + if any(kw in type_name_lower for kw in ("localdepth3dcrs", "localtime3dcrs", "abstractlocal3dcrs", "local3dcrs")): + return _from_abstract_local3dcrs(crs_obj, workspace) + + # ------------------------------------------------------------------ + # Heuristic fallback: inspect the object's attributes to guess version + # ------------------------------------------------------------------ + # v2.0.1 pattern: has XOffset / YOffset + if get_object_attribute_rgx(crs_obj, "[Xx][Oo]ffset") is not None: + logger.debug( + "extract_crs_info: unrecognised type '%s' — treating as AbstractLocal3dCrs (v2.0.1 pattern).", + type(crs_obj).__name__, + ) + return _from_abstract_local3dcrs(crs_obj, workspace) + + # v2.2 pattern: has OriginProjectedCoordinate1 (LocalEngineering2dCrs) + if get_object_attribute_rgx(crs_obj, "[Oo]rigin[Pp]rojected[Cc]oordinate1") is not None: + logger.debug( + "extract_crs_info: unrecognised type '%s' — treating as LocalEngineering2dCrs (v2.2 pattern).", + type(crs_obj).__name__, + ) + return _from_local_engineering2d_crs(crs_obj, workspace) + + # v2.2 pattern: has 
LocalEngineering2dCrs DOR → compound + if get_object_attribute_rgx(crs_obj, "[Ll]ocal[Ee]ngineering2[dD][Cc]rs") is not None: + logger.debug( + "extract_crs_info: unrecognised type '%s' — treating as LocalEngineeringCompoundCrs (v2.2 pattern).", + type(crs_obj).__name__, + ) + return _from_local_engineering_compound_crs(crs_obj, workspace) + + logger.warning( + "extract_crs_info: unsupported CRS type '%s' — returning default CrsInfo.", + type(crs_obj).__name__, + ) + return CrsInfo(source_type=type(crs_obj).__name__) + + +__all__ = [ + "CrsInfo", + "extract_crs_info", + "apply_from_crs_info", + "apply_axis_order_swap", +] diff --git a/energyml-utils/src/energyml/utils/data/datasets_io.py b/energyml-utils/src/energyml/utils/data/datasets_io.py index d899015..7ba2834 100644 --- a/energyml-utils/src/energyml/utils/data/datasets_io.py +++ b/energyml-utils/src/energyml/utils/data/datasets_io.py @@ -51,6 +51,22 @@ except Exception: __PARQUET_MODULE_EXISTS__ = False +try: + import lasio + + __LASIO_MODULE_EXISTS__ = True +except Exception: + lasio = None + __LASIO_MODULE_EXISTS__ = False + +try: + import segyio + + __SEGYIO_MODULE_EXISTS__ = True +except Exception: + segyio = None + __SEGYIO_MODULE_EXISTS__ = False + # HDF5 if __H5PY_MODULE_EXISTS__: @@ -142,7 +158,6 @@ def extract_h5_datasets( @dataclass class HDF5FileWriter: - def write_array( self, target: Union[str, BytesIO, bytes, "h5py.File"], @@ -193,7 +208,6 @@ def extract_h5_datasets( raise MissingExtraInstallation(extra_name="hdf5") class HDF5FileWriter: - def write_array( self, target: Union[str, BytesIO, bytes, Any], @@ -580,15 +594,14 @@ def read_dataset( mimetype: Optional[str] = "application/x-hdf5", ) -> Any: mimetype = (mimetype or "").lower() - file_reader = HDF5FileReader() # default is hdf5 if "parquet" in mimetype or ( isinstance(source, str) and (source.lower().endswith(".parquet") or source.lower().endswith(".pqt")) ): file_reader = ParquetFileReader() - elif "csv" in mimetype or ( - 
class FileHandlerRegistry:
    """
    Registry that maps file extensions to external-array handler factories.

    The registry stores *factories* (zero-argument callables), not handler
    instances: every lookup builds a fresh handler.  This lets the default
    registrations refer to handler classes lazily, so the registry can be
    created before those classes are defined.

    Usage:
        registry = FileHandlerRegistry()
        handler = registry.get_handler_for_file("data.h5")
        if handler:
            array = handler.read_array("data.h5", "/dataset/path")
    """

    # Extension whose handler is used when a file's own extension is unknown.
    _FALLBACK_EXTENSION = ".h5"

    def __init__(self, max_open_files: int = 3):
        """
        Args:
            max_open_files: Forwarded to every default handler created by
                this registry (size of the handler's open-file cache).
        """
        self._handlers: Dict[str, Callable[[], "ExternalArrayHandler"]] = {}
        self._register_default_handlers(max_open_files)

    @staticmethod
    def _normalize_extension(extension: str) -> str:
        """Return *extension* lower-cased and with a leading dot."""
        ext = extension.lower()
        return ext if ext.startswith(".") else "." + ext

    def _register_default_handlers(self, max_open_files: int) -> None:
        """
        Register the built-in handlers according to the installed extras.

        When an optional dependency is missing, the matching ``Mock*`` handler
        is registered instead.  ``max_open_files`` is passed to each handler
        constructor.
        NOTE(review): assumes every handler class (including the SEG-Y and
        mock LAS/SEG-Y ones) accepts ``max_open_files`` like the handlers
        defined in this module do — confirm.
        """
        # HDF5 handler
        if __H5PY_MODULE_EXISTS__:
            self.register_handler([".h5", ".hdf5"], lambda: HDF5ArrayHandler(max_open_files=max_open_files))
        else:
            self.register_handler([".h5", ".hdf5"], lambda: MockHDF5ArrayHandler(max_open_files=max_open_files))

        # Parquet handler
        if __PARQUET_MODULE_EXISTS__:
            self.register_handler([".parquet", ".pq"], lambda: ParquetArrayHandler(max_open_files=max_open_files))
        else:
            self.register_handler([".parquet", ".pq"], lambda: MockParquetArrayHandler(max_open_files=max_open_files))

        # CSV handler (no mock counterpart: simply not registered when unavailable)
        if __CSV_MODULE_EXISTS__:
            self.register_handler([".csv", ".txt"], lambda: CSVArrayHandler(max_open_files=max_open_files))

        # LAS handler
        if __LASIO_MODULE_EXISTS__:
            self.register_handler([".las"], lambda: LASArrayHandler(max_open_files=max_open_files))
        else:
            self.register_handler([".las"], lambda: MockLASArrayHandler(max_open_files=max_open_files))

        # SEG-Y handler
        if __SEGYIO_MODULE_EXISTS__:
            self.register_handler([".sgy", ".segy"], lambda: SEGYArrayHandler(max_open_files=max_open_files))
        else:
            self.register_handler([".sgy", ".segy"], lambda: MockSEGYArrayHandler(max_open_files=max_open_files))

    def register_handler(self, extensions: List[str], handler_factory: Callable[[], "ExternalArrayHandler"]) -> None:
        """
        Register a handler factory for the given file extensions.

        An existing registration for the same extension is replaced.

        Args:
            extensions: File extensions, with or without leading dot
                (e.g. ``['.h5', 'hdf5']``); matching is case-insensitive.
            handler_factory: Callable that returns a new handler instance.
        """
        for ext in extensions:
            self._handlers[self._normalize_extension(ext)] = handler_factory

    def get_handler_for_file(self, file_path: str) -> Optional["ExternalArrayHandler"]:
        """
        Get a handler for a file based on its extension.

        Args:
            file_path: Path to the file.

        Returns:
            A new handler instance for the file's extension.  When the
            extension is not registered, the ``.h5`` handler is returned as a
            default, unless it is a mock (class name contains "mock") or not
            registered at all — in that case ``None``.
        """
        ext = os.path.splitext(file_path)[1].lower()

        if ext in self._handlers:
            return self._handlers[ext]()

        # Unknown extension: fall back to the HDF5 handler when a real one is
        # available.  The instance used for the mock check is the one
        # returned, so the factory runs only once.
        if self._FALLBACK_EXTENSION in self._handlers:
            fallback = self._handlers[self._FALLBACK_EXTENSION]()
            if "mock" not in fallback.__class__.__name__.lower():
                return fallback
        return None

    def supports_extension(self, extension: str) -> bool:
        """
        Check if a handler is registered for the given extension.

        Args:
            extension: File extension (with or without leading dot).

        Returns:
            True if a handler (real or mock) is registered.
        """
        return self._normalize_extension(extension) in self._handlers
= None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + ) -> Optional[np.ndarray]: + """Read array from HDF5 with best-effort zero-copy semantics. + + For contiguous, uncompressed datasets the returned array is backed + by the memory-mapped file buffer (no copy). For chunked or + compressed datasets h5py transparently falls back to a copy, but + sub-selection is done by h5py in C before the data reaches Python + (avoids loading the full dataset then slicing in Python). + + The caller **must not mutate** the returned array. + """ + if isinstance(source, h5py.File): # type: ignore + if not path_in_external_file: + return None + d_group = source[path_in_external_file] + if start_indices is not None and counts is not None: + # h5py reads only the required chunks/slabs from disk + slices = tuple(slice(start, start + count) for start, count in zip(start_indices, counts)) + return d_group[slices] # type: ignore + # np.array with copy=False returns a view for contiguous datasets + # Note: copy= kwarg on np.asarray requires numpy >=2.0; + # np.array(x, copy=False) works on all numpy versions. 
+ return np.array(d_group, copy=False) # type: ignore + else: + with self.file_cache.get_or_open(source, self, "r") as f: # type: ignore + return self.read_array_view(f, path_in_external_file, start_indices, counts) + + def write_array( + self, + target: Union[str, BytesIO, Any], + array: Union[list, np.ndarray], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + **kwargs, + ) -> bool: + """Write array to HDF5 file with optional offset.""" + if not path_in_external_file: + return False + + if isinstance(array, list): + array = np.asarray(array) + + dtype = kwargs.get("dtype") + if dtype is not None and not isinstance(dtype, np.dtype): + dtype = np.dtype(dtype) + + try: + if isinstance(target, h5py.File): # type: ignore + if isinstance(array, np.ndarray) and array.dtype == "O": + array = np.asarray([s.encode() if isinstance(s, str) else s for s in array]) + np.void(array) + + # Handle partial writes if start_indices provided + if start_indices is not None and path_in_external_file in target: + dset = target[path_in_external_file] + slices = tuple(slice(start, start + dim) for start, dim in zip(start_indices, array.shape)) + dset[slices] = array + else: + dset = target.create_dataset(path_in_external_file, array.shape, dtype or array.dtype) + dset[()] = array + else: + # with self.file_cache.get_or_open(target, self, "a") as f: # type: ignore + # return self.write_array(f, array, path_in_external_file, start_indices, **kwargs) + return self.write_array( + self.file_cache.get_or_open(target, self, "a"), + array, + path_in_external_file, + start_indices, + **kwargs, + ) + + return True + except Exception as e: + logging.error(f"Failed to write array to HDF5: {e}") + return False + + def get_array_metadata( + self, + source: Union[BytesIO, str, Any], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + ) -> Optional[Union[dict, List[dict]]]: + """Get 
metadata for HDF5 datasets with optional sub-selection.""" + try: + if isinstance(source, h5py.File): # type: ignore + if path_in_external_file: + dset = source[path_in_external_file] + shape = list(dset.shape) + size = dset.size + + # Adjust shape and size for sub-selection + if start_indices is not None and counts is not None: + shape = counts + size = int(np.prod(counts)) + + return { + "path": path_in_external_file, + "dtype": str(dset.dtype), + "shape": shape, + "size": size, + } + else: + # List all datasets + datasets = h5_list_datasets(source) + return [self.get_array_metadata(source, ds, start_indices, counts) for ds in datasets] + else: + # with self.file_cache.get_or_open(source, self, "r") as f: # type: ignore + # return self.get_array_metadata(f, path_in_external_file, start_indices, counts) + return self.get_array_metadata( + self.file_cache.get_or_open(source, self, "r"), path_in_external_file, start_indices, counts + ) + except Exception as e: + logging.debug(f"Failed to get HDF5 metadata: {e}") + return None + + def list_arrays(self, source: Union[BytesIO, str, Any]) -> List[str]: + """List all datasets in HDF5 file.""" + return h5_list_datasets(source) + + def can_handle_file(self, file_path: str) -> bool: + """Check if this handler can process the file.""" + ext = os.path.splitext(file_path)[1].lower() + return ext in [".h5", ".hdf5"] # dat for Galaxy compatibility + +else: + + class MockHDF5ArrayHandler(ExternalArrayHandler): + """Mock handler when h5py is not installed.""" + + def __init__(self, max_open_files: int = 3): + super().__init__(max_open_files=max_open_files) + + def open_file_no_cache(self, file_path: str, mode: str = "r") -> Optional[Any]: + """Open an HDF5 file without using the cache.""" + return None + + def read_array( + self, + source: Union[BytesIO, str, Any], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + ) -> Optional[np.ndarray]: + raise 
MissingExtraInstallation(extra_name="hdf5") + + def write_array( + self, + target: Union[str, BytesIO, Any], + array: Union[list, np.ndarray], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + **kwargs, + ) -> bool: + raise MissingExtraInstallation(extra_name="hdf5") + + def get_array_metadata( + self, + source: Union[BytesIO, str, Any], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + ) -> Optional[Union[dict, List[dict]]]: + raise MissingExtraInstallation(extra_name="hdf5") + + def list_arrays(self, source: Union[BytesIO, str, Any]) -> List[str]: + raise MissingExtraInstallation(extra_name="hdf5") + + def can_handle_file(self, file_path: str) -> bool: + return os.path.splitext(file_path)[1].lower() in [".h5", ".hdf5"] # dat for Galaxy compatibility + + # Alias so the public name is always importable + HDF5ArrayHandler = MockHDF5ArrayHandler + + +# Parquet Handler +if __PARQUET_MODULE_EXISTS__: + + class ParquetArrayHandler(ExternalArrayHandler): + """Handler for Parquet files (.parquet, .pq).""" + + def __init__(self, max_open_files: int = 3): + super().__init__(max_open_files=max_open_files) + + def open_file_no_cache(self, file_path: str, mode: str = "r") -> Optional[Any]: + """Open a Parquet file without using the cache.""" + try: + return pq.ParquetFile(file_path) # type: ignore + except Exception as e: + logging.error(f"Failed to open Parquet file {file_path}: {e}") + return None + + def read_array( + self, + source: Union[BytesIO, str, Any], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + ) -> Optional[np.ndarray]: + """Read array from Parquet file with optional sub-selection.""" + if isinstance(source, bytes): + source = pa.BufferReader(source) + + table = pq.read_table(source) + + if path_in_external_file: + array = 
np.array(table[path_in_external_file]) + else: + # Return all columns as 2D array + array = table.to_pandas().values + + # Apply sub-selection if specified + if array is not None and start_indices is not None and counts is not None: + slices = tuple(slice(start, start + count) for start, count in zip(start_indices, counts)) + return array[slices] + return array + + def write_array( + self, + target: Union[str, BytesIO, Any], + array: Union[list, np.ndarray], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + **kwargs, + ) -> bool: + """Write array to Parquet file.""" + column_titles = kwargs.get("column_titles") + + try: + # Convert to numpy array if needed + if not isinstance(array, np.ndarray): + array = np.array(array) + + # Handle 2D arrays properly: rows as rows, columns as columns + if array.ndim == 2: + # Create DataFrame where each column is a dimension + if column_titles is None: + column_titles = [str(i) for i in range(array.shape[1])] + array_as_pd_df = pd.DataFrame(array, columns=column_titles) + elif array.ndim == 1: + # 1D array becomes a single column + col_name = column_titles[0] if column_titles else "0" + array_as_pd_df = pd.DataFrame({col_name: array}) + else: + # For higher dimensions, flatten or handle as needed + logging.warning(f"Parquet writer received {array.ndim}D array, flattening to 2D") + array_2d = array.reshape(array.shape[0], -1) + if column_titles is None: + column_titles = [str(i) for i in range(array_2d.shape[1])] + array_as_pd_df = pd.DataFrame(array_2d, columns=column_titles) + + pq.write_table( + pa.Table.from_pandas(array_as_pd_df), + target, + version="2.6", + compression="snappy", + ) + return True + except Exception as e: + logging.error(f"Failed to write array to Parquet: {e}") + return False + + def get_array_metadata( + self, + source: Union[BytesIO, str, Any], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: 
Optional[List[int]] = None, + ) -> Optional[Union[dict, List[dict]]]: + """Get metadata for Parquet columns with optional sub-selection.""" + try: + if isinstance(source, bytes): + source = pa.BufferReader(source) + + metadata = pq.read_metadata(source) + schema = pq.read_schema(source) + + if path_in_external_file: + # Get specific column metadata + col_idx = schema.get_field_index(path_in_external_file) + if col_idx >= 0: + field = schema.field(col_idx) + shape = [metadata.num_rows] + size = metadata.num_rows + + # Adjust for sub-selection + if start_indices is not None and counts is not None: + shape = counts + size = int(np.prod(counts)) + + return { + "path": path_in_external_file, + "dtype": str(field.type), + "shape": shape, + "size": size, + } + else: + # Get all columns + return [self.get_array_metadata(source, field.name, start_indices, counts) for field in schema] + except Exception as e: + logging.debug(f"Failed to get Parquet metadata: {e}") + return None + + def list_arrays(self, source: Union[BytesIO, str, Any]) -> List[str]: + """List all columns in Parquet file.""" + try: + if isinstance(source, bytes): + source = pa.BufferReader(source) + schema = pq.read_schema(source) + return [field.name for field in schema] + except Exception: + return [] + + def can_handle_file(self, file_path: str) -> bool: + """Check if this handler can process the file.""" + ext = os.path.splitext(file_path)[1].lower() + return ext in [".parquet", ".pq"] + +else: + + class MockParquetArrayHandler(ExternalArrayHandler): + """Mock handler when parquet libraries are not installed.""" + + def __init__(self, max_open_files: int = 3): + super().__init__(max_open_files=max_open_files) + + def open_file_no_cache(self, file_path: str, mode: str = "r") -> Optional[Any]: + """Open a Parquet file without using the cache.""" + return None + + def read_array( + self, + source: Union[BytesIO, str, Any], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = 
# CSV Handler
if __CSV_MODULE_EXISTS__:

    class CSVArrayHandler(ExternalArrayHandler):
        """External-array handler for comma-separated text files (.csv, .txt)."""

        def __init__(self, max_open_files: int = 3):
            super().__init__(max_open_files=max_open_files)

        def open_file_no_cache(self, file_path: str, mode: str = "r") -> Optional[Any]:
            """Open *file_path* with the builtin ``open``; return ``None`` (and log an error) on failure."""
            try:
                return open(file_path, mode)
            except Exception as e:
                logging.error(f"Failed to open CSV file {file_path}: {e}")
                return None

        def read_array(
            self,
            source: Union[BytesIO, str, Any],
            path_in_external_file: Optional[str] = None,
            start_indices: Optional[List[int]] = None,
            counts: Optional[List[int]] = None,
        ) -> Optional[np.ndarray]:
            """
            Load the whole file with ``np.genfromtxt`` (comma delimiter).

            ``path_in_external_file`` is accepted for interface compatibility
            but is not used: the complete table is always read.  When both
            ``start_indices`` and ``counts`` are given, the result is sliced
            with ``start:start + count`` per dimension.

            Returns:
                The (optionally sliced) array, or ``None`` if reading failed
                (the failure is logged at DEBUG level).
            """
            try:
                table = np.genfromtxt(source, delimiter=",")

                # No (complete) sub-selection requested: hand back everything.
                if table is None or start_indices is None or counts is None:
                    return table

                window = tuple(slice(first, first + length) for first, length in zip(start_indices, counts))
                return table[window]
            except Exception as e:
                logging.debug(f"Failed to read CSV: {e}")
                return None

        def write_array(
            self,
            target: Union[str, BytesIO, Any],
            array: Union[list, np.ndarray],
            path_in_external_file: Optional[str] = None,
            start_indices: Optional[List[int]] = None,
            **kwargs,
        ) -> bool:
            """
            Write *array* to *target* with ``np.savetxt`` (comma delimiter).

            ``path_in_external_file``, ``start_indices`` and ``kwargs`` are
            not used by this handler.

            Returns:
                ``True`` on success, ``False`` if writing failed (the failure
                is logged at ERROR level).
            """
            try:
                values = np.asarray(array) if isinstance(array, list) else array
                np.savetxt(target, values, delimiter=",")
                return True
            except Exception as e:
                logging.error(f"Failed to write CSV: {e}")
                return False

        def get_array_metadata(
            self,
            source: Union[BytesIO, str, Any],
            path_in_external_file: Optional[str] = None,
            start_indices: Optional[List[int]] = None,
            counts: Optional[List[int]] = None,
        ) -> Optional[Union[dict, List[dict]]]:
            """
            Describe the array that ``read_array`` returns for the same arguments.

            The data is actually read in order to compute the metadata.

            Returns:
                A dict with ``path``, ``dtype``, ``shape`` and ``size``, or
                ``None`` when the file could not be read.
            """
            try:
                table = self.read_array(source, path_in_external_file, start_indices, counts)
                if table is None:
                    return None
                return {
                    "path": path_in_external_file or "",
                    "dtype": str(table.dtype),
                    "shape": list(table.shape),
                    "size": table.size,
                }
            except Exception as e:
                logging.debug(f"Failed to get CSV metadata: {e}")
                return None

        def list_arrays(self, source: Union[BytesIO, str, Any]) -> List[str]:
            """Return an empty list: CSV files have no named datasets."""
            return []

        def can_handle_file(self, file_path: str) -> bool:
            """Return ``True`` when *file_path* ends with ``.csv`` or ``.txt`` (case-insensitive)."""
            return os.path.splitext(file_path)[1].lower() in [".csv", ".txt"]
__init__(self, max_open_files: int = 3): + super().__init__(max_open_files=max_open_files) + + def open_file_no_cache(self, file_path: str, mode: str = "r") -> Optional[Any]: + """Open a LAS file without using the cache.""" + try: + return lasio.read(file_path) # type: ignore + except Exception as e: + logging.error(f"Failed to open LAS file {file_path}: {e}") + return None + + def read_array( + self, + source: Union[BytesIO, str, Any], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + ) -> Optional[np.ndarray]: + """ + Read array from LAS file. + + Args: + source: Path to LAS file or BytesIO object + path_in_external_file: Comma-separated list of mnemonics to read from ~A block + start_indices: Starting index for each dimension (optional) + counts: Number of elements to read for each dimension (optional) + + Returns: + NumPy array with requested curves, or None if reading failed + """ + try: + # Load LAS file + las = lasio.read(source) + + if path_in_external_file is None or path_in_external_file.strip() == "": + # Return all curves as 2D array (depth, curves) + data = las.data + else: + # Parse mnemonic list (comma or semicolon separated) + mnemonics = [m.strip() for m in path_in_external_file.replace(";", ",").split(",")] + + # Extract specified curves + curves_data = [] + for mnemonic in mnemonics: + if mnemonic in las.keys(): + curves_data.append(las[mnemonic]) + else: + logging.warning(f"Mnemonic '{mnemonic}' not found in LAS file") + + if not curves_data: + logging.error("No valid mnemonics found in LAS file") + return None + + # Stack curves horizontally + data = np.column_stack(curves_data) if len(curves_data) > 1 else np.array(curves_data[0]) + + # Apply slicing if specified + if start_indices is not None or counts is not None: + slices = [] + for dim in range(len(data.shape)): + start = start_indices[dim] if start_indices and dim < len(start_indices) else 0 + count = 
counts[dim] if counts and dim < len(counts) else data.shape[dim] - start + slices.append(slice(start, start + count)) + data = data[tuple(slices)] + + return np.array(data) + + except Exception as e: + logging.error(f"Failed to read LAS file: {e}") + return None + + def write_array( + self, + target: Union[str, BytesIO, Any], + array: Union[list, np.ndarray], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + **kwargs, + ) -> bool: + """ + Write array to LAS file. + + Args: + target: Path to LAS file + array: NumPy array or list to write + path_in_external_file: Comma-separated list of mnemonics for curves + start_indices: Not used for LAS files + **kwargs: Additional parameters (well_name, field, etc.) + + Returns: + True if successful, False otherwise + """ + try: + # Convert to numpy array + if not isinstance(array, np.ndarray): + array = np.array(array) + + # Create new LAS file + las = lasio.LASFile() + + # Set well information from kwargs + if "well_name" in kwargs: + las.well.WELL = kwargs["well_name"] + if "field" in kwargs: + las.well.FLD = kwargs["field"] + if "company" in kwargs: + las.well.COMP = kwargs["company"] + + # Parse mnemonics if provided + mnemonics = None + if path_in_external_file: + mnemonics = [m.strip() for m in path_in_external_file.replace(";", ",").split(",")] + + # Add curves + if array.ndim == 1: + # Single curve + mnemonic = mnemonics[0] if mnemonics else "DATA" + las.append_curve(mnemonic, array, unit=kwargs.get("unit", "")) + else: + # Multiple curves + for i in range(array.shape[1]): + mnemonic = mnemonics[i] if mnemonics and i < len(mnemonics) else f"CURVE{i}" + las.append_curve(mnemonic, array[:, i], unit=kwargs.get("unit", "")) + + # Write to file + if isinstance(target, str): + las.write(target) + else: + # For BytesIO, write to string then encode + las_str = las.write(None) # Returns string + target.write(las_str.encode("utf-8")) + + return True + + except Exception as e: + 
logging.error(f"Failed to write LAS file: {e}") + return False + + def get_array_metadata( + self, + source: Union[BytesIO, str, Any], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + ) -> Optional[Union[dict, List[dict]]]: + """ + Get metadata for LAS file curves. + + Args: + source: Path to LAS file or BytesIO object + path_in_external_file: Comma-separated list of mnemonics + + Returns: + Dictionary with metadata (shape, dtype, curves, well_info) + """ + try: + las = lasio.read(source) + + # Get curve information + curves_info = [] + for curve in las.curves: + curves_info.append( + { + "mnemonic": curve.mnemonic, + "unit": curve.unit, + "descr": curve.descr, + "data_points": len(curve.data), + } + ) + + # Get overall metadata + metadata = { + "shape": las.data.shape, + "dtype": str(las.data.dtype), + "curves": curves_info, + "well_info": { + "well_name": las.well.WELL.value if hasattr(las.well, "WELL") else None, + "field": las.well.FLD.value if hasattr(las.well, "FLD") else None, + "company": las.well.COMP.value if hasattr(las.well, "COMP") else None, + }, + "version": las.version.VERS.value if hasattr(las.version, "VERS") else None, + } + + return metadata + + except Exception as e: + logging.error(f"Failed to get LAS metadata: {e}") + return None + + def list_arrays(self, source: Union[BytesIO, str, Any]) -> List[str]: + """List all curve mnemonics in LAS file.""" + try: + las = lasio.read(source) + return [curve.mnemonic for curve in las.curves] + except Exception as e: + logging.error(f"Failed to list LAS curves: {e}") + return [] + + def can_handle_file(self, file_path: str) -> bool: + """Check if this handler can process the file.""" + ext = os.path.splitext(file_path)[1].lower() + return ext == ".las" + +else: + + class MockLASArrayHandler(ExternalArrayHandler): + """Mock handler when lasio is not installed.""" + + def __init__(self, max_open_files: int = 3): + 
super().__init__(max_open_files=max_open_files) + + def open_file_no_cache(self, file_path: str, mode: str = "r") -> Optional[Any]: + """Open a LAS file without using the cache.""" + return None + + def read_array( + self, + source: Union[BytesIO, str, Any], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + ) -> Optional[np.ndarray]: + raise MissingExtraInstallation(extra_name="las") + + def write_array( + self, + target: Union[str, BytesIO, Any], + array: Union[list, np.ndarray], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + **kwargs, + ) -> bool: + raise MissingExtraInstallation(extra_name="las") + + def get_array_metadata( + self, + source: Union[BytesIO, str, Any], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + ) -> Optional[Union[dict, List[dict]]]: + raise MissingExtraInstallation(extra_name="las") + + def list_arrays(self, source: Union[BytesIO, str, Any]) -> List[str]: + raise MissingExtraInstallation(extra_name="las") + + def can_handle_file(self, file_path: str) -> bool: + """Check if this handler can process the file.""" + ext = os.path.splitext(file_path)[1].lower() + return ext == ".las" + + # Alias so the public name is always importable + LASArrayHandler = MockLASArrayHandler + + +# SEG-Y Handler +if __SEGYIO_MODULE_EXISTS__: + + class SEGYArrayHandler(ExternalArrayHandler): + """Handler for SEG-Y seismic files (.sgy, .segy).""" + + def __init__(self, max_open_files: int = 3): + super().__init__(max_open_files=max_open_files) + + def open_file_no_cache(self, file_path: str, mode: str = "r") -> Optional[Any]: + """Open a SEG-Y file without using the cache.""" + try: + return segyio.open(file_path, mode, ignore_geometry=True) # type: ignore + except Exception as e: + logging.error(f"Failed to open SEG-Y file {file_path}: {e}") + return None 
+ + def read_array( + self, + source: Union[BytesIO, str, Any], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + ) -> Optional[np.ndarray]: + """ + Read array from SEG-Y file. + + Args: + source: Path to SEG-Y file + path_in_external_file: Comma-separated list of trace headers or 'traces' for trace data + start_indices: Starting index [trace_start, sample_start] + counts: Number of elements [trace_count, sample_count] + + Returns: + NumPy array with requested data + """ + try: + # SEG-Y requires file path, not BytesIO + if not isinstance(source, str): + logging.error("SEG-Y handler requires file path, not BytesIO") + return None + + with segyio.open(source, "r", ignore_geometry=True) as f: + if path_in_external_file is None or path_in_external_file.strip().lower() == "traces": + # Read trace data + trace_start = start_indices[0] if start_indices and len(start_indices) > 0 else 0 + sample_start = start_indices[1] if start_indices and len(start_indices) > 1 else 0 + + trace_count = counts[0] if counts and len(counts) > 0 else len(f.trace) - trace_start + sample_count = counts[1] if counts and len(counts) > 1 else len(f.samples) - sample_start + + # Read traces + traces = [] + for i in range(trace_start, trace_start + trace_count): + if i < len(f.trace): + trace = f.trace[i][sample_start : sample_start + sample_count] + traces.append(trace) + + return np.array(traces) + else: + # Read trace headers + headers = [h.strip() for h in path_in_external_file.replace(";", ",").split(",")] + + trace_start = start_indices[0] if start_indices and len(start_indices) > 0 else 0 + trace_count = counts[0] if counts and len(counts) > 0 else len(f.trace) - trace_start + + # Extract header values + header_data = [] + for i in range(trace_start, trace_start + trace_count): + if i < len(f.trace): + trace_headers = f.header[i] + header_values = [ + trace_headers.get(segyio.TraceField.__dict__.get(h.upper(), 
0), 0) for h in headers + ] + header_data.append(header_values) + + return np.array(header_data) + + except Exception as e: + logging.error(f"Failed to read SEG-Y file: {e}") + return None + + def write_array( + self, + target: Union[str, BytesIO, Any], + array: Union[list, np.ndarray], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + **kwargs, + ) -> bool: + """ + Write array to SEG-Y file. + + Args: + target: Path to SEG-Y file + array: NumPy array (traces x samples) + path_in_external_file: Not used (SEG-Y structure is fixed) + **kwargs: Additional parameters (sample_interval, etc.) + + Returns: + True if successful, False otherwise + """ + try: + if not isinstance(target, str): + logging.error("SEG-Y handler requires file path for writing") + return False + + if not isinstance(array, np.ndarray): + array = np.array(array) + + # Ensure 2D array (traces x samples) + if array.ndim == 1: + array = array.reshape(1, -1) + + n_traces, n_samples = array.shape + + # Create SEG-Y file specification + spec = segyio.spec() + spec.format = kwargs.get("format", 1) # 1 = 4-byte IBM float + spec.samples = range(n_samples) + spec.tracecount = n_traces + + # Write SEG-Y file + with segyio.create(target, spec) as f: + for i in range(n_traces): + f.trace[i] = array[i, :] + + # Set sample interval if provided (in microseconds) + if "sample_interval" in kwargs: + f.bin[segyio.BinField.Interval] = kwargs["sample_interval"] + + return True + + except Exception as e: + logging.error(f"Failed to write SEG-Y file: {e}") + return False + + def get_array_metadata( + self, + source: Union[BytesIO, str, Any], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + ) -> Optional[Union[dict, List[dict]]]: + """ + Get metadata for SEG-Y file. 
+ + Returns: + Dictionary with shape, dtype, trace count, sample info + """ + try: + if not isinstance(source, str): + logging.error("SEG-Y handler requires file path") + return None + + with segyio.open(source, "r", ignore_geometry=True) as f: + metadata = { + "shape": (len(f.trace), len(f.samples)), + "dtype": str(f.dtype), + "trace_count": len(f.trace), + "sample_count": len(f.samples), + "sample_interval": f.bin[segyio.BinField.Interval], + "format": f.format, + "samples": f.samples.tolist() if hasattr(f.samples, "tolist") else list(f.samples), + } + + return metadata + + except Exception as e: + logging.error(f"Failed to get SEG-Y metadata: {e}") + return None + + def list_arrays(self, source: Union[BytesIO, str, Any]) -> List[str]: + """List available data in SEG-Y file (always 'traces').""" + return ["traces"] + + def can_handle_file(self, file_path: str) -> bool: + """Check if this handler can process the file.""" + ext = os.path.splitext(file_path)[1].lower() + return ext in [".sgy", ".segy"] + +else: + + class MockSEGYArrayHandler(ExternalArrayHandler): + """Mock handler when segyio is not installed.""" + + def __init__(self, max_open_files: int = 3): + super().__init__(max_open_files=max_open_files) + + def open_file_no_cache(self, file_path: str, mode: str = "r") -> Optional[Any]: + """Open a SEG-Y file without using the cache.""" + return None + + def read_array( + self, + source: Union[BytesIO, str, Any], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + ) -> Optional[np.ndarray]: + raise MissingExtraInstallation(extra_name="segy") + + def write_array( + self, + target: Union[str, BytesIO, Any], + array: Union[list, np.ndarray], + path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + **kwargs, + ) -> bool: + raise MissingExtraInstallation(extra_name="segy") + + def get_array_metadata( + self, + source: Union[BytesIO, str, Any], + 
path_in_external_file: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + ) -> Optional[Union[dict, List[dict]]]: + raise MissingExtraInstallation(extra_name="segy") + + def list_arrays(self, source: Union[BytesIO, str, Any]) -> List[str]: + raise MissingExtraInstallation(extra_name="segy") + + def can_handle_file(self, file_path: str) -> bool: + """Check if this handler can process the file.""" + ext = os.path.splitext(file_path)[1].lower() + return ext in [".sgy", ".segy"] + + # Alias so the public name is always importable + SEGYArrayHandler = MockSEGYArrayHandler diff --git a/energyml-utils/src/energyml/utils/data/export.py b/energyml-utils/src/energyml/utils/data/export.py index 48d9681..3cb4f67 100644 --- a/energyml-utils/src/energyml/utils/data/export.py +++ b/energyml-utils/src/energyml/utils/data/export.py @@ -2,19 +2,62 @@ # SPDX-License-Identifier: Apache-2.0 """ Module for exporting mesh data to various file formats. -Supports OBJ, GeoJSON, VTK, and STL formats. + +Supports OBJ, GeoJSON, VTK Legacy (ASCII + binary), VTK XML (.vtu / .vtp), +and STL formats. + +Both the legacy :class:`AbstractMesh` hierarchy (``mesh.py``) and the +high-performance :class:`NumpyMesh` / :class:`NumpyMultiMesh` hierarchy +(``mesh_numpy.py``) are accepted by every export function. + +CRS-displacement can be applied at export time (rather than at read time) by +passing ``use_crs_displacement=True`` (default) when a workspace is reachable +through the ``contexts`` dict. The original ``NumpyMesh.points`` arrays are +**never mutated** — a copy is made whenever CRS needs to be applied. + +Color metadata is sourced from :class:`RepresentationContext` objects keyed +by ``source_uuid``; if none are provided a default palette is used. 
""" +from __future__ import annotations + +import base64 import json +import logging import struct from enum import Enum from pathlib import Path -from typing import TYPE_CHECKING, BinaryIO, List, Optional, TextIO, Union +from typing import TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, TextIO, Union import numpy as np if TYPE_CHECKING: - from .mesh import AbstractMesh + from energyml.utils.data.mesh import AbstractMesh + from energyml.utils.data.mesh_numpy import ( + NumpyMesh, + NumpyMultiMesh, + NumpyPolylineMesh, + NumpyPointSetMesh, + NumpySurfaceMesh, + NumpyVolumeMesh, + ) + from energyml.utils.data.representation_context import RepresentationContext + +log = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# VTK cell-type constants (subset) +# --------------------------------------------------------------------------- +_VTK_VERTEX = 1 +_VTK_POLY_LINE = 4 +_VTK_TRIANGLE = 5 +_VTK_POLYGON = 7 +_VTK_TETRA = 10 +_VTK_HEXAHEDRON = 12 + +# --------------------------------------------------------------------------- +# Enumerations / option classes +# --------------------------------------------------------------------------- class ExportFormat(Enum): @@ -23,6 +66,8 @@ class ExportFormat(Enum): OBJ = "obj" GEOJSON = "geojson" VTK = "vtk" + VTU = "vtu" + VTP = "vtp" STL = "stl" @classmethod @@ -43,35 +88,61 @@ def all_extensions(cls) -> List[str]: class ExportOptions: """Base class for export options.""" - pass - class STLExportOptions(ExportOptions): """Options for STL export.""" def __init__(self, binary: bool = True, ascii_precision: int = 6): """ - Initialize STL export options. - - :param binary: If True, export as binary STL; if False, export as ASCII STL - :param ascii_precision: Number of decimal places for ASCII format + :param binary: If True, export as binary STL; if False, export as ASCII STL. + :param ascii_precision: Number of decimal places for ASCII format. 
""" self.binary = binary self.ascii_precision = ascii_precision +class VTKFormat(Enum): + """Sub-format selector for VTK export.""" + + LEGACY_ASCII = "legacy_ascii" + """VTK legacy format, ASCII encoding (version 3.0).""" + + LEGACY_BINARY = "legacy_binary" + """VTK legacy format, big-endian binary encoding (version 3.0).""" + + VTU = "vtu" + """VTK XML UnstructuredGrid (.vtu) — best for volumetric meshes.""" + + VTP = "vtp" + """VTK XML PolyData (.vtp) — best for surface / polyline meshes.""" + + class VTKExportOptions(ExportOptions): """Options for VTK export.""" - def __init__(self, binary: bool = False, dataset_name: str = "mesh"): + def __init__( + self, + vtk_format: VTKFormat = VTKFormat.LEGACY_ASCII, + dataset_name: str = "mesh", + # Legacy compatibility: binary=True is equivalent to vtk_format=VTKFormat.LEGACY_BINARY + binary: bool = False, + ): """ - Initialize VTK export options. - - :param binary: If True, export as binary VTK; if False, export as ASCII VTK - :param dataset_name: Name of the dataset in VTK file + :param vtk_format: VTK sub-format (legacy ASCII, legacy binary, VTU, VTP). + :param dataset_name: Dataset name embedded in legacy VTK header or XML title. + :param binary: Deprecated shorthand; when True, forces LEGACY_BINARY sub-format. """ - self.binary = binary self.dataset_name = dataset_name + if binary and vtk_format == VTKFormat.LEGACY_ASCII: + # Honour the legacy binary=True flag so old call-sites still work. + self.vtk_format = VTKFormat.LEGACY_BINARY + else: + self.vtk_format = vtk_format + + # Backward-compat property so code that reads ``options.binary`` still works. + @property + def binary(self) -> bool: + return self.vtk_format == VTKFormat.LEGACY_BINARY class GeoJSONExportOptions(ExportOptions): @@ -79,204 +150,867 @@ class GeoJSONExportOptions(ExportOptions): def __init__(self, indent: Optional[int] = 2, properties: Optional[dict] = None): """ - Initialize GeoJSON export options. 
- - :param indent: JSON indentation level (None for compact) - :param properties: Additional properties to include in features + :param indent: JSON indentation level (None for compact output). + :param properties: Extra properties merged into every feature. """ self.indent = indent self.properties = properties or {} -def export_obj(mesh_list: List["AbstractMesh"], out: BinaryIO, obj_name: Optional[str] = None) -> None: - """ - Export mesh data to Wavefront OBJ format. +# --------------------------------------------------------------------------- +# Private helpers +# --------------------------------------------------------------------------- + - :param mesh_list: List of AbstractMesh objects to export - :param out: Binary output stream - :param obj_name: Optional object name for the OBJ file +def _normalize_to_patches(meshes: Any) -> List[Any]: + """Flatten *meshes* into a list of individual mesh patches. + + Handles: + - :class:`NumpyMultiMesh` → calls ``flat_patches()`` + - Single :class:`NumpyMesh` → ``[mesh]`` + - ``list`` / ``tuple`` → recursive + - :class:`AbstractMesh` → passthrough as ``[mesh]`` + """ + from energyml.utils.data.mesh_numpy import NumpyMesh, NumpyMultiMesh + + if isinstance(meshes, NumpyMultiMesh): + return meshes.flat_patches() + if isinstance(meshes, NumpyMesh): + return [meshes] + if isinstance(meshes, (list, tuple)): + result: List[Any] = [] + for m in meshes: + result.extend(_normalize_to_patches(m)) + return result + # AbstractMesh or unknown — pass through as single element + return [meshes] + + +def _parse_vtk_flat_faces(flat: np.ndarray) -> List[np.ndarray]: + """Decode VTK flat face array ``[nv, v0, …, nv, v0, …]`` into a list of + per-face index arrays.""" + faces: List[np.ndarray] = [] + pos = 0 + flat = np.asarray(flat, dtype=np.int64) + while pos < len(flat): + nv = int(flat[pos]) + pos += 1 + if pos + nv > len(flat): + break + faces.append(flat[pos : pos + nv]) + pos += nv + return faces + + +def _parse_vtk_flat_lines(flat: 
np.ndarray) -> List[np.ndarray]: + """Decode VTK flat lines array ``[n, i0, i1, …, n, i0, …]`` into a list + of per-line index arrays.""" + lines: List[np.ndarray] = [] + pos = 0 + flat = np.asarray(flat, dtype=np.int64) + while pos < len(flat): + n = int(flat[pos]) + pos += 1 + if pos + n > len(flat): + break + lines.append(flat[pos : pos + n]) + pos += n + return lines + + +def _get_export_points( + mesh: Any, + use_crs_displacement: bool, + workspace: Any = None, +) -> np.ndarray: + """Return the point array for *mesh*, optionally applying CRS displacement. + + - For :class:`NumpyMesh`: if ``use_crs_displacement`` is True and a CRS + object is present, returns a *copy* with CRS applied (never mutates the + original ``mesh.points``). + - For :class:`AbstractMesh` (legacy): returns ``mesh.point_list`` as-is; + CRS was already applied by the reader. + """ + from energyml.utils.data.mesh_numpy import NumpyMesh + + if isinstance(mesh, NumpyMesh): + if use_crs_displacement and mesh.crs_object is not None and workspace is not None: + from energyml.utils.data.crs import apply_from_crs_info, extract_crs_info + + crs = mesh.crs_object[0] if isinstance(mesh.crs_object, list) and mesh.crs_object else mesh.crs_object + if crs is not None: + try: + crs_info = extract_crs_info(crs, workspace) + pts = mesh.points.copy() + apply_from_crs_info(pts, crs_info, inplace=True) + return pts + except Exception as exc: # pragma: no cover + log.warning("CRS displacement failed for %s: %s", mesh.source_uuid, exc) + return mesh.points + # AbstractMesh — point_list is a list-of-lists; convert to ndarray for uniform handling + return np.array(getattr(mesh, "point_list", []), dtype=np.float64) + + +def _get_context_color( + source_uuid: Optional[str], + contexts: Optional[Dict[str, Any]], +) -> Optional[tuple]: + """Return an (r, g, b, a) tuple in 0–255 range for *source_uuid*, or None.""" + if not contexts or not source_uuid: + return None + ctx = contexts.get(source_uuid) + if ctx is None: + 
return None + try: + rendering = ctx.get_default_color() + if rendering is not None and rendering.constant_color is not None: + return rendering.constant_color.to_uint8() + except Exception as exc: # pragma: no cover + log.debug("Failed to read color for %s: %s", source_uuid, exc) + return None + + +def _workspace_from_contexts(contexts: Optional[Dict[str, Any]]) -> Any: + """Return the workspace from the first available RepresentationContext.""" + if not contexts: + return None + for ctx in contexts.values(): + ws = getattr(ctx, "workspace", None) + if ws is not None: + return ws + return None + + +def _get_faces_or_cells(mesh: Any) -> np.ndarray: + """Return the face or cell connectivity array for a NumpyMesh. + + Uses ``mesh.faces`` when present and non-empty, then falls back to + ``mesh.cells``. Avoids the numpy-unsafe ``arr or other`` pattern which + raises ``ValueError`` for arrays with more than one element. """ - # Lazy import to avoid circular dependency - from .mesh import PolylineSetMesh + faces = getattr(mesh, "faces", None) + if faces is not None and len(faces) > 0: + return faces + cells = getattr(mesh, "cells", None) + if cells is not None and len(cells) > 0: + return cells + return np.empty(0, dtype=np.int64) + + +# --------------------------------------------------------------------------- +# OBJ export +# --------------------------------------------------------------------------- + + +def export_obj( + mesh_list: Any, + out: BinaryIO, + obj_name: Optional[str] = None, + contexts: Optional[Dict[str, "RepresentationContext"]] = None, + mtl_out: Optional[BinaryIO] = None, + use_crs_displacement: bool = True, +) -> None: + """Export mesh data to Wavefront OBJ format. + + :param mesh_list: One or more meshes (``AbstractMesh``, ``NumpyMesh``, + ``NumpyMultiMesh``, or a list thereof). + :param out: Binary output stream for the ``.obj`` content. + :param obj_name: Optional object name written to the OBJ header. 
+ :param contexts: Optional dict of :class:`RepresentationContext` keyed by + ``source_uuid``; used to emit companion ``.mtl`` material colours when + *mtl_out* is also provided. + :param mtl_out: Optional binary stream for the companion ``.mtl`` file. + Colour requires *contexts* to be supplied. + :param use_crs_displacement: When True (default), CRS origin offset and + axis transforms are applied to ``NumpyMesh`` points at export time. + """ + from energyml.utils.data.mesh import PolylineSetMesh + from energyml.utils.data.mesh_numpy import NumpyMesh, NumpyPointSetMesh, NumpyPolylineMesh - # Write header - out.write(b"# Generated by energyml-utils a Geosiris python module\n\n") + patches = _normalize_to_patches(mesh_list) + workspace = _workspace_from_contexts(contexts) - # Write object name if provided + out.write(b"# Generated by energyml-utils (Geosiris)\n\n") if obj_name is not None: - out.write(f"o {obj_name}\n\n".encode("utf-8")) + out.write(f"o {obj_name}\n\n".encode()) - point_offset = 0 + mtl_lib_name = obj_name or "materials" + if mtl_out is not None: + out.write(f"mtllib {mtl_lib_name}.mtl\n\n".encode()) + mtl_out.write(b"# MTL generated by energyml-utils\n\n") - for mesh in mesh_list: - # Write group name using mesh identifier or uuid - mesh_id = getattr(mesh, "identifier", None) or getattr(mesh, "uuid", "mesh") - out.write(f"g {mesh_id}\n\n".encode("utf-8")) + point_offset = 0 - # Write vertices - for point in mesh.point_list: - if len(point) > 0: - out.write(f"v {' '.join(map(str, point))}\n".encode("utf-8")) + for mesh in patches: + pts = _get_export_points(mesh, use_crs_displacement, workspace) + patch_label = getattr(mesh, "patch_label", None) or getattr(mesh, "identifier", None) or "mesh" + source_uuid = getattr(mesh, "source_uuid", None) or getattr(mesh, "uuid", None) + patch_idx = getattr(mesh, "patch_index", None) + group_name = f"{source_uuid}_{patch_idx}" if source_uuid and patch_idx is not None else patch_label + + out.write(f"g 
{group_name}\n\n".encode()) + + # emit material reference when mtl output is available + if mtl_out is not None: + mat_name = f"mat_{group_name}" + color = _get_context_color(source_uuid, contexts) + if color is None: + color = (200, 200, 200, 255) + r, g, b, _a = color + out.write(f"usemtl {mat_name}\n".encode()) + mtl_out.write(f"newmtl {mat_name}\n".encode()) + mtl_out.write(f"Kd {r/255:.6f} {g/255:.6f} {b/255:.6f}\n\n".encode()) + + # write vertices + for pt in pts: + out.write(f"v {pt[0]} {pt[1]} {pt[2]}\n".encode()) + + # write connectivity + if isinstance(mesh, NumpyMesh): + if isinstance(mesh, NumpyPointSetMesh): + # bare vertex elements + for i in range(len(pts)): + out.write(f"p {i + point_offset + 1}\n".encode()) + elif isinstance(mesh, NumpyPolylineMesh): + for seg in _parse_vtk_flat_lines(mesh.lines): + if len(seg) > 1: + idx_str = " ".join(str(i + point_offset + 1) for i in seg) + out.write(f"l {idx_str}\n".encode()) + else: + # NumpySurfaceMesh (or NumpyVolumeMesh — export as faces) + faces_arr = _get_faces_or_cells(mesh) + for face in _parse_vtk_flat_faces(faces_arr): + if len(face) >= 3: + idx_str = " ".join(str(i + point_offset + 1) for i in face) + out.write(f"f {idx_str}\n".encode()) + else: + # AbstractMesh legacy path + indices = mesh.get_indices() + elt = "l" if isinstance(mesh, PolylineSetMesh) else "f" + for elem in indices: + if len(elem) > 1: + idx_str = " ".join(str(i + point_offset + 1) for i in elem) + out.write(f"{elt} {idx_str}\n".encode()) - # Write faces or lines depending on mesh type - indices = mesh.get_indices() - elt_letter = "l" if isinstance(mesh, PolylineSetMesh) else "f" + out.write(b"\n") + point_offset += len(pts) - for face_or_line in indices: - if len(face_or_line) > 1: - # OBJ indices are 1-based - indices_str = " ".join(str(idx + point_offset + 1) for idx in face_or_line) - out.write(f"{elt_letter} {indices_str}\n".encode("utf-8")) - point_offset += len(mesh.point_list) +# 
--------------------------------------------------------------------------- +# GeoJSON export +# --------------------------------------------------------------------------- def export_geojson( - mesh_list: List["AbstractMesh"], out: TextIO, options: Optional[GeoJSONExportOptions] = None + mesh_list: Any, + out: TextIO, + options: Optional[GeoJSONExportOptions] = None, + contexts: Optional[Dict[str, "RepresentationContext"]] = None, + use_crs_displacement: bool = True, ) -> None: - """ - Export mesh data to GeoJSON format. + """Export mesh data to GeoJSON FeatureCollection. - :param mesh_list: List of AbstractMesh objects to export - :param out: Text output stream - :param options: GeoJSON export options + :param mesh_list: One or more meshes. + :param out: Text output stream. + :param options: GeoJSON export options. + :param contexts: Optional colour / metadata context dict. + :param use_crs_displacement: Apply CRS displacement to ``NumpyMesh`` points. """ - # Lazy import to avoid circular dependency - from .mesh import PolylineSetMesh, SurfaceMesh + from energyml.utils.data.mesh import PolylineSetMesh, SurfaceMesh + from energyml.utils.data.mesh_numpy import NumpyMesh, NumpyPointSetMesh, NumpyPolylineMesh if options is None: options = GeoJSONExportOptions() - features = [] - - for mesh_idx, mesh in enumerate(mesh_list): - indices = mesh.get_indices() - - if isinstance(mesh, PolylineSetMesh): - # Export as LineString features - for line_idx, line_indices in enumerate(indices): - if len(line_indices) < 2: - continue - coordinates = [list(mesh.point_list[idx]) for idx in line_indices] - feature = { - "type": "Feature", - "geometry": {"type": "LineString", "coordinates": coordinates}, - "properties": {"mesh_index": mesh_idx, "line_index": line_idx, **options.properties}, - } - features.append(feature) - - elif isinstance(mesh, SurfaceMesh): - # Export as Polygon features - for face_idx, face_indices in enumerate(indices): - if len(face_indices) < 3: - continue - # 
GeoJSON Polygon requires closed ring (first point == last point) - coordinates = [list(mesh.point_list[idx]) for idx in face_indices] - coordinates.append(coordinates[0]) # Close the ring - - feature = { - "type": "Feature", - "geometry": {"type": "Polygon", "coordinates": [coordinates]}, - "properties": {"mesh_index": mesh_idx, "face_index": face_idx, **options.properties}, - } - features.append(feature) - - geojson = {"type": "FeatureCollection", "features": features} - - json.dump(geojson, out, indent=options.indent) - - -def export_vtk(mesh_list: List["AbstractMesh"], out: BinaryIO, options: Optional[VTKExportOptions] = None) -> None: + patches = _normalize_to_patches(mesh_list) + workspace = _workspace_from_contexts(contexts) + features: List[dict] = [] + + for mesh in patches: + pts = _get_export_points(mesh, use_crs_displacement, workspace) + source_uuid = getattr(mesh, "source_uuid", None) + patch_idx = getattr(mesh, "patch_index", None) + color = _get_context_color(source_uuid, contexts) + base_props: dict = { + **options.properties, + "source_uuid": source_uuid, + "patch_index": patch_idx, + } + if color: + r, g, b, a = color + base_props["color"] = f"#{r:02x}{g:02x}{b:02x}" + base_props["opacity"] = round(a / 255.0, 4) + + if isinstance(mesh, NumpyMesh): + if isinstance(mesh, NumpyPointSetMesh): + coords = pts.tolist() + features.append( + { + "type": "Feature", + "geometry": {"type": "MultiPoint", "coordinates": coords}, + "properties": base_props, + } + ) + elif isinstance(mesh, NumpyPolylineMesh): + for seg in _parse_vtk_flat_lines(mesh.lines): + if len(seg) < 2: + continue + coords = pts[seg].tolist() + features.append( + { + "type": "Feature", + "geometry": {"type": "LineString", "coordinates": coords}, + "properties": base_props, + } + ) + else: + # NumpySurfaceMesh / NumpyVolumeMesh + for face in _parse_vtk_flat_faces(_get_faces_or_cells(mesh)): + if len(face) < 3: + continue + coords = pts[face].tolist() + coords.append(coords[0]) # close ring + 
features.append( + { + "type": "Feature", + "geometry": {"type": "Polygon", "coordinates": [coords]}, + "properties": base_props, + } + ) + else: + # AbstractMesh legacy path + indices = mesh.get_indices() + for elem_idx, elem in enumerate(indices): + if isinstance(mesh, PolylineSetMesh): + if len(elem) < 2: + continue + coords = [list(pts[i]) for i in elem] + features.append( + { + "type": "Feature", + "geometry": {"type": "LineString", "coordinates": coords}, + "properties": {**base_props, "element_index": elem_idx}, + } + ) + elif isinstance(mesh, SurfaceMesh): + if len(elem) < 3: + continue + coords = [list(pts[i]) for i in elem] + coords.append(coords[0]) + features.append( + { + "type": "Feature", + "geometry": {"type": "Polygon", "coordinates": [coords]}, + "properties": {**base_props, "element_index": elem_idx}, + } + ) + + json.dump({"type": "FeatureCollection", "features": features}, out, indent=options.indent) + + +# --------------------------------------------------------------------------- +# VTK export — private helpers +# --------------------------------------------------------------------------- + + +def _b64_vtk(arr: np.ndarray) -> str: + """Base64-encode a numpy array for VTK XML inline binary format. + + VTK prepends a 4-byte uint32 header with the byte count of the payload. """ - Export mesh data to VTK legacy format. - - :param mesh_list: List of AbstractMesh objects to export - :param out: Binary output stream - :param options: VTK export options + raw = arr.tobytes() + header = struct.pack(" str: + """Return a VTK XML ```` element string (base64 inline).""" + return ( + f'' + f"{_b64_vtk(arr)}" + f"" + ) + + +def _collect_vtk_geometry( + patches: List[Any], + use_crs_displacement: bool, + workspace: Any, +) -> tuple: + """Merge all patches into flat VTK geometry arrays. 
+ + Returns: + (all_pts, poly_conn, poly_off, line_conn, line_off, + vert_conn, vert_off, cell_types, patch_meta) + + *patch_meta* is a list of ``(source_uuid, n_cells)`` tuples used to + assign per-cell colour data. """ - # Lazy import to avoid circular dependency - from .mesh import PolylineSetMesh, SurfaceMesh + from energyml.utils.data.mesh import PolylineSetMesh, SurfaceMesh + from energyml.utils.data.mesh_numpy import NumpyMesh, NumpyPointSetMesh, NumpyPolylineMesh + + all_pts: List[np.ndarray] = [] + poly_conn: List[int] = [] + poly_off: List[int] = [] + line_conn: List[int] = [] + line_off: List[int] = [] + vert_conn: List[int] = [] + vert_off: List[int] = [] + cell_types: List[int] = [] + patch_meta: List[tuple] = [] # (source_uuid, cell_count) + + pt_offset = 0 + + for mesh in patches: + pts = _get_export_points(mesh, use_crs_displacement, workspace) + all_pts.append(np.asarray(pts, dtype=np.float64).reshape(-1, 3)) + source_uuid = getattr(mesh, "source_uuid", None) + cell_count = 0 + + if isinstance(mesh, NumpyMesh): + if isinstance(mesh, NumpyPointSetMesh): + for i in range(len(pts)): + vert_conn.append(i + pt_offset) + vert_off.append(len(vert_conn)) + cell_types.append(_VTK_VERTEX) + cell_count += 1 + elif isinstance(mesh, NumpyPolylineMesh): + for seg in _parse_vtk_flat_lines(mesh.lines): + for vi in seg: + line_conn.append(int(vi) + pt_offset) + line_off.append(len(line_conn)) + cell_types.append(_VTK_POLY_LINE) + cell_count += 1 + else: + faces_arr = _get_faces_or_cells(mesh) + for face in _parse_vtk_flat_faces(faces_arr): + nv = len(face) + for vi in face: + poly_conn.append(int(vi) + pt_offset) + poly_off.append(len(poly_conn)) + cell_types.append(_VTK_TRIANGLE if nv == 3 else _VTK_POLYGON) + cell_count += 1 + else: + # AbstractMesh legacy + indices = mesh.get_indices() + if isinstance(mesh, PolylineSetMesh): + for line in indices: + for vi in line: + line_conn.append(int(vi) + pt_offset) + line_off.append(len(line_conn)) + 
cell_types.append(_VTK_POLY_LINE) + cell_count += 1 + else: + for face in indices: + nv = len(face) + for vi in face: + poly_conn.append(int(vi) + pt_offset) + poly_off.append(len(poly_conn)) + cell_types.append(_VTK_TRIANGLE if nv == 3 else _VTK_POLYGON) + cell_count += 1 + + pt_offset += len(pts) + patch_meta.append((source_uuid, cell_count)) + + merged_pts = np.concatenate(all_pts) if all_pts else np.empty((0, 3), dtype=np.float64) + return ( + merged_pts, + np.array(poly_conn, dtype=np.int64), + np.array(poly_off, dtype=np.int64), + np.array(line_conn, dtype=np.int64), + np.array(line_off, dtype=np.int64), + np.array(vert_conn, dtype=np.int64), + np.array(vert_off, dtype=np.int64), + np.array(cell_types, dtype=np.uint8), + patch_meta, + ) + + +def _build_color_scalars( + patch_meta: List[tuple], + contexts: Optional[Dict[str, Any]], + total_cells: int, +) -> Optional[np.ndarray]: + """Build a ``(total_cells, 4)`` float32 RGBA array, or None when no colors found.""" + if not contexts: + return None + colors = np.full((total_cells, 4), 0.8, dtype=np.float32) + colors[:, 3] = 1.0 + any_found = False + cell_idx = 0 + for source_uuid, n_cells in patch_meta: + rgba = _get_context_color(source_uuid, contexts) + if rgba is not None: + any_found = True + r, g, b, a = rgba + colors[cell_idx : cell_idx + n_cells, 0] = r / 255.0 + colors[cell_idx : cell_idx + n_cells, 1] = g / 255.0 + colors[cell_idx : cell_idx + n_cells, 2] = b / 255.0 + colors[cell_idx : cell_idx + n_cells, 3] = a / 255.0 + cell_idx += n_cells + return colors if any_found else None + + +# --------------------------------------------------------------------------- +# VTK export — legacy (ASCII / binary) +# --------------------------------------------------------------------------- + + +def _export_vtk_legacy( + patches: List[Any], + out: BinaryIO, + options: VTKExportOptions, + contexts: Optional[Dict[str, Any]], + workspace: Any, +) -> None: + ascii_mode = options.vtk_format == VTKFormat.LEGACY_ASCII + ( 
+ all_pts, + poly_conn, + poly_off, + line_conn, + line_off, + vert_conn, + vert_off, + cell_types, + patch_meta, + ) = _collect_vtk_geometry(patches, True, workspace) + + n_pts = len(all_pts) + n_poly = len(poly_off) + n_line = len(line_off) + n_vert = len(vert_off) + + def _unflatten(conn: np.ndarray, offs: np.ndarray) -> List[List[int]]: + result = [] + prev = 0 + for o in offs: + result.append(conn[prev:o].tolist()) + prev = o + return result + + polygons = _unflatten(poly_conn, poly_off) + lines = _unflatten(line_conn, line_off) + verts = _unflatten(vert_conn, vert_off) + + out.write(b"# vtk DataFile Version 3.0\n") + out.write(f"{options.dataset_name}\n".encode()) + out.write(b"ASCII\n" if ascii_mode else b"BINARY\n") + out.write(b"DATASET POLYDATA\n") + if ascii_mode: + out.write(f"POINTS {n_pts} float\n".encode()) + for pt in all_pts: + out.write(f"{pt[0]} {pt[1]} {pt[2]}\n".encode()) + else: + out.write(f"POINTS {n_pts} float\n".encode()) + out.write(all_pts.astype(">f4").tobytes()) + out.write(b"\n") + + def _write_section(name: str, cells: List[List[int]]) -> None: + if not cells: + return + total = sum(len(c) + 1 for c in cells) + out.write(f"{name} {len(cells)} {total}\n".encode()) + if ascii_mode: + for c in cells: + out.write(f"{len(c)} {' '.join(str(i) for i in c)}\n".encode()) + else: + for c in cells: + row = np.array([len(c)] + c, dtype=np.int32).byteswap().astype(">i4") + out.write(row.tobytes()) + out.write(b"\n") + + _write_section("POLYGONS", polygons) + _write_section("LINES", lines) + _write_section("VERTICES", verts) + + total_cells = n_poly + n_line + n_vert + if total_cells > 0 and contexts: + colors = _build_color_scalars(patch_meta, contexts, total_cells) + if colors is not None: + out.write(f"CELL_DATA {total_cells}\n".encode()) + out.write(b"COLOR_SCALARS patch_color 4\n") + if ascii_mode: + for row in colors: + out.write(f"{row[0]:.6f} {row[1]:.6f} {row[2]:.6f} {row[3]:.6f}\n".encode()) + else: + 
out.write(colors.astype(">f4").tobytes()) + out.write(b"\n") + + +# --------------------------------------------------------------------------- +# VTK export — XML VTU +# --------------------------------------------------------------------------- + + +def _export_vtk_vtu( + patches: List[Any], + out: BinaryIO, + options: VTKExportOptions, + contexts: Optional[Dict[str, Any]], + workspace: Any, +) -> None: + """Write VTK XML UnstructuredGrid (.vtu).""" + ( + all_pts, + poly_conn, + poly_off, + line_conn, + line_off, + vert_conn, + vert_off, + cell_types, + patch_meta, + ) = _collect_vtk_geometry(patches, True, workspace) + + # Build a single merged connectivity / offsets / types for UnstructuredGrid. + conn_parts: List[np.ndarray] = [] + off_parts: List[int] = [] + types_list: List[int] = [] + running = 0 + + def _add_vtu_section(conn: np.ndarray, offs: np.ndarray, default_type: int) -> None: + nonlocal running + prev = 0 + for o in offs: + seg = conn[prev:o] + conn_parts.append(seg) + running += len(seg) + off_parts.append(running) + types_list.append(default_type) + prev = o + + _add_vtu_section(vert_conn, vert_off, _VTK_VERTEX) + _add_vtu_section(line_conn, line_off, _VTK_POLY_LINE) + + # Polygons: honour per-cell type from cell_types array (triangle vs polygon). 
+ n_verts_cells = len(vert_off) + n_lines_cells = len(line_off) + prev = 0 + for poly_i, o in enumerate(poly_off): + seg = poly_conn[prev:o] + conn_parts.append(seg) + running += len(seg) + off_parts.append(running) + abs_idx = n_verts_cells + n_lines_cells + poly_i + types_list.append(int(cell_types[abs_idx]) if abs_idx < len(cell_types) else _VTK_POLYGON) + prev = o + + all_conn = ( + np.concatenate([np.asarray(p, dtype=np.int64) for p in conn_parts]) + if conn_parts + else np.empty(0, dtype=np.int64) + ) + all_off = np.array(off_parts, dtype=np.int64) + all_types = np.array(types_list, dtype=np.uint8) + n_cells = len(all_types) + n_pts = len(all_pts) + + xml_lines: List[str] = [ + '', + '', + " ", + f' ', + " ", + " " + _vtk_xml_data_array("Points", all_pts.astype(np.float32).ravel(), 3, "Float32"), + " ", + " ", + " " + _vtk_xml_data_array("connectivity", all_conn, 1, "Int64"), + " " + _vtk_xml_data_array("offsets", all_off, 1, "Int64"), + " " + _vtk_xml_data_array("types", all_types, 1, "UInt8"), + " ", + ] + + if contexts and n_cells > 0: + colors = _build_color_scalars(patch_meta, contexts, n_cells) + if colors is not None: + xml_lines.append(" ") + xml_lines.append(" " + _vtk_xml_data_array("patch_color", colors.ravel(), 4, "Float32")) + xml_lines.append(" ") + + xml_lines += [" ", " ", ""] + out.write("\n".join(xml_lines).encode("utf-8")) + + +# --------------------------------------------------------------------------- +# VTK export — XML VTP +# --------------------------------------------------------------------------- + + +def _export_vtk_vtp( + patches: List[Any], + out: BinaryIO, + options: VTKExportOptions, + contexts: Optional[Dict[str, Any]], + workspace: Any, +) -> None: + """Write VTK XML PolyData (.vtp).""" + ( + all_pts, + poly_conn, + poly_off, + line_conn, + line_off, + vert_conn, + vert_off, + cell_types, + patch_meta, + ) = _collect_vtk_geometry(patches, True, workspace) + + n_pts = len(all_pts) + n_polys = len(poly_off) + n_lines = 
len(line_off) + n_verts = len(vert_off) + total_cells = n_polys + n_lines + n_verts + + xml_lines: List[str] = [ + '', + '', + " ", + ( + f' ' + ), + " ", + " " + _vtk_xml_data_array("Points", all_pts.astype(np.float32).ravel(), 3, "Float32"), + " ", + ] + + def _topo_section(tag: str, conn: np.ndarray, offs: np.ndarray) -> List[str]: + return [ + f" <{tag}>", + " " + _vtk_xml_data_array("connectivity", conn, 1, "Int64"), + " " + _vtk_xml_data_array("offsets", offs, 1, "Int64"), + f" ", + ] + + if n_polys: + xml_lines.extend(_topo_section("Polys", poly_conn, poly_off)) + if n_lines: + xml_lines.extend(_topo_section("Lines", line_conn, line_off)) + if n_verts: + xml_lines.extend(_topo_section("Verts", vert_conn, vert_off)) + + if contexts and total_cells > 0: + colors = _build_color_scalars(patch_meta, contexts, total_cells) + if colors is not None: + xml_lines.append(" ") + xml_lines.append(" " + _vtk_xml_data_array("patch_color", colors.ravel(), 4, "Float32")) + xml_lines.append(" ") + + xml_lines += [" ", " ", ""] + out.write("\n".join(xml_lines).encode("utf-8")) + + +# --------------------------------------------------------------------------- +# VTK export — public entry point +# --------------------------------------------------------------------------- + + +def export_vtk( + mesh_list: Any, + out: BinaryIO, + options: Optional[VTKExportOptions] = None, + contexts: Optional[Dict[str, "RepresentationContext"]] = None, + use_crs_displacement: bool = True, +) -> None: + """Export mesh data to a VTK format. + + The sub-format is controlled by ``options.vtk_format`` (default: + ``VTKFormat.LEGACY_ASCII``). Supported variants: + + * **LEGACY_ASCII** — VTK 3.0 POLYDATA, ASCII encoding + * **LEGACY_BINARY** — VTK 3.0 POLYDATA, big-endian binary encoding + * **VTU** — VTK XML UnstructuredGrid (``.vtu``), base64 inline binary + * **VTP** — VTK XML PolyData (``.vtp``), base64 inline binary + + :param mesh_list: Meshes to export. + :param out: Binary output stream. 
+ :param options: VTK export options. + :param contexts: Optional colour context dict keyed by ``source_uuid``. + :param use_crs_displacement: Apply CRS displacement to ``NumpyMesh`` points. + """ if options is None: options = VTKExportOptions() - # Combine all meshes - all_points = [] - all_polygons = [] - all_lines = [] - vertex_offset = 0 - - for mesh in mesh_list: - all_points.extend(mesh.point_list) - indices = mesh.get_indices() - - if isinstance(mesh, SurfaceMesh): - # Adjust face indices - for face in indices: - adjusted_face = [idx + vertex_offset for idx in face] - all_polygons.append(adjusted_face) - elif isinstance(mesh, PolylineSetMesh): - # Adjust line indices - for line in indices: - adjusted_line = [idx + vertex_offset for idx in line] - all_lines.append(adjusted_line) - - vertex_offset += len(mesh.point_list) - - # Write VTK header - out.write(b"# vtk DataFile Version 3.0\n") - out.write(f"{options.dataset_name}\n".encode("utf-8")) - out.write(b"ASCII\n") - out.write(b"DATASET POLYDATA\n") + patches = _normalize_to_patches(mesh_list) + # Pass workspace only when CRS displacement is actually requested. 
+ workspace = _workspace_from_contexts(contexts) if use_crs_displacement else None - # Write points - out.write(f"POINTS {len(all_points)} float\n".encode("utf-8")) - for point in all_points: - out.write(f"{point[0]} {point[1]} {point[2]}\n".encode("utf-8")) + fmt = options.vtk_format + if fmt in (VTKFormat.LEGACY_ASCII, VTKFormat.LEGACY_BINARY): + _export_vtk_legacy(patches, out, options, contexts, workspace) + elif fmt == VTKFormat.VTU: + _export_vtk_vtu(patches, out, options, contexts, workspace) + elif fmt == VTKFormat.VTP: + _export_vtk_vtp(patches, out, options, contexts, workspace) + else: # pragma: no cover + raise ValueError(f"Unknown VTKFormat: {fmt}") - # Write polygons - if all_polygons: - total_poly_size = sum(len(poly) + 1 for poly in all_polygons) - out.write(f"POLYGONS {len(all_polygons)} {total_poly_size}\n".encode("utf-8")) - for poly in all_polygons: - out.write(f"{len(poly)} {' '.join(str(idx) for idx in poly)}\n".encode("utf-8")) - # Write lines - if all_lines: - total_line_size = sum(len(line) + 1 for line in all_lines) - out.write(f"LINES {len(all_lines)} {total_line_size}\n".encode("utf-8")) - for line in all_lines: - out.write(f"{len(line)} {' '.join(str(idx) for idx in line)}\n".encode("utf-8")) +# --------------------------------------------------------------------------- +# STL export +# --------------------------------------------------------------------------- -def export_stl(mesh_list: List["AbstractMesh"], out: BinaryIO, options: Optional[STLExportOptions] = None) -> None: - """ - Export mesh data to STL format (binary or ASCII). +def export_stl( + mesh_list: Any, + out: BinaryIO, + options: Optional[STLExportOptions] = None, + use_crs_displacement: bool = True, +) -> None: + """Export triangulated mesh data to STL format (binary or ASCII). - Note: STL format only supports triangles. Only triangular faces will be exported. + Non-triangular polygons are fan-triangulated (vertex 0 + consecutive pairs). 
+ Polylines and point sets are silently skipped. - :param mesh_list: List of AbstractMesh objects to export - :param out: Binary output stream - :param options: STL export options + :param mesh_list: Meshes to export. + :param out: Binary output stream. + :param options: STL export options. + :param use_crs_displacement: Apply CRS displacement to ``NumpyMesh`` points. """ - # Lazy import to avoid circular dependency - from .mesh import SurfaceMesh + from energyml.utils.data.mesh import SurfaceMesh + from energyml.utils.data.mesh_numpy import NumpyMesh, NumpyPolylineMesh, NumpyPointSetMesh if options is None: options = STLExportOptions(binary=True) - # Collect all triangles (only from SurfaceMesh with triangular faces) - all_triangles = [] - for mesh in mesh_list: - if isinstance(mesh, SurfaceMesh): - indices = mesh.get_indices() - for face in indices: - # Only export triangular faces - if len(face) == 3: - p0 = np.array(mesh.point_list[face[0]]) - p1 = np.array(mesh.point_list[face[1]]) - p2 = np.array(mesh.point_list[face[2]]) - all_triangles.append((p0, p1, p2)) + patches = _normalize_to_patches(mesh_list) + # STL carries no colour / context; workspace not needed unless CRS is requested. + workspace = None # CRS requires a workspace — callers may read with CRS pre-applied. 
+ + all_triangles: List[tuple] = [] + + for mesh in patches: + if isinstance(mesh, (NumpyPolylineMesh, NumpyPointSetMesh)): + continue # STL is surface-only + pts = _get_export_points(mesh, use_crs_displacement, workspace) + pts_np = np.asarray(pts, dtype=np.float64).reshape(-1, 3) + + if isinstance(mesh, NumpyMesh): + face_list = _parse_vtk_flat_faces(_get_faces_or_cells(mesh)) + else: + if not isinstance(mesh, SurfaceMesh): + continue + face_list = mesh.get_indices() + + for face in face_list: + face = list(face) + if len(face) < 3: + continue + if len(face) == 3: + all_triangles.append((pts_np[face[0]], pts_np[face[1]], pts_np[face[2]])) + else: + # Fan triangulation for quads and polygons + for j in range(1, len(face) - 1): + all_triangles.append((pts_np[face[0]], pts_np[face[j]], pts_np[face[j + 1]])) if options.binary: _export_stl_binary(all_triangles, out) @@ -284,206 +1018,171 @@ def export_stl(mesh_list: List["AbstractMesh"], out: BinaryIO, options: Optional _export_stl_ascii(all_triangles, out, options.ascii_precision) +def _compute_normal(p0: np.ndarray, p1: np.ndarray, p2: np.ndarray) -> np.ndarray: + v1, v2 = p1 - p0, p2 - p0 + n = np.cross(v1, v2) + norm = np.linalg.norm(n) + return n / norm if norm > 0 else np.zeros(3) + + def _export_stl_binary(triangles: List[tuple], out: BinaryIO) -> None: - """Export STL in binary format.""" - # Write 80-byte header header = b"Binary STL file generated by energyml-utils" + b"\0" * (80 - 44) out.write(header) - - # Write number of triangles out.write(struct.pack(" 0: - normal = normal / norm - else: - normal = np.array([0.0, 0.0, 0.0]) - - # Write normal - out.write(struct.pack(" None: - """Export STL in ASCII format.""" out.write(b"solid mesh\n") - for p0, p1, p2 in triangles: - # Calculate normal vector - v1 = p1 - p0 - v2 = p2 - p0 - normal = np.cross(v1, v2) - norm = np.linalg.norm(normal) - if norm > 0: - normal = normal / norm - else: - normal = np.array([0.0, 0.0, 0.0]) - - # Write facet - line = f" facet 
normal {normal[0]:.{precision}e} {normal[1]:.{precision}e} {normal[2]:.{precision}e}\n" - out.write(line.encode("utf-8")) + normal = _compute_normal(p0, p1, p2) + out.write( + f" facet normal {normal[0]:.{precision}e} {normal[1]:.{precision}e} {normal[2]:.{precision}e}\n".encode() + ) out.write(b" outer loop\n") + for pt in (p0, p1, p2): + out.write(f" vertex {pt[0]:.{precision}e} {pt[1]:.{precision}e} {pt[2]:.{precision}e}\n".encode()) + out.write(b" endloop\n endfacet\n") + out.write(b"endsolid mesh\n") - for point in [p0, p1, p2]: - line = f" vertex {point[0]:.{precision}e} {point[1]:.{precision}e} {point[2]:.{precision}e}\n" - out.write(line.encode("utf-8")) - - out.write(b" endloop\n") - out.write(b" endfacet\n") - out.write(b"endsolid mesh\n") +# --------------------------------------------------------------------------- +# High-level dispatcher +# --------------------------------------------------------------------------- def export_mesh( - mesh_list: List["AbstractMesh"], + mesh_list: Any, output_path: Union[str, Path], format: Optional[ExportFormat] = None, options: Optional[ExportOptions] = None, + contexts: Optional[Dict[str, "RepresentationContext"]] = None, + use_crs_displacement: bool = True, ) -> None: - """ - Export mesh data to a file in the specified format. - - :param mesh_list: List of Mesh objects to export - :param output_path: Output file path - :param format: Export format (auto-detected from extension if None) - :param options: Format-specific export options + """Export mesh data to a file. + + Format is auto-detected from the file extension when *format* is None. + Supported extensions: ``.obj``, ``.geojson``, ``.vtk``, ``.vtu``, + ``.vtp``, ``.stl``. + + :param mesh_list: Meshes to export. + :param output_path: Destination file path. + :param format: Explicit format; auto-detected from extension when None. + :param options: Format-specific options. + :param contexts: Color / metadata context dict. 
+ :param use_crs_displacement: Apply CRS displacement to ``NumpyMesh`` points. """ path = Path(output_path) - - # Auto-detect format from extension if not specified if format is None: format = ExportFormat.from_extension(path.suffix) - # Determine if file should be opened in binary or text mode - binary_formats = {ExportFormat.OBJ, ExportFormat.STL, ExportFormat.VTK} - text_formats = {ExportFormat.GEOJSON} - - if format in binary_formats: - with path.open("wb") as f: - if format == ExportFormat.OBJ: - export_obj(mesh_list, f) - elif format == ExportFormat.STL: - export_stl(mesh_list, f, options) - elif format == ExportFormat.VTK: - export_vtk(mesh_list, f, options) - elif format in text_formats: + if format == ExportFormat.GEOJSON: with path.open("w", encoding="utf-8") as f: - if format == ExportFormat.GEOJSON: - export_geojson(mesh_list, f, options) - else: - raise ValueError(f"Unsupported format: {format}") + export_geojson(mesh_list, f, options, contexts, use_crs_displacement) + return + + # All remaining formats use binary streams + with path.open("wb") as f: + if format == ExportFormat.OBJ: + if contexts: + mtl_path = path.with_suffix(".mtl") + with mtl_path.open("wb") as mf: + export_obj(mesh_list, f, path.stem, contexts, mf, use_crs_displacement) + else: + export_obj(mesh_list, f, path.stem, None, None, use_crs_displacement) + elif format == ExportFormat.STL: + export_stl(mesh_list, f, options, use_crs_displacement) + elif format == ExportFormat.VTK: + export_vtk(mesh_list, f, options, contexts, use_crs_displacement) + elif format == ExportFormat.VTU: + vtk_opts = options if isinstance(options, VTKExportOptions) else VTKExportOptions() + vtk_opts.vtk_format = VTKFormat.VTU + export_vtk(mesh_list, f, vtk_opts, contexts, use_crs_displacement) + elif format == ExportFormat.VTP: + vtk_opts = options if isinstance(options, VTKExportOptions) else VTKExportOptions() + vtk_opts.vtk_format = VTKFormat.VTP + export_vtk(mesh_list, f, vtk_opts, contexts, 
use_crs_displacement) + else: + raise ValueError(f"Unsupported format: {format}") +# --------------------------------------------------------------------------- # UI Helper Functions +# --------------------------------------------------------------------------- def supported_formats() -> List[str]: - """ - Get list of supported export formats. - - :return: List of format names (e.g., ['obj', 'geojson', 'vtk', 'stl']) - """ + """Return all supported export format extensions.""" return ExportFormat.all_extensions() def format_description(format: Union[str, ExportFormat]) -> str: - """ - Get human-readable description of a format. - - :param format: Format name or ExportFormat enum - :return: Description string - """ + """Return a human-readable description of *format*.""" if isinstance(format, str): format = ExportFormat.from_extension(format) - descriptions = { - ExportFormat.OBJ: "Wavefront OBJ - 3D geometry format (triangles and lines)", - ExportFormat.GEOJSON: "GeoJSON - Geographic data format (lines and polygons)", - ExportFormat.VTK: "VTK Legacy - Visualization Toolkit format", - ExportFormat.STL: "STL - Stereolithography format (triangles only)", + ExportFormat.OBJ: "Wavefront OBJ — 3D geometry with optional .mtl colour", + ExportFormat.GEOJSON: "GeoJSON — geographic data (lines, polygons, point clouds)", + ExportFormat.VTK: "VTK Legacy (ASCII or binary) — POLYDATA format", + ExportFormat.VTU: "VTK XML UnstructuredGrid (.vtu) — volumes + mixed topologies", + ExportFormat.VTP: "VTK XML PolyData (.vtp) — surfaces and polylines", + ExportFormat.STL: "STL — stereolithography (triangles only)", } return descriptions.get(format, "Unknown format") def format_filter_string(format: Union[str, ExportFormat]) -> str: - """ - Get file filter string for UI dialogs (Qt, tkinter, etc.). - - :param format: Format name or ExportFormat enum - :return: Filter string (e.g., "OBJ Files (*.obj)") - """ + """Return a file-dialog filter string (e.g. 
``"VTU Files (*.vtu)"``).""" if isinstance(format, str): format = ExportFormat.from_extension(format) - filters = { ExportFormat.OBJ: "OBJ Files (*.obj)", ExportFormat.GEOJSON: "GeoJSON Files (*.geojson)", ExportFormat.VTK: "VTK Files (*.vtk)", + ExportFormat.VTU: "VTK XML UnstructuredGrid Files (*.vtu)", + ExportFormat.VTP: "VTK XML PolyData Files (*.vtp)", ExportFormat.STL: "STL Files (*.stl)", } return filters.get(format, "All Files (*.*)") def all_formats_filter_string() -> str: - """ - Get file filter string for all supported formats. - Useful for Qt QFileDialog or similar UI components. - - :return: Filter string with all formats - """ - filters = [format_filter_string(fmt) for fmt in ExportFormat] - return ";;".join(filters) + """Return a ``;;``-joined filter string for all supported formats.""" + return ";;".join(format_filter_string(fmt) for fmt in ExportFormat) def get_format_options_class(format: Union[str, ExportFormat]) -> Optional[type]: - """ - Get the options class for a specific format. - - :param format: Format name or ExportFormat enum - :return: Options class or None if no options available - """ + """Return the options class for *format*, or None.""" if isinstance(format, str): format = ExportFormat.from_extension(format) - - options_map = { + return { ExportFormat.STL: STLExportOptions, ExportFormat.VTK: VTKExportOptions, + ExportFormat.VTU: VTKExportOptions, + ExportFormat.VTP: VTKExportOptions, ExportFormat.GEOJSON: GeoJSONExportOptions, - } - return options_map.get(format) + }.get(format) def supports_lines(format: Union[str, ExportFormat]) -> bool: - """ - Check if format supports line primitives. 
- - :param format: Format name or ExportFormat enum - :return: True if format supports lines - """ + """Return True when *format* can represent polyline primitives.""" if isinstance(format, str): format = ExportFormat.from_extension(format) - - return format in {ExportFormat.OBJ, ExportFormat.GEOJSON, ExportFormat.VTK} + return format in {ExportFormat.OBJ, ExportFormat.GEOJSON, ExportFormat.VTK, ExportFormat.VTU, ExportFormat.VTP} def supports_triangles(format: Union[str, ExportFormat]) -> bool: - """ - Check if format supports triangle primitives. + """Return True when *format* can represent triangle / polygon primitives.""" + return True # All formats support triangles - :param format: Format name or ExportFormat enum - :return: True if format supports triangles - """ - # All formats support triangles - return True + +def supports_pointsets(format: Union[str, ExportFormat]) -> bool: + """Return True when *format* can represent point-cloud primitives.""" + if isinstance(format, str): + format = ExportFormat.from_extension(format) + return format in {ExportFormat.OBJ, ExportFormat.GEOJSON, ExportFormat.VTK, ExportFormat.VTU, ExportFormat.VTP} diff --git a/energyml-utils/src/energyml/utils/data/helper.py b/energyml-utils/src/energyml/utils/data/helper.py index 9ebde1d..1781caf 100644 --- a/energyml-utils/src/energyml/utils/data/helper.py +++ b/energyml-utils/src/energyml/utils/data/helper.py @@ -3,27 +3,31 @@ import inspect import logging import sys -from typing import Any, Optional, Callable, List, Union +from typing import Any, Literal, Optional, Callable, List, Tuple, Union from energyml.utils.storage_interface import EnergymlStorageInterface import numpy as np -from .datasets_io import read_external_dataset_array -from ..constants import flatten_concatenation -from ..exception import ObjectNotFoundNotError -from ..introspection import ( +from energyml.utils.data.datasets_io import read_external_dataset_array, get_path_in_external_with_path +from 
energyml.utils.constants import flatten_concatenation, path_last_attribute, path_parent_attribute +from energyml.utils.exception import ObjectNotFoundNotError +from energyml.utils.introspection import ( get_obj_uri, snake_case, get_object_attribute_no_verif, search_attribute_matching_name_with_path, search_attribute_matching_name, + search_attribute_matching_type, search_attribute_in_upper_matching_name, get_obj_uuid, get_object_attribute, get_object_attribute_rgx, + get_object_attribute_advanced, + is_primitive, + get_obj_title, ) -from .datasets_io import get_path_in_external_with_path +from energyml.utils.data.crs import CrsInfo, extract_crs_info # noqa: F401 (re-exported for convenience) _ARRAY_NAMES_ = [ "BooleanArrayFromDiscretePropertyArray", @@ -81,83 +85,118 @@ def _point_as_array(point: Any) -> List: def is_z_reversed(crs: Optional[Any]) -> bool: """ - Returns True if the Z axe is reverse (ZIncreasingDownward=='True' or VerticalAxis.Direction=='down') + Returns True if the Z axis increases downward + (``ZIncreasingDownward==True`` or ``VerticalAxis.Direction=='down'``). + + Delegates to :func:`extract_crs_info`. + :param crs: a CRS object - :return: By default, False is returned (if 'crs' is None) - """ - reverse_z_values = False - if crs is not None: - if "VerticalCrs" in type(crs).__name__: - vert_axis = search_attribute_matching_name(crs, "Direction") - if len(vert_axis) > 0: - vert_axis_str = str(vert_axis[0]) - if "." in vert_axis_str: - vert_axis_str = vert_axis_str.split(".")[-1] - - reverse_z_values = vert_axis_str.lower() == "down" - else: - # resqml 201 - zincreasing_downward = search_attribute_matching_name(crs, "ZIncreasingDownward") - if len(zincreasing_downward) > 0: - reverse_z_values = zincreasing_downward[0] - - # resqml >= 22 - vert_axis = search_attribute_matching_name(crs, "VerticalAxis.Direction") - if len(vert_axis) > 0: - vert_axis_str = str(vert_axis[0]) - if "." 
in vert_axis_str: - vert_axis_str = vert_axis_str.split(".")[-1] - - reverse_z_values = vert_axis_str.lower() == "down" - logging.debug(f"is_z_reversed: {reverse_z_values}") - return reverse_z_values - - -def get_vertical_epsg_code(crs_object: Any): - vertical_epsg_code = None - if crs_object is not None: # LocalDepth3dCRS - vertical_epsg_code = get_object_attribute_rgx(crs_object, "VerticalCrs.EpsgCode") - if vertical_epsg_code is None: # LocalEngineering2DCrs - vertical_epsg_code = get_object_attribute_rgx( - crs_object, "OriginProjectedCrs.AbstractProjectedCrs.EpsgCode" - ) - return vertical_epsg_code + :return: By default, ``False`` is returned when *crs* is ``None``. + """ + result = extract_crs_info(crs).z_increasing_downward + logging.debug(f"is_z_reversed: {result}") + return result -def get_projected_epsg_code(crs_object: Any, workspace: Optional[EnergymlStorageInterface] = None): - if crs_object is not None: # LocalDepth3dCRS - projected_epsg_code = get_object_attribute_rgx(crs_object, "ProjectedCrs.EpsgCode") - if projected_epsg_code is None: # LocalEngineering2DCrs - projected_epsg_code = get_object_attribute_rgx( - crs_object, "OriginProjectedCrs.AbstractProjectedCrs.EpsgCode" - ) +def get_vertical_epsg_code(crs_object: Any) -> Optional[int]: + """Return the EPSG code of the vertical CRS. Delegates to :func:`extract_crs_info`.""" + return extract_crs_info(crs_object).vertical_epsg_code - if projected_epsg_code is None and workspace is not None: - return get_projected_epsg_code( - workspace.get_object_by_uuid(get_object_attribute_rgx(crs_object, "LocalEngineering2[dD]Crs.Uuid")) - ) - return projected_epsg_code - return None +def get_projected_epsg_code(crs_object: Any, workspace: Optional[EnergymlStorageInterface] = None) -> Optional[int]: + """Return the EPSG code of the projected (horizontal) CRS. 
Delegates to :func:`extract_crs_info`.""" + return extract_crs_info(crs_object, workspace).projected_epsg_code -def get_projected_uom(crs_object: Any, workspace: Optional[EnergymlStorageInterface] = None): - if crs_object is not None: - projected_epsg_uom = get_object_attribute_rgx(crs_object, "ProjectedUom") - if projected_epsg_uom is None: - projected_epsg_uom = get_object_attribute_rgx(crs_object, "HorizontalAxes.ProjectedUom") - if projected_epsg_uom is None and workspace is not None: - return get_projected_uom( - workspace.get_object_by_uuid(get_object_attribute_rgx(crs_object, "LocalEngineering2[dD]Crs.Uuid")) - ) - return projected_epsg_uom - return None +def get_projected_uom(crs_object: Any, workspace: Optional[EnergymlStorageInterface] = None) -> Optional[str]: + """Return the UOM string for the projected (horizontal) CRS. Delegates to :func:`extract_crs_info`.""" + return extract_crs_info(crs_object, workspace).projected_uom -def get_crs_origin_offset(crs_obj: Any) -> List[float | int]: +def get_crs_offsets_and_angle( + crs_object: Any, workspace: Optional[EnergymlStorageInterface] = None +) -> Tuple[float, float, float, Tuple[float, str]]: + """ + Return the CRS offsets (X, Y, Z) and the areal rotation angle ``(value, uom)``. + + Delegates to :func:`extract_crs_info` and unpacks the result back into the + original ``(x, y, z, (angle, uom))`` tuple format for backward compatibility. + """ + info = extract_crs_info(crs_object, workspace) + return info.x_offset, info.y_offset, info.z_offset, (info.areal_rotation_value, info.areal_rotation_uom) + + +def apply_crs_transform( + well_points: np.ndarray, + x_offset: float = 0.0, + y_offset: float = 0.0, + z_offset: float = 0.0, + areal_rotation: float = 0.0, + rotation_uom: str = "rad", + z_is_up: bool = True, +) -> np.ndarray: + """ + Transforms interpolated wellbore points from Local CRS to Global/Project coordinates. + + Args: + well_points: A (N, 3) numpy array of interpolated [X, Y, Z] points. 
+ x_offset: The X translation value (resqml:XOffset). + y_offset: The Y translation value (resqml:YOffset). + z_offset: The Z translation value (resqml:ZOffset). + areal_rotation: The rotation angle (azimuth of the local CRS grid). + rotation_uom: The unit of measure for the rotation ('rad' or 'degr'). + z_is_up: If True, converts Z values to 'Up is Positive' (negates RESQML Z). + + Returns: + A (N, 3) numpy array of transformed coordinates. """ - Return a list [X,Y,Z] corresponding to the crs Offset [XOffset/OriginProjectedCoordinate1, ... ] depending on the - crs energyml version. + # Create a copy to avoid mutating the original input array + transformed: np.ndarray = well_points.copy().astype(np.float64) + + # 1. Convert rotation to radians if necessary + angle_rad: float = areal_rotation + if rotation_uom == "degr": + angle_rad = np.radians(areal_rotation) + + # 2. Handle Areal Rotation (Rotation around the Z axis) + # Applied before translation as per Energistics standards. + # RESQML ArealRotation / Azimuth is a CLOCKWISE angle (not the standard + # CCW mathematical convention). The correct CW rotation matrix is: + # x' = x·cos θ + y·sin θ + # y' = −x·sin θ + y·cos θ + if angle_rad != 0.0: + cos_theta = np.cos(angle_rad) + sin_theta = np.sin(angle_rad) + + x_orig = transformed[:, 0].copy() + y_orig = transformed[:, 1].copy() + + # Clockwise rotation (RESQML convention) + transformed[:, 0] = x_orig * cos_theta + y_orig * sin_theta + transformed[:, 1] = -x_orig * sin_theta + y_orig * cos_theta + + # 3. Apply Translation (Offsets) + transformed[:, 0] += x_offset + transformed[:, 1] += y_offset + transformed[:, 2] += z_offset + + # 4. Final Vertical Orientation + # Negate Z if the target system is Z-Up (RESQML is natively Z-Down). 
+ if z_is_up: + transformed[:, 2] = -transformed[:, 2] + + return transformed + + +def get_crs_origin_offset(crs_obj: Any) -> np.ndarray: + """ + Return a ``(3,) float64`` numpy array ``[X, Y, Z]`` corresponding to the + CRS origin offset (``XOffset``/``OriginProjectedCoordinate1``, …) depending + on the energyml version. + + Returning an ndarray instead of a plain list avoids the ``np.asarray()`` + call in callers such as :func:`mesh_numpy.crs_displacement_np`. + :param crs_obj: :return: """ @@ -173,17 +212,94 @@ def get_crs_origin_offset(crs_obj: Any) -> List[float | int]: if tmp_offset_z is None: tmp_offset_z = get_object_attribute_rgx(crs_obj, "OriginProjectedCoordinate3") - crs_point_offset = [0.0, 0.0, 0.0] try: - crs_point_offset = [ - float(tmp_offset_x) if tmp_offset_x is not None else 0.0, - float(tmp_offset_y) if tmp_offset_y is not None else 0.0, - float(tmp_offset_z) if tmp_offset_z is not None else 0.0, - ] + return np.array( + [ + float(tmp_offset_x) if tmp_offset_x is not None else 0.0, + float(tmp_offset_y) if tmp_offset_y is not None else 0.0, + float(tmp_offset_z) if tmp_offset_z is not None else 0.0, + ], + dtype=np.float64, + ) except Exception as e: logging.info(f"ERR reading crs offset {e}") + return np.zeros(3, dtype=np.float64) + + +def get_datum_information( + datum_obj: Any, workspace: Optional[EnergymlStorageInterface] = None +) -> Tuple[float, float, float, bool, Optional[str], Optional[str], Optional[Any]]: + "From a ObjMdDatum or a ReferencePointInACrs, return x, y, z, z_increas_downward, projected_epsg_code, vertical_epsg_code, crs object" + if datum_obj is None: + return 0.0, 0.0, 0.0, False, None, None, None + + t_lw = type(datum_obj).__name__.lower() + + # resqml20.LocalDepth3dCrs + if "localdepth3dcrs" in t_lw: + x = get_object_attribute_rgx(datum_obj, "XOffset.value") + y = get_object_attribute_rgx(datum_obj, "YOffset.value") + z = get_object_attribute_rgx(datum_obj, "ZOffset.value") + z_increasing_downward = 
get_object_attribute(datum_obj, "ZIncreasingDownward") or False + projected_epsg_code = get_projected_epsg_code(datum_obj, workspace) + vertical_epsg_code = get_vertical_epsg_code(datum_obj) + return ( + float(x) if x is not None else 0.0, + float(y) if y is not None else 0.0, + float(z) if z is not None else 0.0, + z_increasing_downward, + projected_epsg_code, + vertical_epsg_code, + datum_obj, + ) + elif "referencepointinacrs" in t_lw: + x = get_object_attribute_rgx(datum_obj, "horizontal_coordinates.coordinate1") + y = get_object_attribute_rgx(datum_obj, "horizontal_coordinates.coordinate2") + z = get_object_attribute_rgx(datum_obj, "vertical_coordinate") + z_increasing_downward = False + v_crs_dor = get_object_attribute_rgx(datum_obj, "vertical_crs") + if v_crs_dor is not None and workspace is not None: + v_crs = workspace.get_object(get_obj_uri(v_crs_dor)) + if v_crs is not None: + z_increasing_downward = is_z_reversed(v_crs) + p_crs = get_object_attribute(datum_obj, "horizontal_coordinates.crs") + projected_epsg_code = ( + get_projected_epsg_code(workspace.get_object(get_obj_uri(p_crs)), workspace) + if p_crs is not None and workspace is not None + else None + ) + v_crs = get_object_attribute(datum_obj, "vertical_crs") + vertical_epsg_code = get_vertical_epsg_code(v_crs) if v_crs is not None else None + return ( + float(x) if x is not None else 0.0, + float(y) if y is not None else 0.0, + float(z) if z is not None else 0.0, + z_increasing_downward, + projected_epsg_code, + vertical_epsg_code, + p_crs, + ) + elif "mddatum" in t_lw: + x = get_object_attribute_rgx(datum_obj, "location.coordinate1") + y = get_object_attribute_rgx(datum_obj, "location.coordinate2") + z = get_object_attribute_rgx(datum_obj, "location.coordinate3") + crs = get_object_attribute(datum_obj, "LocalCrs") + _, _, _, z_increasing_downward, projected_epsg_code, vertical_epsg_code, _ = get_datum_information( + crs, workspace + ) + return ( + float(x) if x is not None else 0.0, + float(y) if 
y is not None else 0.0, + float(z) if z is not None else 0.0, + z_increasing_downward, + projected_epsg_code, + vertical_epsg_code, + crs, + ) + return 0.0, 0.0, 0.0, False, None, None, None - return crs_point_offset + +# ================================================== def prod_n_tab(val: Union[float, int, str], tab: List[Union[float, int, str]]): @@ -195,18 +311,18 @@ def prod_n_tab(val: Union[float, int, str], tab: List[Union[float, int, str]]): """ if val is None: return [None] * len(tab) - logging.debug(f"Multiplying list by {val}: {tab}") + # logging.debug(f"Multiplying list by {val}: {tab}") # Convert to numpy array for vectorized operations, handling None values arr = np.array(tab, dtype=object) - logging.debug(f"arr: {arr}") + # logging.debug(f"arr: {arr}") # Create mask for non-None values mask = arr != None # noqa: E711 # Create result array filled with None result = np.full(len(tab), None, dtype=object) - logging.debug(f"result before multiplication: {result}") + # logging.debug(f"result before multiplication: {result}") # Multiply only non-None values result[mask] = arr[mask].astype(float) * val - logging.debug(f"result after multiplication: {result}") + # logging.debug(f"result after multiplication: {result}") return result.tolist() @@ -266,19 +382,25 @@ def get_crs_obj( logging.error("@get_crs_obj no Epc file given") else: crs_list = search_attribute_matching_name(context_obj, r"\.*Crs", search_in_sub_obj=True, deep_search=False) - if crs_list is not None and len(crs_list) > 0: + if crs_list is not None and len(crs_list) > 0 and crs_list[0] is not None: # logging.debug(crs_list[0]) crs = workspace.get_object(get_obj_uri(crs_list[0])) + logging.debug(f"CRS found for {get_obj_title(context_obj)} ({type(context_obj).__name__}): {crs}") if crs is None: - crs = workspace.get_object_by_uuid(get_obj_uuid(crs_list[0])) + # logging.debug(f"CRS {crs_list[0]} not found (or not read correctly)") + _crs_list = 
workspace.get_object_by_uuid(get_obj_uuid(crs_list[0])) + crs = _crs_list[0] if _crs_list is not None and len(_crs_list) > 0 else None if crs is None: logging.error(f"CRS {crs_list[0]} not found (or not read correctly)") raise ObjectNotFoundNotError(get_obj_uri(crs_list[0])) if crs is not None: return crs + else: + logging.debug(f"No CRS found for {get_obj_title(context_obj)} with type {type(context_obj).__name__}") if context_obj != root_obj: - upper_path = path_in_root[: path_in_root.rindex(".")] + upper_path = path_parent_attribute(path_in_root) + # upper_path = path_in_root[: path_in_root.rindex(".")] if len(upper_path) > 0: return get_crs_obj( context_obj=get_object_attribute(root_obj, upper_path), @@ -290,6 +412,275 @@ def get_crs_obj( return None +def linear_interpolation(md_target, md_start, md_end, p_start, p_end): + """ + Calcule la position 3D par interpolation linéaire simple. + Utilisé quand Continuity = 0 ou quand les TangentVectors sont absents. + """ + # Calcul du ratio de progression (0 à 1) + h = md_end - md_start + if h == 0: + return p_start + + t = (md_target - md_start) / h + + # Formule : P = P_start + t * (P_end - p_start) + p_target = p_start + t * (p_end - p_start) + + return p_target + + +def hermite_interpolation(md_target, md_start, md_end, p_start, p_end, v_start, v_end): + """ + Calcule la position 3D d'un point sur une trajectoire de puits via une Spline d'Hermite. + + Cette fonction est particulièrement adaptée aux objets RESQML de type + 'ParametricLineGeometry' avec une continuité C1. + + Args: + md_target (float): La profondeur mesurée (Measured Depth) cible à interpoler. + md_start (float): MD du point de contrôle précédent (Knot i). + md_end (float): MD du point de contrôle suivant (Knot i+1). + p_start (np.array): Coordonnées [X, Y, Z] au point md_start. + p_end (np.array): Coordonnées [X, Y, Z] au point md_end. + v_start (np.array): Vecteur tangente unitaire [dx, dy, dz] au point md_start. 
+ v_end (np.array): Vecteur tangente unitaire [dx, dy, dz] au point md_end. + + Returns: + np.array: Un tableau numpy [X, Y, Z] représentant la position interpolée. + + Raises: + ValueError: Si md_start et md_end sont identiques (division par zéro). + AssertionError: Si md_target n'est pas compris dans l'intervalle [md_start, md_end]. + """ + + # 1. Vérification de l'intervalle + if not (md_start <= md_target <= md_end): + # Note : Dans certains cas de forage réel, on peut extrapoler, + # mais pour un WellboreFrame, on reste normalement dans les clous. + raise AssertionError("Le MD cible doit être compris entre md_start et md_end.") + + # Distance entre les deux points de contrôle + h = md_end - md_start + if h == 0: + raise ValueError("md_start et md_end ne peuvent pas être identiques.") + + # 2. Normalisation du paramètre t (0 <= t <= 1) + t = (md_target - md_start) / h + t2 = t * t + t3 = t2 * t + + # 3. Mise à l'échelle des vecteurs tangentes (scaling par la distance) + # En RESQML, les TangentVectors sont souvent unitaires ou normalisés. + # Pour l'interpolation cubique, ils doivent représenter la dérivée par rapport à t. + T_start = v_start * h + T_end = v_end * h + + # 4. Calcul des polynômes de base d'Hermite + h00 = 2 * t3 - 3 * t2 + 1 # Coefficient pour p_start + h10 = t3 - 2 * t2 + t # Coefficient pour T_start + h01 = -2 * t3 + 3 * t2 # Coefficient pour p_end + h11 = t3 - t2 # Coefficient pour T_end + + # 5. 
Combinaison linéaire pour obtenir la position P(t) + p_target = (h00 * p_start) + (h10 * T_start) + (h01 * p_end) + (h11 * T_end) + + return p_target + + +def get_wellbore_points( + mds: Optional[np.ndarray], + traj_mds: Optional[np.ndarray], + traj_points: Optional[np.ndarray], + traj_tangents: Optional[np.ndarray], + step_meters: float = 5.0, +) -> np.ndarray: + """ + mds : MDs du WellboreFrame + traj_mds : MDs de la trajectoire (ControlPointParameters) + traj_points : Points XYZ de la trajectoire + traj_tangents : Tangentes XYZ (Optionnel) + step_meters : Distance entre chaque point de la trajectoire lisse (Optionnel) + """ + if mds is None or len(mds) == 0: + if traj_mds is None or traj_points is None or traj_tangents is None: + raise ValueError( + "To generate a smooth trajectory, traj_mds, traj_points and traj_tangents must be provided." + ) + return generate_smooth_trajectory( + traj_mds=traj_mds, traj_points=traj_points, traj_tangents=traj_tangents, step_meters=step_meters + ) + + results = [] + + for m in mds: + # 1. Trouver l'intervalle + idx = np.searchsorted(traj_mds, m) - 1 + + # Gestion des bords + if idx < 0: + results.append(traj_points[0]) + continue + if idx >= len(traj_mds) - 1: + results.append(traj_points[-1]) + continue + + # 2. Extraire les bornes + p_s, p_e = traj_points[idx], traj_points[idx + 1] + m_s, m_e = traj_mds[idx], traj_mds[idx + 1] + + # 3. 
Choisir la méthode + if traj_tangents is not None: + # Cas ParametricLineGeometry C1+ + v_s, v_e = traj_tangents[idx], traj_tangents[idx + 1] + p_3d = hermite_interpolation(m, m_s, m_e, p_s, p_e, v_s, v_e) + else: + # Cas Linear ou PointGeometry + p_3d = linear_interpolation(m, m_s, m_e, p_s, p_e) + + results.append(p_3d) + + return np.array(results) + + +def generate_smooth_trajectory( + traj_mds: np.ndarray, traj_points: np.ndarray, traj_tangents: np.ndarray, step_meters: float = 5.0 +) -> np.ndarray: + """ + Generates a high-resolution polyline for the trajectory by sampling + it at a regular interval. + + Args: + traj_mds: MDs of control points from HDF5. + traj_points: Control points (N, 3) from HDF5. + traj_tangents: Tangent vectors (N, 3) from HDF5. + step_meters: Desired distance between each point of the final polyline. + + Returns: + A (M, 3) numpy array representing the smooth 3D polyline. + """ + # 1. Create a regular MD sampling from min to max MD + md_min, md_max = traj_mds[0], traj_mds[-1] + # We create a new set of MDs every 'step_meters' + sampled_mds = np.arange(md_min, md_max, step_meters) + + # Ensure the last point of the trajectory is included + if sampled_mds[-1] < md_max: + sampled_mds = np.append(sampled_mds, md_max) + + # 2. Reuse our interpolation logic + smooth_points = [] + for m in sampled_mds: + # Find the interval in the original control points + idx = np.searchsorted(traj_mds, m) - 1 + idx = max(0, min(idx, len(traj_mds) - 2)) + + p_3d = hermite_interpolation( + m, + traj_mds[idx], + traj_mds[idx + 1], + traj_points[idx], + traj_points[idx + 1], + traj_tangents[idx], + traj_tangents[idx + 1], + ) + smooth_points.append(p_3d) + + return np.array(smooth_points) + + +def generate_vertical_well_points( + wellbore_mds: np.ndarray, head_x: float, head_y: float, head_z: float, z_increasing_downward: bool = False +) -> np.ndarray: + """ + Generates local 3D coordinates for a perfectly vertical wellbore. 
+ + Args: + wellbore_mds: (N,) array of Measured Depths from the WellboreFrame. + head_x: The X coordinate of the MdDatum (well head) in Local CRS. + head_y: The Y coordinate of the MdDatum (well head) in Local CRS. + head_z: The Z coordinate of the MdDatum (well head) in Local CRS. + + Returns: + (N, 3) numpy array of points [X, Y, Z] in Local CRS. + """ + num_points = len(wellbore_mds) + # Initialize the array with (N, 3) + local_points = np.zeros((num_points, 3)) + + # In a vertical well, X and Y are constant and equal to the head position + local_points[:, 0] = head_x + local_points[:, 1] = head_y + + # The MD (Measured Depth) represents the distance traveled from MD 0. + # In a vertical well, Z_point = Z_datum + (MD_point - MD_datum_at_0) + # Most of the time, MD at head is 0. + # If wellbore_mds start at 0, Z starts at head_z. + # if z_increasing_downward is False, we add the MD to head_z, otherwise we subtract it. + md_start = wellbore_mds[0] + if z_increasing_downward: + local_points[:, 2] = head_z - (wellbore_mds - md_start) + else: + local_points[:, 2] = head_z + (wellbore_mds - md_start) + + return local_points + + +def read_parametric_geometry( + geometry: Any, workspace: Optional[EnergymlStorageInterface] = None +) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]]: + """Read a ParametricLineGeometry and return the controle point parameters, control points, and tangents.""" + if geometry is None: + raise ValueError("Geometry object is None") + + knot_count = getattr(geometry, "knot_count", None) + + traj_mds = read_array( + energyml_array=getattr(geometry, "control_point_parameters"), + root_obj=geometry, + workspace=workspace, + ) + if not isinstance(traj_mds, np.ndarray): + traj_mds = np.array(traj_mds) + + traj_points = read_array( + energyml_array=getattr(geometry, "control_points"), + root_obj=geometry, + workspace=workspace, + ) + if not isinstance(traj_points, np.ndarray): + traj_points = np.array(traj_points) + traj_points = 
traj_points.reshape(-1, 3) + + traj_tangents = None + try: + traj_tangents = read_array( + energyml_array=getattr(geometry, "tangent_vectors"), + root_obj=geometry, + workspace=workspace, + ) + except Exception as e: + logging.debug(f"No tangent vectors found for {geometry}, fallback to linear interpolation: {e}") + + if traj_tangents is not None: + if not isinstance(traj_tangents, np.ndarray): + traj_tangents = np.array(traj_tangents) + traj_tangents = traj_tangents.reshape(-1, 3) + + # verif with knot_count if exists + if knot_count is not None: + if ( + len(traj_mds) != knot_count + or len(traj_points) != knot_count + or (traj_tangents is not None and len(traj_tangents) != knot_count) + ): + logging.warning( + f"Mismatch between knot_count ({knot_count}) and actual control points count (mds: {len(traj_mds)}, points: {len(traj_points)}, tangents: {len(traj_tangents) if traj_tangents is not None else 'N/A'})" + ) + + return traj_mds, traj_points, traj_tangents + + # ___ # / | ______________ ___ _______ # / /| | / ___/ ___/ __ `/ / / / ___/ @@ -309,12 +700,12 @@ def _array_name_mapping(array_type_name: str) -> str: return "ConstantArray" elif "External" in array_type_name or "Hdf5" in array_type_name: return "ExternalArray" - elif array_type_name.endswith("XmlArray"): + elif "Xml" in array_type_name: return "XmlArray" elif "Jagged" in array_type_name: return "JaggedArray" elif "Lattice" in array_type_name: - if "Integer" in array_type_name or "Double" in array_type_name: + if "Integer" in array_type_name or "Double" in array_type_name or "Floating" in array_type_name: return "int_double_lattice_array" return array_type_name @@ -335,6 +726,60 @@ def get_not_supported_array(): return [x for x in _ARRAY_NAMES_ if get_array_reader_function(_array_name_mapping(x)) is None] +def _extract_external_data_array_part_params( + obj: Any, +) -> tuple[Optional[List[int]], Optional[List[int]], Optional[str]]: + """ + Extract array parameters (Count, StartIndex, URI) from an 
object. + Uses regex to match various attribute name formats (snake_case, PascalCase). + + Args: + obj: The object to extract parameters from (ExternalDataArrayPart or parent object) + + Returns: + Tuple of (start_indices, counts, external_uri) + """ + start_indices = None + counts = None + external_uri = None + + # Extract StartIndex using regex (matches: StartIndex, start_index, startIndex) + start_attr = get_object_attribute_rgx(obj, "[Ss]tart[_]?[Ii]ndex") + if start_attr is not None: + if isinstance(start_attr, list): + start_indices = start_attr + elif isinstance(start_attr, (int, float)): + start_indices = [int(start_attr)] + elif hasattr(start_attr, "value"): + if isinstance(start_attr.value, list): + start_indices = start_attr.value + elif isinstance(start_attr.value, (int, float)): + start_indices = [int(start_attr.value)] + + # Extract Count using regex (matches: Count, count, NodeCount, node_count) + count_attr = get_object_attribute_rgx(obj, "([Nn]ode[_]?)?[Cc]ount") + if count_attr is not None: + if isinstance(count_attr, list): + counts = count_attr + elif isinstance(count_attr, (int, float)): + counts = [int(count_attr)] + elif hasattr(count_attr, "value"): + if isinstance(count_attr.value, list): + counts = count_attr.value + elif isinstance(count_attr.value, (int, float)): + counts = [int(count_attr.value)] + + # Extract URI using regex (matches: URI, uri) + uri_attr = get_object_attribute_rgx(obj, "[Uu][Rr][Ii]") + if uri_attr is not None: + if isinstance(uri_attr, str): + external_uri = uri_attr + elif hasattr(uri_attr, "value") and isinstance(uri_attr.value, str): + external_uri = uri_attr.value + + return start_indices, counts, external_uri + + def read_external_array( energyml_array: Any, root_obj: Optional[Any] = None, @@ -344,32 +789,80 @@ def read_external_array( ) -> Optional[Union[List[Any], np.ndarray]]: """ Read an external array (BooleanExternalArray, BooleanHdf5Array, DoubleHdf5Array, IntegerHdf5Array, StringExternalArray ...) 
+ Automatically handles RESQML v2.2 (multiple ExternalDataArrayPart with individual parameters) + and RESQML v2.0.1 (count from parent object). + :param energyml_array: :param root_obj: :param path_in_root: :param workspace: + :param sub_indices: :return: """ array = None if workspace is not None: - # array = workspace.read_external_array( - # energyml_array=energyml_array, - # root_obj=root_obj, - # path_in_root=path_in_root, - # ) - crs = get_crs_obj( - context_obj=root_obj, - root_obj=root_obj, - path_in_root=path_in_root, - workspace=workspace, + crs = None + try: + get_crs_obj( + context_obj=root_obj, + root_obj=root_obj, + path_in_root=path_in_root, + workspace=workspace, + ) + except ObjectNotFoundNotError as e: + logging.debug(f"CRS not found for {get_obj_title(root_obj)}: {e}") + + # Search for ExternalDataArrayPart type objects (RESQML v2.2) + external_parts = search_attribute_matching_type( + energyml_array, "ExternalDataArrayPart", return_self=False, deep_search=True ) - pief_list = get_path_in_external_with_path(obj=energyml_array) - # empty array - array = None - for pief_path_in_obj, pief in pief_list: - arr = workspace.read_array(proxy=crs or root_obj, path_in_external=pief) - if arr is not None: - array = arr if array is None else np.concatenate((array, arr)) + + if external_parts and len(external_parts) > 0: + # RESQML v2.2: Loop over each ExternalDataArrayPart + # Each part has its own start/count/uri and path_in_external + for ext_part in external_parts: + start_indices, counts, external_uri = _extract_external_data_array_part_params(ext_part) + pief_list = get_path_in_external_with_path(obj=ext_part) + # logging.debug(f"Pief : {pief_list}") + for pief_path_in_obj, pief in pief_list: + arr = workspace.read_array( + proxy=crs or root_obj, + path_in_external=pief, + start_indices=start_indices, + counts=counts, + external_uri=external_uri, + ) + if arr is not None: + array = arr if array is None else np.concatenate((array, arr)) + # 
logging.debug(f"\t ExternalDataArrayPart read successfully. arr : {arr} : array : {array}") + else: + # RESQML v2.0.1: Extract count from parent object, no StartIndex or URI + counts = None + if path_in_root and root_obj: + last_attr = path_last_attribute(path_in_root) + if last_attr: + parent_path = path_in_root[: path_in_root.rfind("." + last_attr)] + if parent_path: + try: + parent_obj = get_object_attribute_advanced(root_obj, parent_path) + if parent_obj: + # Extract count from parent using simplified function + _, counts, _ = _extract_external_data_array_part_params(parent_obj) + except Exception as e: + logging.debug(f"Failed to extract count from parent: {e}") + + # Read array using path_in_external from the array object itself + pief_list = get_path_in_external_with_path(obj=energyml_array) + for pief_path_in_obj, pief in pief_list: + arr = workspace.read_array( + proxy=crs or root_obj, + path_in_external=pief, + start_indices=None, + counts=counts, + external_uri=None, + ) + if arr is not None: + array = arr if array is None else np.concatenate((array, arr)) else: array = read_external_dataset_array( @@ -385,6 +878,7 @@ def read_external_array( # Fallback for non-numpy arrays array = [array[idx] for idx in sub_indices] + # logging.debug(f"External array read successfully. 
=> {array}") return array @@ -416,8 +910,26 @@ def read_array( :param sub_indices: for SubRepresentation :return: """ - if isinstance(energyml_array, list): + if isinstance(energyml_array, np.ndarray): + # if isinstance(energyml_array, list): return energyml_array + elif isinstance(energyml_array, list): + # logging.debug("Warning: the array is a list, not a numpy array, be careful with the performance !") + # logging.debug(energyml_array) + if len(energyml_array) > 0 and is_primitive(energyml_array[0]): + return energyml_array + else: + return [ + read_array( + energyml_array=elem, + root_obj=root_obj, + path_in_root=path_in_root, + workspace=workspace, + sub_indices=sub_indices, + ) + for elem in energyml_array + if elem is not None + ] array_type_name = _array_name_mapping(type(energyml_array).__name__) reader_func = get_array_reader_function(array_type_name) @@ -442,9 +954,15 @@ def read_constant_array( path_in_root: Optional[str] = None, workspace: Optional[EnergymlStorageInterface] = None, sub_indices: Optional[Union[List[int], np.ndarray]] = None, -) -> List[Any]: +) -> Union[np.ndarray, List[Any]]: """ - Read a constant array ( BooleanConstantArray, DoubleConstantArray, FloatingPointConstantArray, IntegerConstantArray ...) + Read a constant array (BooleanConstantArray, DoubleConstantArray, + FloatingPointConstantArray, IntegerConstantArray …). + + For numeric (int / float / bool) values a ``numpy.ndarray`` is returned + via :func:`numpy.full`, avoiding a Python-list allocation. String values + fall back to a plain list because numpy object arrays add no benefit. 
+ :param energyml_array: :param root_obj: :param path_in_root: @@ -452,8 +970,6 @@ def read_constant_array( :param sub_indices: :return: """ - # logging.debug(f"Reading constant array\n\t{energyml_array}") - value = get_object_attribute_no_verif(energyml_array, "value") count = ( len(sub_indices) @@ -461,9 +977,10 @@ def read_constant_array( else get_object_attribute_no_verif(energyml_array, "count") ) - # logging.debug(f"\tValue : {[value for i in range(0, count)]}") - - return [value] * count + if isinstance(value, (int, float, bool, np.integer, np.floating)): + return np.full(int(count), value) + # Non-numeric (e.g. string) — keep as Python list. + return [value] * int(count) def read_xml_array( @@ -482,8 +999,10 @@ def read_xml_array( :param sub_indices: :return: """ + values = get_object_attribute_no_verif(energyml_array, "values") # count = get_object_attribute_no_verif(energyml_array, "count_per_value") + # logging.debug("values: ", values) if sub_indices is not None and len(sub_indices) > 0: if isinstance(values, np.ndarray): @@ -549,23 +1068,49 @@ def read_int_double_lattice_array( :param sub_indices: :return: """ - start_value = get_object_attribute_no_verif(energyml_array, "start_value") + start_value = int(get_object_attribute_no_verif(energyml_array, "start_value")) offset = get_object_attribute_no_verif(energyml_array, "offset") - result = [] + if len(offset) == 0: + raise Exception(f"{type(energyml_array)} has no offset — cannot generate indices") if len(offset) == 1: - # 1D lattice array: offset is a single DoubleConstantArray or IntegerConstantArray + # 1D lattice: start_value, start_value+v, start_value+2v, … (count+1 values) offset_obj = offset[0] - - # Get the offset value and count from the ConstantArray offset_value = get_object_attribute_no_verif(offset_obj, "value") - count = get_object_attribute_no_verif(offset_obj, "count") - - # Generate the 1D array: start_value + i * offset_value for i in range(count) - result = [start_value + i * 
offset_value for i in range(count)] + count = int(get_object_attribute_no_verif(offset_obj, "count")) + result = [start_value + i * offset_value for i in range(count + 1)] else: - raise Exception(f"{type(energyml_array)} read with an offset of length {len(offset)} is not supported") + # N-D lattice (N ≥ 2) — used for NodeIndicesOnSupportingRepresentation. + # + # Each Offset[k] is an IntegerConstantArray with: + # Count = number of *steps* along axis k → grid size = Count+1 + # Value = stride multiplier for axis k + # + # Flat index formula (C/row-major order): + # flat_idx(i0, i1, …) = StartValue + # + i0 * Value[0] * (Count[1]+1) * (Count[2]+1) * … + # + i1 * Value[1] * (Count[2]+1) * … + # + … + # + iN-1 * Value[N-1] + # + # i.e. stride[k] = Value[k] * prod(Count[m]+1 for m in range(k+1, N)) + N = len(offset) + counts = [int(get_object_attribute_no_verif(off, "count")) for off in offset] + values = [int(get_object_attribute_no_verif(off, "value")) for off in offset] + + strides = [] + for k in range(N): + s = values[k] + for m in range(k + 1, N): + s *= counts[m] + 1 + strides.append(s) + + # np.indices gives shape (N, d0, d1, …) + shape = tuple(c + 1 for c in counts) + idx_grids = np.indices(shape) # (N, *shape) + flat_indices = start_value + sum(idx_grids[k] * strides[k] for k in range(N)) + result = flat_indices.ravel().tolist() return result @@ -638,9 +1183,23 @@ def read_point3d_from_representation_lattice_array( sub_indices: Optional[Union[List[int], np.ndarray]] = None, ): """ - Read a Point3DFromRepresentationLatticeArray. + Read a ``Point3DFromRepresentationLatticeArray``. + + The XY(Z) positions are borrowed from a *supporting* ``Grid2DRepresentation`` + by selecting its nodes via the flat indices described in + ``NodeIndicesOnSupportingRepresentation`` (an ``IntegerLatticeArray``). 
+ + The index formula for an N-dimensional ``IntegerLatticeArray`` is row-major: + + stride[k] = Value[k] * prod(Count[m]+1 for m in range(k+1, N)) + flat_idx(i, j, …) = StartValue + i*stride[0] + j*stride[1] + … + + Example — supporting rep 2×4, ``Offset[0]={Count=1, Value=1}``, + ``Offset[1]={Count=3, Value=1}``: + stride[0] = 1 * 4 = 4, stride[1] = 1 + flat_idx(i, j) = 4i + j → [0,1,2,3,4,5,6,7] - Note: Only works for Grid2DRepresentation. + Note: Only ``Grid2DRepresentation`` supporting reps are currently supported. :param energyml_array: :param root_obj: @@ -649,25 +1208,79 @@ def read_point3d_from_representation_lattice_array( :param sub_indices: :return: """ - supporting_rep_identifier = get_obj_uri(get_object_attribute_no_verif(energyml_array, "supporting_representation")) - # logging.debug(f"energyml_array : {energyml_array}\n\t{supporting_rep_identifier}") + supporting_rep_dor = get_object_attribute_no_verif(energyml_array, "supporting_representation") + supporting_rep_identifier = get_obj_uri(supporting_rep_dor) supporting_rep = workspace.get_object(supporting_rep_identifier) if workspace is not None else None - # TODO chercher un pattern \.*patch\.*.[d]+ pour trouver le numero du patch dans le path_in_root puis lire le patch - # logging.debug(f"path_in_root {path_in_root}") + if supporting_rep is None and workspace is not None: + from energyml.utils.introspection import get_obj_uuid - result = [] - if "grid2d" in str(type(supporting_rep)).lower(): - patch_path, patch = search_attribute_matching_name_with_path(supporting_rep, "Grid2dPatch")[0] - points = read_grid2d_patch( - patch=patch, grid2d=supporting_rep, path_in_root=patch_path, workspace=workspace, sub_indices=sub_indices + candidates = workspace.get_object_by_uuid(get_obj_uuid(supporting_rep_dor)) + supporting_rep = candidates[0] if candidates else None + + if supporting_rep is None: + raise Exception(f"Supporting representation {supporting_rep_identifier} not found in workspace") + + if "grid2d" 
not in str(type(supporting_rep)).lower(): + raise Exception( + f"Unsupported supporting rep type {type(supporting_rep).__name__} " f"for {type(energyml_array).__name__}" + ) + + # ── 1. Read ALL points from the supporting representation ──────────────── + # RESQML 2.0.1 uses Grid2dPatch; RESQML 2.2 stores geometry directly. + all_sup_points: Optional[np.ndarray] = None + + patch_matches = search_attribute_matching_name_with_path(supporting_rep, "Grid2dPatch") + if patch_matches: + patch_path, patch = patch_matches[0] + all_sup_points = read_grid2d_patch( + patch=patch, + grid2d=supporting_rep, + path_in_root=patch_path, + workspace=workspace, ) - # TODO: take the points by there indices from the NodeIndicesOnSupportingRepresentation - result = points + else: + # RESQML 2.2: geometry is directly on the representation + geom_points_matches = search_attribute_matching_name_with_path(supporting_rep, "Geometry.Points") + if not geom_points_matches: + raise Exception(f"Cannot find points in supporting rep {type(supporting_rep).__name__}") + geom_path, geom_points_obj = geom_points_matches[0] + all_sup_points = read_array( + energyml_array=geom_points_obj, + root_obj=supporting_rep, + path_in_root=geom_path, + workspace=workspace, + ) + + if not isinstance(all_sup_points, np.ndarray): + all_sup_points = np.array(all_sup_points, dtype=float) + all_sup_points = all_sup_points.reshape(-1, 3) + # ── 2. 
Generate the node index list from the IntegerLatticeArray ───────── + node_idx_arr = get_object_attribute_no_verif(energyml_array, "node_indices_on_supporting_representation") + if node_idx_arr is None: + node_idx_arr = get_object_attribute_rgx(energyml_array, "NodeIndices") + + if node_idx_arr is not None: + node_indices = read_array( + energyml_array=node_idx_arr, + root_obj=root_obj, + path_in_root=path_in_root, + workspace=workspace, + ) + node_indices = np.asarray(node_indices, dtype=np.int64) + result = all_sup_points[node_indices] else: - raise Exception(f"Not supported type {type(energyml_array)} for object {type(root_obj)}") - # pour trouver les infos qu'il faut + # No index array: use all points in order (identity mapping) + logging.debug( + "Point3DFromRepresentationLatticeArray: no NodeIndices found, " "using all supporting rep points in order" + ) + result = all_sup_points + + # ── 3. Optional sub-selection (SubRepresentation) ──────────────────────── + if sub_indices is not None and len(sub_indices) > 0: + result = result[np.asarray(sub_indices, dtype=np.int64)] + return result @@ -699,7 +1312,9 @@ def read_point3d_lattice_array( """ Read a Point3DLatticeArray. - Note: If a CRS is found and its 'ZIncreasingDownward' is set to true or its + Accumulates origin + cumulative slowest/fastest offset vectors into an + (N, 3) float64 array. CRS transforms (z-flip, offsets, rotation) are the + responsibility of the caller — this function is CRS-neutral. 
:param energyml_array: :param root_obj: @@ -730,19 +1345,6 @@ def read_point3d_lattice_array( current_path=path_in_root or "", ) - crs = None - try: - crs = get_crs_obj( - context_obj=energyml_array, - path_in_root=path_in_root, - root_obj=root_obj, - workspace=workspace, - ) - except ObjectNotFoundNotError: - logging.error("No CRS found, not able to check zIncreasingDownward") - - zincreasing_downward = is_z_reversed(crs) - slowest_vec = _point_as_array(get_object_attribute_rgx(slowest, "offset|direction")) slowest_spacing = read_array(get_object_attribute_no_verif(slowest, "spacing")) slowest_table = list(map(lambda x: prod_n_tab(x, slowest_vec), slowest_spacing)) @@ -756,7 +1358,7 @@ def read_point3d_lattice_array( logging.debug(f"slowest vector: {slowest_vec}, spacing: {slowest_spacing}, size: {slowest_size}") logging.debug(f"fastest vector: {fastest_vec}, spacing: {fastest_spacing}, size: {fastest_size}") - logging.debug(f"origin: {origin}, zincreasing_downward: {zincreasing_downward}") + logging.debug(f"origin: {origin}") if crs_sa_count is not None and len(crs_sa_count) > 0 and crs_fa_count is not None and len(crs_fa_count) > 0: if (crs_sa_count[0] == fastest_size and crs_fa_count[0] == slowest_size) or ( @@ -779,38 +1381,40 @@ def read_point3d_lattice_array( try: # Convert tables to NumPy arrays origin_arr = np.array(origin, dtype=float) - slowest_arr = np.array(slowest_table, dtype=float) # shape: (slowest_size, 3) - fastest_arr = np.array(fastest_table, dtype=float) # shape: (fastest_size, 3) - - # Compute cumulative sums - slowest_cumsum = np.cumsum(slowest_arr, axis=0) # cumulative offset along slowest axis - fastest_cumsum = np.cumsum(fastest_arr, axis=0) # cumulative offset along fastest axis + slowest_arr = np.array(slowest_table, dtype=float) # shape: (slowest_size-1, 3) + fastest_arr = np.array(fastest_table, dtype=float) # shape: (fastest_size-1, 3) + + # Sanity: spacing arrays must have exactly (size-1) rows. 
+ # For well-formed RESQML data this is always true; bail out to the + # iterative fallback if someone passes malformed data. + if slowest_arr.shape[0] != slowest_size - 1 or fastest_arr.shape[0] != fastest_size - 1: + raise ValueError( + f"Spacing array length mismatch: " + f"slowest={slowest_arr.shape[0]} expected {slowest_size - 1}, " + f"fastest={fastest_arr.shape[0]} expected {fastest_size - 1}" + ) - # Create meshgrid indices - i_indices, j_indices = np.meshgrid(np.arange(slowest_size), np.arange(fastest_size), indexing="ij") + # Compute cumulative sums (shape: (size-1, 3)) + slowest_cumsum = np.cumsum(slowest_arr, axis=0) + fastest_cumsum = np.cumsum(fastest_arr, axis=0) # Initialize result array result_arr = np.zeros((slowest_size, fastest_size, 3), dtype=float) result_arr[:, :, :] = origin_arr # broadcast origin to all positions - # Add offsets based on zincreasing_downward - if zincreasing_downward: - # Add slowest offsets where i > 0 - result_arr[1:, :, :] += slowest_cumsum[:-1, np.newaxis, :] - # Add fastest offsets where j > 0 - result_arr[:, 1:, :] += fastest_cumsum[np.newaxis, :-1, :] - else: - # Add fastest offsets where j > 0 - result_arr[:, 1:, :] += fastest_cumsum[np.newaxis, :-1, :] - # Add slowest offsets where i > 0 - result_arr[1:, :, :] += slowest_cumsum[:-1, np.newaxis, :] + # Accumulate offsets: + # result_arr[:, j, :] += fastest_cumsum[j-1] for j in 1..fastest_size-1 + # result_arr[i, :, :] += slowest_cumsum[i-1] for i in 1..slowest_size-1 + result_arr[:, 1:, :] += fastest_cumsum[np.newaxis, :, :] # (1, fast-1, 3) + result_arr[1:, :, :] += slowest_cumsum[:, np.newaxis, :] # (slow-1, 1, 3) - # Flatten to list of points - result = result_arr.reshape(-1, 3).tolist() + # Return the (N, 3) float64 numpy array directly — no .tolist(). + result = result_arr.reshape(-1, 3) except (ValueError, TypeError) as e: - # Fallback to original implementation if NumPy conversion fails + # Fallback to original implementation if NumPy conversion fails. 
logging.warning(f"NumPy vectorization failed ({e}), falling back to iterative approach") + fallback: List = [] for i in range(slowest_size): for j in range(fastest_size): previous_value = origin @@ -818,31 +1422,25 @@ def read_point3d_lattice_array( if j > 0: if i > 0: line_idx = i * fastest_size - previous_value = result[line_idx + j - 1] - else: - previous_value = result[j - 1] - if zincreasing_downward: - result.append(sum_lists(previous_value, slowest_table[i - 1])) + previous_value = fallback[line_idx + j - 1] else: - result.append(sum_lists(previous_value, fastest_table[j - 1])) + previous_value = fallback[j - 1] + fallback.append(sum_lists(previous_value, fastest_table[j - 1])) else: if i > 0: prev_line_idx = (i - 1) * fastest_size - previous_value = result[prev_line_idx] - if zincreasing_downward: - result.append(sum_lists(previous_value, fastest_table[j - 1])) - else: - result.append(sum_lists(previous_value, slowest_table[i - 1])) + previous_value = fallback[prev_line_idx] + fallback.append(sum_lists(previous_value, slowest_table[i - 1])) else: - result.append(previous_value) + fallback.append(previous_value) + # Convert fallback list to ndarray to keep the return type consistent. + result = np.array(fallback, dtype=np.float64).reshape(-1, 3) else: raise Exception(f"{type(energyml_array)} read with an offset of length {len(offset)} is not supported") if sub_indices is not None and len(sub_indices) > 0: - if isinstance(result, np.ndarray): - result = result[sub_indices].tolist() - else: - result = [result[idx] for idx in sub_indices] + # result is always an ndarray here; index directly without .tolist(). 
+ result = result[np.asarray(sub_indices, dtype=np.int64)] return result @@ -854,3 +1452,451 @@ def read_point3d_lattice_array( # workspace: Optional[EnergymlStorageInterface] = None # ): # logging.debug(energyml_array) + + +# ______ __ _ __ __ +# / ____/________ _____ / /_ (_)________ _/ / _________ / /___ __________ +# / / __/ ___/ __ `/ __ \/ __ \/ / ___/ __ `/ / / ___/ __ \/ / __ \/ ___/ ___/ +# / /_/ / / / /_/ / /_/ / / / / / /__/ /_/ / / / /__/ /_/ / / /_/ / / (__ ) +# \____/_/ \__,_/ .___/_/ /_/_/\___/\__,_/_/ \___/\____/_/\____/_/ /____/ +# /_/ + +# =========================== +# PyVista integration snippet +# =========================== + +# from energyml.utils.data.helper import ( +# read_graphical_rendering_info, read_property +# ) + +# # 1. Load objects +# gis = workspace.get_object(gis_uri) +# prop = workspace.get_object(prop_uri) +# prop_uuid = get_obj_uuid(prop) + +# # 2. Extract rendering info +# info = read_graphical_rendering_info(gis, prop_uuid, workspace) + +# # 3. Read scalar values +# scalars = read_array(prop.values_for_patch[0], root_obj=prop, workspace=workspace) + +# # 4. 
Build PyVista LUT +# import pyvista as pv +# if info and info.color_map: +# lut = pv.LookupTable() +# lut.values = info.color_map.to_vtk_lut() # (256,4) RGBA +# if info.color_min_max: +# lut.scalar_range = info.color_min_max +# mesh.plot(scalars=scalars, cmap=lut) +# elif info and info.constant_color: +# c = info.constant_color +# mesh.plot(color=(c.r, c.g, c.b), opacity=c.a) +# HsvColor: hue [0,360], saturation [0,1], value [0,1], alpha [0,1], title +# MinMax: minimum: float, maximum: float + +import colorsys +from dataclasses import dataclass, field as dc_field + + +# ───────────────────────────────────────────────────────────────────────────── +# Unified output data structures +# ───────────────────────────────────────────────────────────────────────────── + + +@dataclass +class RgbaColor: + """RGBA colour with channels in [0.0, 1.0].""" + + r: float + g: float + b: float + a: float = 1.0 + + def to_uint8(self) -> Tuple[int, int, int, int]: + """Return (R, G, B, A) in [0, 255] - ready for VTK / PyVista.""" + return ( + int(round(self.r * 255)), + int(round(self.g * 255)), + int(round(self.b * 255)), + int(round(self.a * 255)), + ) + + @staticmethod + def from_hsv(hsv_obj: Any) -> "RgbaColor": + """Convert a RESQML ``HsvColor`` to :class:`RgbaColor`.""" + h = (hsv_obj.hue or 0.0) / 360.0 # RESQML hue is [0, 360] + s = hsv_obj.saturation or 0.0 + v = hsv_obj.value or 0.0 + a = hsv_obj.alpha if hsv_obj.alpha is not None else 1.0 + r, g, b = colorsys.hsv_to_rgb(h, s, v) + return RgbaColor(r, g, b, a) + + @staticmethod + def random() -> "RgbaColor": + """Generate a random RGBA color (for testing).""" + import random + + return RgbaColor( + r=random.random(), + g=random.random(), + b=random.random(), + a=1.0, + ) + + @staticmethod + def random_from_uuid(uuid_str: str) -> "RgbaColor": + """Generate a random RGBA color based on a UUID string (for consistent testing).""" + import random + import hashlib + + # Create a hash of the UUID string to seed the random generator 
+ hash_bytes = hashlib.sha256(uuid_str.encode()).digest() + seed = int.from_bytes(hash_bytes, 'big') + random.seed(seed) + + return RgbaColor( + r=random.random(), + g=random.random(), + b=random.random(), + a=1.0, + ) + + +@dataclass +class ColorMapEntry: + """One control point: a scalar index mapped to an RGBA colour.""" + + index: float # float for both continuous and discrete (int index cast to float) + color: RgbaColor + + +@dataclass +class ColorMapInfo: + """ + Unified representation of a RESQML color map, directly usable by PyVista/VTK. + + Covers both :class:`ContinuousColorMap` and :class:`DiscreteColorMap`. + + PyVista usage example:: + + info = read_color_map(my_continuous_color_map) + lut = pv.LookupTable() + lut.values = info.to_vtk_lut() # (256, 4) uint8 RGBA array + lut.scalar_range = (info.entries[0].index, info.entries[-1].index) + mesh.plot(scalars="my_property", cmap=lut) + """ + + is_continuous: bool + entries: List[ColorMapEntry] # sorted by ascending index + null_color: Optional[RgbaColor] = None + above_max_color: Optional[RgbaColor] = None + below_min_color: Optional[RgbaColor] = None + + def to_vtk_lut(self, n_colors: int = 256) -> np.ndarray: + """ + Return an ``(N, 4)`` uint8 RGBA array for use as a PyVista / VTK LUT. + + - For **continuous** maps: linearly interpolates the control-point + HSV colors over *n_colors* levels. + - For **discrete** maps: returns one row per entry (``n_colors`` + is ignored) so each integer index gets an exact color. + + :param n_colors: Number of samples for continuous maps (default 256). + :return: ``np.ndarray`` of shape ``(N, 4)``, dtype ``uint8``. + """ + if not self.entries: + return np.zeros((1, 4), dtype=np.uint8) + + sorted_entries = sorted(self.entries, key=lambda e: e.index) + + if not self.is_continuous: + # One exact row per integer entry - no interpolation needed. 
+ return np.array( + [e.color.to_uint8() for e in sorted_entries], dtype=np.uint8 + ) + + # Continuous: sample n_colors levels with linear interpolation in RGBA. + indices = np.array([e.index for e in sorted_entries], dtype=np.float64) + float_colors = np.array( + [[e.color.r, e.color.g, e.color.b, e.color.a] for e in sorted_entries], + dtype=np.float64, + ) + t = np.linspace(indices[0], indices[-1], n_colors) + result = np.zeros((n_colors, 4), dtype=np.uint8) + for ch in range(4): + result[:, ch] = np.clip( + np.interp(t, indices, float_colors[:, ch]) * 255, 0, 255 + ).round().astype(np.uint8) + return result + + def scalar_range(self) -> Tuple[float, float]: + """Return ``(min_index, max_index)`` of the stored entries.""" + if not self.entries: + return (0.0, 1.0) + indices = [e.index for e in self.entries] + return (min(indices), max(indices)) + + +@dataclass +class ScalarRenderingInfo: + """ + All graphical rendering parameters needed to display a RESQML property or + representation in a 3D viewer (PyVista, VTK, etc.). + + Produced by :func:`read_graphical_rendering_info`. 
+ + Typical PyVista workflow:: + + info = read_graphical_rendering_info(gis, prop_uuid, workspace) + scalars = read_property(prop, workspace) # np.ndarray + if info and info.color_map: + lut = pv.LookupTable() + lut.values = info.color_map.to_vtk_lut() + if info.color_min_max: + lut.scalar_range = info.color_min_max + mesh.plot(scalars=scalars, cmap=lut) + """ + + target_obj_uuid: str + + # ── Colour mapping (from ColorInformation → ColorMap) ──────────────────── + color_map: Optional[ColorMapInfo] = None + color_min_max: Optional[Tuple[float, float]] = None # clamp range for the LUT + color_use_log: bool = False + color_use_reverse: bool = False + color_value_vector_index: Optional[int] = None # component for vector props + + # ── Alpha / opacity mapping (from AlphaInformation) ────────────────────── + # Piecewise: list of (property_value, opacity [0..1]) control points + alpha_control_points: Optional[List[Tuple[float, float]]] = None + alpha_min_max: Optional[Tuple[float, float]] = None + alpha_use_log: bool = False + alpha_overwrite_color_alpha: bool = False + + # ── Size mapping (from SizeInformation) ────────────────────────────────── + size_min_max: Optional[Tuple[float, float]] = None # (min_size, max_size) + size_use_log: bool = False + size_value_vector_index: Optional[int] = None + + # ── Visibility / constant style (from DefaultGraphicalInformation) ──────── + is_visible: bool = True + constant_color: Optional[RgbaColor] = None + constant_alpha: Optional[float] = None # [0..1] global opacity override + + # ── Contour lines (from ContourLineSetInformation) ──────────────────────── + contour_increment: Optional[float] = None + contour_show_major_every: Optional[int] = None + + +# ───────────────────────────────────────────────────────────────────────────── +# Color-map readers (Group 1 - both return ColorMapInfo) +# ───────────────────────────────────────────────────────────────────────────── + + +def _optional_rgba(hsv_obj: Optional[Any]) -> 
Optional[RgbaColor]: + """Convert an optional ``HsvColor`` to :class:`RgbaColor`, or ``None``.""" + return RgbaColor.from_hsv(hsv_obj) if hsv_obj is not None else None + + +def read_continuous_color_map(color_map_obj: Any) -> ColorMapInfo: + """ + Read a RESQML ``ContinuousColorMap`` into a :class:`ColorMapInfo`. + + **Input**: a ``ContinuousColorMap`` xsdata dataclass instance (e.g. from + ``workspace.get_object(uri)``). + + **Output**: :class:`ColorMapInfo` with ``is_continuous=True`` and entries + sorted ascending by ``index`` (a ``float``). The ``to_vtk_lut()`` method + produces a ``(256, 4)`` uint8 RGBA array directly usable by PyVista. + """ + entries = sorted( + [ + ColorMapEntry(index=float(e.index), color=RgbaColor.from_hsv(e.hsv)) + for e in (color_map_obj.entry or []) + if e.index is not None and e.hsv is not None + ], + key=lambda ce: ce.index, + ) + return ColorMapInfo( + is_continuous=True, + entries=entries, + null_color=_optional_rgba(getattr(color_map_obj, "null_color", None)), + above_max_color=_optional_rgba(getattr(color_map_obj, "above_max_color", None)), + below_min_color=_optional_rgba(getattr(color_map_obj, "below_min_color", None)), + ) + + +def read_discrete_color_map(color_map_obj: Any) -> ColorMapInfo: + """ + Read a RESQML ``DiscreteColorMap`` into a :class:`ColorMapInfo`. + + **Input**: a ``DiscreteColorMap`` xsdata dataclass instance. + + **Output**: :class:`ColorMapInfo` with ``is_continuous=False`` and one + entry per integer code. ``to_vtk_lut()`` returns exactly one RGBA row per + entry - suitable for VTK's categorical lookup table + (``vtkLookupTable.SetAnnotation`` workflow). 
+ """ + entries = sorted( + [ + ColorMapEntry(index=float(e.index), color=RgbaColor.from_hsv(e.hsv)) + for e in (color_map_obj.entry or []) + if e.index is not None and e.hsv is not None + ], + key=lambda ce: ce.index, + ) + return ColorMapInfo( + is_continuous=False, + entries=entries, + null_color=_optional_rgba(getattr(color_map_obj, "null_color", None)), + above_max_color=_optional_rgba(getattr(color_map_obj, "above_max_color", None)), + below_min_color=_optional_rgba(getattr(color_map_obj, "below_min_color", None)), + ) + + +def read_color_map(color_map_obj: Any) -> Optional[ColorMapInfo]: + """ + Dispatch to :func:`read_continuous_color_map` or :func:`read_discrete_color_map` + based on the runtime type of *color_map_obj*. + + :param color_map_obj: Any RESQML color-map object (``ContinuousColorMap`` + or ``DiscreteColorMap`` from any EML/RESQML version). + :return: :class:`ColorMapInfo`, or ``None`` if the type is unrecognised. + """ + type_name = type(color_map_obj).__name__.lower() + if "continuous" in type_name: + return read_continuous_color_map(color_map_obj) + if "discrete" in type_name: + return read_discrete_color_map(color_map_obj) + logging.warning(f"read_color_map: unsupported color-map type '{type(color_map_obj).__name__}'") + return None + + +# ───────────────────────────────────────────────────────────────────────────── +# Main entry point (Group 2) +# ───────────────────────────────────────────────────────────────────────────── + + +def read_graphical_rendering_info( + graphical_information_set: Any, + target_uuid: str, + workspace: Optional[EnergymlStorageInterface] = None, +) -> Optional[ScalarRenderingInfo]: + """ + Extract all rendering parameters for a target object from a + ``GraphicalInformationSet``. + + **Input**: + + - *graphical_information_set*: a RESQML/EML ``GraphicalInformationSet`` + object (from ``workspace.get_object(uri)`` or similar). 
+ - *target_uuid*: the UUID (string) of the property, representation, + feature or interpretation you want to render. + - *workspace*: an :class:`EnergymlStorageInterface` used to resolve the + ``ColorMap`` DOR inside ``ColorInformation``. Pass ``None`` if the + color map is not needed. + + **Output**: :class:`ScalarRenderingInfo`, or ``None`` if the GIS contains + no graphical information targeting *target_uuid*. + + Covers all standard RESQML v2.2 ``AbstractGraphicalInformation`` subtypes: + + +-------------------------------+-----------------------------------+ + | RESQML class | Populated fields | + +===============================+===================================+ + | ``ColorInformation`` | ``color_map``, ``color_min_max``, | + | | ``color_use_log``, ``color_use_`` | + | | ``reverse``, | + | | ``color_value_vector_index`` | + +-------------------------------+-----------------------------------+ + | ``AlphaInformation`` | ``alpha_control_points``, | + | | ``alpha_min_max``, | + | | ``alpha_use_log``, | + | | ``alpha_overwrite_color_alpha`` | + +-------------------------------+-----------------------------------+ + | ``SizeInformation`` | ``size_min_max``, | + | | ``size_use_log``, | + | | ``size_value_vector_index`` | + +-------------------------------+-----------------------------------+ + | ``DefaultGraphicalInform…`` | ``is_visible``, | + | | ``constant_color``, | + | | ``constant_alpha`` | + +-------------------------------+-----------------------------------+ + | ``ContourLineSetInform…`` | ``contour_increment``, | + | | ``contour_show_major_every`` | + +-------------------------------+-----------------------------------+ + """ + + result = ScalarRenderingInfo(target_obj_uuid=target_uuid) + found = False + + gis_infos: List[Any] = getattr(graphical_information_set, "graphical_information", []) or [] + + for info in gis_infos: + # Each AbstractGraphicalInformation targets ≥1 objects via target_object[]. 
+ targets: List[Any] = getattr(info, "target_object", []) or [] + if not any(get_obj_uuid(t) == target_uuid for t in targets): + continue + found = True + + type_name = type(info).__name__ + + if "ColorInformation" in type_name: + result.color_use_log = bool(getattr(info, "use_logarithmic_mapping", False)) + result.color_use_reverse = bool(getattr(info, "use_reverse_mapping", False)) + result.color_value_vector_index = getattr(info, "value_vector_index", None) + mm = getattr(info, "min_max", None) + if mm is not None: + result.color_min_max = (mm.minimum, mm.maximum) + cmap_dor = getattr(info, "color_map", None) + if cmap_dor is not None and workspace is not None: + cmap_obj = workspace.get_object(get_obj_uri(cmap_dor)) + if cmap_obj is None: + candidates = workspace.get_object_by_uuid(get_obj_uuid(cmap_dor)) + cmap_obj = candidates[0] if candidates else None + if cmap_obj is not None: + result.color_map = read_color_map(cmap_obj) + + elif "AlphaInformation" in type_name: + result.alpha_use_log = bool(getattr(info, "use_logarithmic_mapping", False)) + result.alpha_overwrite_color_alpha = bool(getattr(info, "overwrite_color_alpha", False)) + mm = getattr(info, "min_max", None) + if mm is not None: + result.alpha_min_max = (mm.minimum, mm.maximum) + raw_indices = getattr(info, "index", []) or [] + raw_alphas = getattr(info, "alpha", []) or [] + if raw_indices and raw_alphas: + try: + result.alpha_control_points = [ + (float(idx), float(a)) + for idx, a in zip(raw_indices, raw_alphas) + ] + except (TypeError, ValueError) as exc: + logging.warning(f"read_graphical_rendering_info: cannot parse AlphaInformation indices: {exc}") + + elif "SizeInformation" in type_name: + result.size_use_log = bool(getattr(info, "use_logarithmic_mapping", False)) + result.size_value_vector_index = getattr(info, "value_vector_index", None) + mm = getattr(info, "min_max", None) + if mm is not None: + result.size_min_max = (mm.minimum, mm.maximum) + + elif "DefaultGraphicalInformation" in 
type_name: + for elem_info in (getattr(info, "indexable_element_info", []) or []): + if (getattr(elem_info, "is_visible", None)) is False: + result.is_visible = False + const_col = getattr(elem_info, "constant_color", None) + if const_col is not None: + result.constant_color = RgbaColor.from_hsv(const_col) + const_alpha = getattr(elem_info, "constant_alpha", None) + if const_alpha is not None: + result.constant_alpha = float(const_alpha) + + elif "ContourLineSetInformation" in type_name: + result.contour_increment = getattr(info, "increment", None) + result.contour_show_major_every = getattr(info, "show_major_line_every", None) + + # AnnotationInformation is intentionally not mapped to ScalarRenderingInfo + # because it drives label text, not colour/size - handle separately if needed. + + return result if found else None \ No newline at end of file diff --git a/energyml-utils/src/energyml/utils/data/mesh.py b/energyml-utils/src/energyml/utils/data/mesh.py index 108da7e..cdb2b04 100644 --- a/energyml-utils/src/energyml/utils/data/mesh.py +++ b/energyml-utils/src/energyml/utils/data/mesh.py @@ -6,6 +6,7 @@ import os import re import sys +import traceback import numpy as np from dataclasses import dataclass, field from enum import Enum @@ -13,14 +14,20 @@ from typing import List, Optional, Any, Callable, Dict, Union, Tuple -from .helper import ( +from energyml.utils.data.helper import ( + apply_crs_transform, + generate_vertical_well_points, + get_crs_offsets_and_angle, + get_datum_information, + get_wellbore_points, + hermite_interpolation, read_array, read_grid2d_patch, get_crs_obj, - get_crs_origin_offset, - is_z_reversed, + read_parametric_geometry, ) -from energyml.utils.epc import gen_energyml_object_path +from energyml.utils.data.crs import extract_crs_info, apply_from_crs_info +from energyml.utils.epc_utils import gen_energyml_object_path from energyml.utils.epc_stream import EpcStreamReader from energyml.utils.exception import NotSupportedError, 
ObjectNotFoundNotError from energyml.utils.introspection import ( @@ -35,7 +42,7 @@ # Import export functions from new export module for backward compatibility -from .export import export_obj as _export_obj_new +from energyml.utils.data.export import export_obj as _export_obj_new _FILE_HEADER: bytes = b"# file exported by energyml-utils python module (Geosiris)\n" @@ -154,27 +161,7 @@ def get_indices(self) -> Union[List[List[int]], np.ndarray]: return self.faces_indices -def crs_displacement(points: List[Point], crs_obj: Any) -> Tuple[List[Point], Point]: - """ - Transform a point list with CRS information (XYZ offset and ZIncreasingDownward) - :param points: in/out : the list is directly modified - :param crs_obj: - :return: The translated points and the crs offset vector. - """ - crs_point_offset = get_crs_origin_offset(crs_obj=crs_obj) - zincreasing_downward = is_z_reversed(crs_obj) - - if crs_point_offset != [0, 0, 0]: - for p in points: - for xyz in range(len(p)): - p[xyz] = (p[xyz] + crs_point_offset[xyz]) if p[xyz] is not None else None - if zincreasing_downward and len(p) >= 3: - p[2] = -p[2] - - return points, crs_point_offset - - -def get_mesh_reader_function(mesh_type_name: str) -> Optional[Callable]: +def get_object_reader_function(mesh_type_name: str) -> Optional[Callable]: """ Returns the name of the potential appropriate function to read an object with type is named mesh_type_name :param mesh_type_name: the initial type name @@ -186,6 +173,11 @@ def get_mesh_reader_function(mesh_type_name: str) -> Optional[Callable]: return None +def get_mesh_reader_function(mesh_type_name: str) -> Optional[Callable]: + """@deprecated use get_object_reader_function instead""" + return get_object_reader_function(mesh_type_name) + + def _mesh_name_mapping(array_type_name: str) -> str: """ Transform the type name to match existing reader function @@ -201,15 +193,15 @@ def _mesh_name_mapping(array_type_name: str) -> str: def read_mesh_object( energyml_object: Any, 
workspace: Optional[EnergymlStorageInterface] = None, - use_crs_displacement: bool = False, + use_crs_displacement: bool = True, sub_indices: Optional[Union[List[int], np.ndarray]] = None, ) -> List[AbstractMesh]: """ Read and "meshable" object. If :param:`energyml_object` is not supported, an exception will be raised. :param energyml_object: :param workspace: - :param use_crs_displacement: If true the :py:function:`crs_displacement ` - is used to translate the data with the CRS offsets + :param use_crs_displacement: If true :func:`apply_from_crs_info` is used to apply the full CRS + transform (rotation, offsets, Z-flip, axis-order swap) to the mesh points :return: """ @@ -217,19 +209,33 @@ def read_mesh_object( return energyml_object array_type_name = _mesh_name_mapping(type(energyml_object).__name__) - reader_func = get_mesh_reader_function(array_type_name) + reader_func = get_object_reader_function(array_type_name) if reader_func is not None: # logging.info(f"using function {reader_func} to read type {array_type_name}") surfaces: List[AbstractMesh] = reader_func( - energyml_object=energyml_object, workspace=workspace, sub_indices=sub_indices + energyml_object=energyml_object, + workspace=workspace, + sub_indices=sub_indices, + use_crs_displacement=use_crs_displacement, ) + _tn = array_type_name.lower() if ( - use_crs_displacement and "wellbore" not in array_type_name.lower() - ): # WellboreFrameRep has allready the displacement applied - # TODO: the displacement should be done in each reader function to manage specific cases + use_crs_displacement + and "wellbore" not in _tn + and "triangulated" not in _tn # per-patch CRS applied inside reader + and "point" not in _tn # per-patch CRS applied inside reader + and "polyline" not in _tn # per-patch CRS applied inside reader + and "representationset" not in _tn # each sub-mesh already had CRS applied by its own reader + and "subrepresentation" not in _tn # delegates entirely to inner read_mesh_object call + ): for s 
in surfaces: - print("CRS : ", s.crs_object.uuid if s.crs_object is not None else "None") - crs_displacement(s.point_list, s.crs_object) + crs = s.crs_object[0] if isinstance(s.crs_object, list) and s.crs_object else s.crs_object + if crs is None: + continue + logging.debug(f"Applying CRS transform to surface {s.identifier}") + pts_arr = np.asarray(s.point_list, dtype=np.float64).reshape(-1, 3) + apply_from_crs_info(pts_arr, extract_crs_info(crs, workspace), inplace=True) + s.point_list = pts_arr.tolist() return surfaces else: # logging.error(f"Type {array_type_name} is not supported: function read_{snake_case(array_type_name)} not found") @@ -241,6 +247,7 @@ def read_mesh_object( def read_ijk_grid_representation( energyml_object: Any, workspace: EnergymlStorageInterface, + use_crs_displacement: bool = True, sub_indices: Optional[Union[List[int], np.ndarray]] = None, ) -> List[Any]: raise NotSupportedError("IJKGrid representation reading is not supported yet.") @@ -249,6 +256,7 @@ def read_ijk_grid_representation( def read_point_representation( energyml_object: Any, workspace: EnergymlStorageInterface, + use_crs_displacement: bool = True, sub_indices: Optional[Union[List[int], np.ndarray]] = None, ) -> List[PointSetMesh]: # pt_geoms = search_attribute_matching_type(point_set, "AbstractGeometry") @@ -257,14 +265,16 @@ def read_point_representation( patch_idx = 0 total_size = 0 - for ( - points_path_in_obj, - points_obj, - ) in search_attribute_matching_name_with_path( + + patches_geom = search_attribute_matching_name_with_path( energyml_object, r"NodePatch.[\d]+.Geometry.Points" ) + search_attribute_matching_name_with_path( # resqml 2.0.1 energyml_object, r"NodePatchGeometry.[\d]+.Points" - ): # resqml 2.2 + ) + # logging.debug(f"Found {len(patches_geom)} patches for point representation") + # logging.debug(f"\t=> {patches_geom}") + + for points_path_in_obj, points_obj in patches_geom: points = read_array( energyml_array=points_obj, root_obj=energyml_object, @@ 
-295,6 +305,13 @@ def read_point_representation( # else: # total_size = total_size + len(points) + # Apply full CRS transform per patch; crs_object kept on mesh for reference + # but the outer dispatcher is guarded to skip crs_displacement for this type. + if use_crs_displacement and crs is not None and points is not None and len(points) > 0: + pts_arr = np.asarray(points, dtype=np.float64).reshape(-1, 3) + apply_from_crs_info(pts_arr, extract_crs_info(crs, workspace), inplace=True) + points = pts_arr.tolist() + if points is not None: meshes.append( PointSetMesh( @@ -313,6 +330,7 @@ def read_point_representation( def read_polyline_representation( energyml_object: Any, workspace: EnergymlStorageInterface, + use_crs_displacement: bool = True, sub_indices: Optional[Union[List[int], np.ndarray]] = None, ) -> List[PolylineSetMesh]: # pt_geoms = search_attribute_matching_type(point_set, "AbstractGeometry") @@ -324,7 +342,18 @@ def read_polyline_representation( for patch_path_in_obj, patch in search_attribute_matching_name_with_path( energyml_object, "NodePatch" ) + search_attribute_matching_name_with_path(energyml_object, r"LinePatch.[\d]+"): - points_path, points_obj = search_attribute_matching_name_with_path(patch, "Geometry.Points")[0] + + pts = search_attribute_matching_name_with_path(patch, "Geometry.Points") + if pts is None or len(pts) == 0: + pts = search_attribute_matching_name_with_path(patch, "Points") + + try: + points_path, points_obj = pts[0] + except Exception as e: + logging.error(f"Cannot find points for patch {patch_path_in_obj} : {e}") + logging.error(patch) + raise e + points = read_array( energyml_array=points_obj, root_obj=energyml_object, @@ -345,10 +374,7 @@ def read_polyline_representation( close_poly = None try: - ( - close_poly_path, - close_poly_obj, - ) = search_attribute_matching_name_with_path( + (close_poly_path, close_poly_obj,) = search_attribute_matching_name_with_path( patch, "ClosedPolylines" )[0] close_poly = read_array( @@ -362,10 
+388,7 @@ def read_polyline_representation( point_indices = [] try: - ( - node_count_per_poly_path_in_obj, - node_count_per_poly, - ) = search_attribute_matching_name_with_path( + (node_count_per_poly_path_in_obj, node_count_per_poly,) = search_attribute_matching_name_with_path( patch, "NodeCountPerPolyline" )[0] node_counts_list = read_array( @@ -401,6 +424,13 @@ def read_polyline_representation( else: total_size = total_size + len(point_indices) + # Apply full CRS transform per patch; crs_object kept on mesh for reference + # but the outer dispatcher is guarded to skip crs_displacement for this type. + if use_crs_displacement and crs is not None and len(points) > 0: + pts_arr = np.asarray(points, dtype=np.float64).reshape(-1, 3) + apply_from_crs_info(pts_arr, extract_crs_info(crs, workspace), inplace=True) + points = pts_arr.tolist() + if len(points) > 0: meshes.append( PolylineSetMesh( @@ -522,7 +552,8 @@ def gen_surface_grid_geometry( def read_grid2d_representation( energyml_object: Any, workspace: Optional[EnergymlStorageInterface] = None, - keep_holes=False, + use_crs_displacement: bool = True, + keep_holes: bool = False, sub_indices: Optional[Union[List[int], np.ndarray]] = None, ) -> List[SurfaceMesh]: # h5_reader = HDF5FileReader() @@ -536,6 +567,8 @@ def read_grid2d_representation( # Resqml 201 for patch_path, patch in search_attribute_matching_name_with_path(energyml_object, "Grid2dPatch"): + logging.debug("Trying to read Grid2d representation with Resqml 2.0.1 schema (Grid2dPatch)") + logging.debug(f" > {get_obj_uri(energyml_object)}Found patch at path {patch_path} with object {patch}") crs = None try: crs = get_crs_obj( @@ -572,6 +605,9 @@ def read_grid2d_representation( # Resqml 22 if hasattr(energyml_object, "geometry"): + logging.debug( + "Trying to read Grid2d representation with Resqml 2.2 schema (geometry attribute on the representation)" + ) crs = None try: crs = get_crs_obj( @@ -614,6 +650,7 @@ def read_grid2d_representation( def 
read_triangulated_set_representation( energyml_object: Any, workspace: EnergymlStorageInterface, + use_crs_displacement: bool = True, sub_indices: Optional[Union[List[int], np.ndarray]] = None, ) -> List[SurfaceMesh]: meshes = [] @@ -621,12 +658,16 @@ def read_triangulated_set_representation( point_offset = 0 patch_idx = 0 total_size = 0 - for patch_path, patch in search_attribute_matching_name_with_path( + + patches = search_attribute_matching_name_with_path( energyml_object, - "\\.*Patch.\\d+", + "\\w*Patch.\\d+", deep_search=False, search_in_sub_obj=False, - ): + ) + # logging.debug(f"Found {len(patches)} patches for triangulated set representation") + + for patch_path, patch in patches: crs = None try: crs = get_crs_obj( @@ -651,6 +692,20 @@ def read_triangulated_set_representation( point_list = point_list + _array + # Apply full CRS transform (rotation + offsets + z-flip + axis-swap) per patch. + # Setting crs_object=None on the resulting mesh prevents the outer + # read_mesh_object dispatcher from calling crs_displacement() a second time. 
+ logging.debug( + f"Applying use_crs_displacement {use_crs_displacement} with crs {crs} on patch {patch_path} with {len(point_list)} points for triangulated set representation {get_obj_uri(energyml_object)}" + ) + if use_crs_displacement and crs is not None and point_list: + logging.debug(f"Original points sample: {point_list[0:5]}") + pts_arr = np.asarray(point_list, dtype=np.float64).reshape(-1, 3) + crs_info = extract_crs_info(crs, workspace) + apply_from_crs_info(pts_arr, crs_info, inplace=True) + logging.debug(f"Transformed points sample: {pts_arr[0:5]}") + point_list = pts_arr.tolist() + triangles_list: List[List[int]] = [] for ( triangles_path, @@ -695,6 +750,7 @@ def read_triangulated_set_representation( def read_wellbore_frame_representation( energyml_object: Any, workspace: EnergymlStorageInterface, + use_crs_displacement: bool = True, sub_indices: Optional[Union[List[int], np.ndarray]] = None, ) -> List[PolylineSetMesh]: """ @@ -705,144 +761,204 @@ def read_wellbore_frame_representation( :param sub_indices: Optional list of indices to filter specific nodes :return: List containing a single PolylineSetMesh representing the wellbore """ + meshes = [] try: # Read measured depths (NodeMd) - md_array = [] + wellbore_frame_mds = None try: node_md_path, node_md_obj = search_attribute_matching_name_with_path(energyml_object, "NodeMd")[0] - md_array = read_array( + wellbore_frame_mds = read_array( energyml_array=node_md_obj, root_obj=energyml_object, path_in_root=node_md_path, workspace=workspace, ) - if not isinstance(md_array, list): - md_array = md_array.tolist() if hasattr(md_array, "tolist") else list(md_array) + # Ensure wellbore_frame_mds is a numpy array for filtering operations + if not isinstance(wellbore_frame_mds, np.ndarray): + wellbore_frame_mds = np.array(wellbore_frame_mds) except (IndexError, AttributeError) as e: logging.warning(f"Could not read NodeMd from wellbore frame: {e}") return meshes - # Get trajectory reference - trajectory_dor = 
search_attribute_matching_name(obj=energyml_object, name_rgx="Trajectory")[0] - trajectory_identifier = get_obj_uri(trajectory_dor) - trajectory_obj = workspace.get_object(trajectory_identifier) - - if trajectory_obj is None: - logging.error(f"Trajectory {trajectory_identifier} not found") - return meshes - - # CRS - crs = None - # Get reference point (wellhead location) - try different attribute paths for different versions - head_x, head_y, head_z = 0.0, 0.0, 0.0 - z_is_up = True # Default assumption + md_min = np.min(wellbore_frame_mds) if len(wellbore_frame_mds) > 0 else 0.0 + md_max = np.max(wellbore_frame_mds) if len(wellbore_frame_mds) > 0 else 0.0 try: - # Try to get MdDatum (RESQML 2.0.1) or MdInterval.Datum (RESQML 2.2+) - md_datum_dor = None - try: - md_datum_dor = search_attribute_matching_name(obj=trajectory_obj, name_rgx=r"MdDatum")[0] - except IndexError: - try: - md_datum_dor = search_attribute_matching_name(obj=trajectory_obj, name_rgx=r"MdInterval.Datum")[0] - except IndexError: - pass - - if md_datum_dor is not None: - md_datum_identifier = get_obj_uri(md_datum_dor) - md_datum_obj = workspace.get_object(md_datum_identifier) - - if md_datum_obj is not None: - # Try to get coordinates from ReferencePointInACrs - try: - head_x = get_object_attribute_rgx(md_datum_obj, r"HorizontalCoordinates.Coordinate1") or 0.0 - head_y = get_object_attribute_rgx(md_datum_obj, r"HorizontalCoordinates.Coordinate2") or 0.0 - head_z = get_object_attribute_rgx(md_datum_obj, "VerticalCoordinate") or 0.0 - - # Get vertical CRS to determine z direction - try: - vcrs_dor = search_attribute_matching_name(obj=md_datum_obj, name_rgx="VerticalCrs")[0] - vcrs_identifier = get_obj_uri(vcrs_dor) - vcrs_obj = workspace.get_object(vcrs_identifier) - - if vcrs_obj is not None: - z_is_up = not is_z_reversed(vcrs_obj) - except (IndexError, AttributeError): - pass - except AttributeError: - pass - # Get CRS from trajectory geometry if available - try: - geometry_paths = 
search_attribute_matching_name_with_path(md_datum_obj, r"VerticalCrs") - if len(geometry_paths) > 0: - crs_dor_path, crs_dor = geometry_paths[0] - crs_identifier = get_obj_uri(crs_dor) - crs = workspace.get_object(crs_identifier) - except Exception as e: - logging.debug(f"Could not get CRS from trajectory: {e}") - except Exception as e: - logging.debug(f"Could not get reference point from trajectory: {e}") + # Only works for RESQML 2.2+ + _md_min = get_object_attribute(energyml_object, "md_interval.md_min") + if _md_min is not None: + md_min = _md_min + _md_max = get_object_attribute(energyml_object, "md_interval.md_max") + if _md_max is not None: + md_max = _md_max + except AttributeError: + # logging.debug( + # "Could not get md_interval.md_min or md_interval.md_max, using NodeMd min/max instead" + # ) + pass - # Build wellbore path points - simple vertical projection from measured depths - # Note: This is a simplified representation. For accurate 3D trajectory, - # you would need to interpolate along the trajectory's control points. 
- points = [] - line_indices = [] + # remove md values from array if outside of md_min/md_max range (can happen if md_interval is used and NodeMd contains values outside of the interval) + wellbore_frame_mds = wellbore_frame_mds[(wellbore_frame_mds >= md_min) & (wellbore_frame_mds <= md_max)] - for i, md in enumerate(md_array): - # Create point at (head_x, head_y, head_z +/- md) - # Apply z direction based on CRS - z_offset = md if z_is_up else -md - points.append([head_x, head_y, head_z + z_offset]) + # Get trajectory reference + trajectory_dor = search_attribute_matching_name(obj=energyml_object, name_rgx="Trajectory")[0] + trajectory_obj = workspace.get_object(get_obj_uri(trajectory_dor)) - # Connect consecutive points - if i > 0: - line_indices.append([i - 1, i]) + # print(f"Mds {wellbore_frame_mds}") - # Apply sub_indices filter if provided - if sub_indices is not None and len(sub_indices) > 0: - filtered_points = [] - filtered_indices = [] - index_map = {} + meshes = read_wellbore_trajectory_representation( + energyml_object=trajectory_obj, + workspace=workspace, + use_crs_displacement=use_crs_displacement, + sub_indices=sub_indices, + wellbore_frame_mds=wellbore_frame_mds, + ) + for mesh in meshes: + mesh.identifier = f"{get_obj_uri(energyml_object)}" + return meshes + except Exception as e: + logging.error(f"Failed to read wellbore frame representation: {e}") + import traceback - for new_idx, old_idx in enumerate(sub_indices): - if 0 <= old_idx < len(points): - filtered_points.append(points[old_idx]) - index_map[old_idx] = new_idx + traceback.print_exc() - for line in line_indices: - if line[0] in index_map and line[1] in index_map: - filtered_indices.append([index_map[line[0]], index_map[line[1]]]) + return meshes - points = filtered_points - line_indices = filtered_indices - if len(points) > 0: - meshes.append( - PolylineSetMesh( - identifier=f"{get_obj_uri(energyml_object)}_wellbore", - energyml_object=energyml_object, - crs_object=crs, - 
point_list=points, - line_indices=line_indices, - ) +def read_wellbore_trajectory_representation( + energyml_object: Any, + workspace: EnergymlStorageInterface, + use_crs_displacement: bool = True, + sub_indices: Optional[Union[List[int], np.ndarray]] = None, + wellbore_frame_mds: Optional[Union[List[float], np.ndarray]] = None, + step_meter: float = 5.0, +) -> List[PolylineSetMesh]: + if energyml_object is None: + return [] + + if isinstance(energyml_object, list): + return [ + mesh + for obj in energyml_object + for mesh in read_wellbore_trajectory_representation( + obj, workspace, use_crs_displacement, sub_indices, wellbore_frame_mds, step_meter ) + ] + + # CRS + crs = None + head_x, head_y, head_z, z_increasing_downward, projected_epsg_code, vertical_epsg_code = ( + 0.0, + 0.0, + 0.0, + False, + None, + None, + ) + # Get CRS from trajectory geometry if available + try: + crs_attr = get_object_attribute(energyml_object, "geometry.LocalCrs") + if crs_attr is not None: + crs = workspace.get_object(get_obj_uri(crs_attr)) + else: + raise ObjectNotFoundNotError("LocalCrs attribute not found in trajectory geometry") + except Exception: + logging.debug("Could not get CRS from trajectory geometry") + + # ========== + # MD Datum + # ========== + try: + # Try to get MdDatum (RESQML 2.0.1) or MdInterval.Datum (RESQML 2.2+) + md_datum_dor = None + try: + md_datum_dor = search_attribute_matching_name(obj=energyml_object, name_rgx=r"MdDatum")[0] + except IndexError: + try: + md_datum_dor = search_attribute_matching_name(obj=energyml_object, name_rgx=r"MdInterval.Datum")[0] + except IndexError: + pass + + if md_datum_dor is not None: + md_datum_identifier = get_obj_uri(md_datum_dor) + md_datum_obj = workspace.get_object(md_datum_identifier) + + if md_datum_obj is not None: + ( + head_x, + head_y, + head_z, + z_increasing_downward, + projected_epsg_code, + vertical_epsg_code, + crs, + ) = get_datum_information(md_datum_obj, workspace) + # if crs is None: + # crs = get_crs_obj( 
+ # context_obj=md_datum_obj, + # path_in_root=".", + # root_obj=energyml_object, + # workspace=workspace, + # ) except Exception as e: - logging.error(f"Failed to read wellbore frame representation: {e}") - import traceback + logging.debug(f"Could not get reference point / Datum from trajectory: {e}") - traceback.print_exc() + # ========== + well_points = None + logging.debug( + f"wellbore mds : {wellbore_frame_mds}\n\tCRs : {crs}\n\thead x,y,z : {head_x}, {head_y}, {head_z}\n\tz increasing downward : {z_increasing_downward}" + ) + try: + crs_info = extract_crs_info(crs, workspace) + # Try to read parametric Geometry from the trajectory. + traj_mds, traj_points, traj_tangents = read_parametric_geometry( + getattr(energyml_object, "geometry", None), workspace + ) + well_points = get_wellbore_points(wellbore_frame_mds, traj_mds, traj_points, traj_tangents, step_meter) + if use_crs_displacement: + well_points = apply_from_crs_info( + np.asarray(well_points, dtype=np.float64), + crs_info, + ) + except Exception as e: + if wellbore_frame_mds is not None: + logging.debug(f"Could not read parametric geometry from trajectory. 
Well is interpreted as vertical: {e}") + well_points = generate_vertical_well_points( + head_x=head_x, + head_y=head_y, + head_z=head_z, + wellbore_mds=wellbore_frame_mds, + z_increasing_downward=z_increasing_downward, + ) + else: + traceback.print_exc() + raise ValueError( + "Cannot read wellbore trajectory representation: no parametric geometry and no measured depth information available to generate points" + ) + meshes = [] + if well_points is not None and len(well_points) > 0: + + meshes.append( + PolylineSetMesh( + identifier=f"{get_obj_uri(energyml_object)}", + energyml_object=energyml_object, + crs_object=crs, + point_list=well_points, + line_indices=[[i, i + 1] for i in range(len(well_points) - 1)], + ) + ) return meshes def read_sub_representation( energyml_object: Any, workspace: EnergymlStorageInterface, + use_crs_displacement: bool = True, sub_indices: Optional[Union[List[int], np.ndarray]] = None, ) -> List[AbstractMesh]: supporting_rep_dor = search_attribute_matching_name( @@ -887,6 +1003,7 @@ def read_sub_representation( meshes = read_mesh_object( energyml_object=supporting_rep, workspace=workspace, + use_crs_displacement=use_crs_displacement, sub_indices=all_indices, ) @@ -896,7 +1013,343 @@ def read_sub_representation( return meshes -# MESH FILES +def read_representation_set_representation( + energyml_object: Any, + workspace: EnergymlStorageInterface, + use_crs_displacement: bool = True, + sub_indices: Optional[Union[List[int], np.ndarray]] = None, +) -> List[AbstractMesh]: + + repr_list = get_object_attribute(energyml_object, "representation") + if repr_list is None or not isinstance(repr_list, list): + logging.error( + f"RepresentationSetRepresentation {get_obj_uri(energyml_object)} has no 'representation' list attribute" + ) + return [] + + meshes = [] + for repr_dor in repr_list: + rpr_uri = get_obj_uri(repr_dor) + repr_obj = workspace.get_object(rpr_uri) + if repr_obj is None: + logging.error(f"Representation {rpr_uri} in 
RepresentationSetRepresentation not found") + continue + meshes.extend( + read_mesh_object(energyml_object=repr_obj, workspace=workspace, use_crs_displacement=use_crs_displacement) + ) + + return meshes + + +def read_property( + energyml_object: Any, + workspace: EnergymlStorageInterface, +) -> np.ndarray: + """ + Read a property or column-based table from an Energyml object. + + Dispatches to the appropriate reader function based on the object's type name. + If no specific reader is found, raises a NotSupportedError. + + Args: + energyml_object: The Energyml object to read from. + workspace: The storage interface for accessing related objects. + + Returns: + np.ndarray: The read property or table data. + + Raises: + NotSupportedError: If the object type is not supported. + """ + property_type = type(energyml_object).__name__ + reader_func = get_object_reader_function(property_type) + if reader_func is not None: + return reader_func(energyml_object=energyml_object, workspace=workspace) + else: + # logging.error(f"Type {array_type_name} is not supported: function read_{snake_case(array_type_name)} not found") + raise NotSupportedError( + f"Type {property_type} is not supported\n\tfunction read_{snake_case(property_type)} not found" + ) + + +def read_property_interpreted_with_cbt( + energyml_object: Any, + workspace: EnergymlStorageInterface, + _cache_property_arrays: Optional[np.ndarray] = None, + _return_none_if_no_category_lookup: bool = False, +) -> Optional[np.ndarray]: + """ + Read a property with category lookup interpretation. + + Reads property arrays and applies category lookup mapping if available. + Supports both array and dictionary-based category lookups. + + Args: + energyml_object: The Energyml property object. + workspace: The storage interface for accessing related objects. + _cache_property_arrays: Optional cached property arrays to avoid re-reading. + _return_none_if_no_category_lookup: If True, return None when no category lookup is found. 
+ + Returns: + Optional[np.ndarray]: The interpreted property values, or None if no lookup and flag is set. + """ + + result = None + + prop_arrays = ( + read_property(energyml_object, workspace) if _cache_property_arrays is None else _cache_property_arrays + ) + + category_lookup_dor = get_object_attribute(energyml_object, "category_lookup") + if category_lookup_dor is not None: + category_lookup_obj = workspace.get_object(get_obj_uri(category_lookup_dor)) + if category_lookup_obj is not None: + category_lookup_data = read_column_based_table(category_lookup_obj, workspace) + + # print(f"category_lookup_array : {category_lookup_data}") + if isinstance(category_lookup_data, list): + category_lookup_data = np.array(category_lookup_data) + if isinstance(category_lookup_data, np.ndarray): + # map props values to category lookup values using prop value as index in category lookup array + result = ( + np.array( + [ + ( + category_lookup_data[prop] + if prop is not None and prop < len(category_lookup_data) + else None + ) + for prop in prop_arrays + ] + ) + if prop_arrays is not None + else None + ) + elif isinstance(category_lookup_data, dict): + # Transpose so that each index corresponds to a category (column), not a row + category_lookup_matrice = np.array(list(category_lookup_data.values())).T + # logging.debug(f"category_lookup_matrice : {category_lookup_matrice}") + # return a matrice with the same shape as prop_arrays but with the values from the category lookup array using the prop value as key in the category lookup array + result = ( + np.array( + [ + [ + ( + category_lookup_matrice[prop].tolist() + if prop is not None and 0 <= prop < len(category_lookup_matrice) + else None + ) + for prop in prop_row + ] + for prop_row in prop_arrays + ] + ) + if prop_arrays is not None + else None + ) + else: + raise NotSupportedError( + f"Category lookup array type {type(category_lookup_matrice)} is not supported, expected list or dict" + ) + + return prop_arrays if result is 
None and not _return_none_if_no_category_lookup else result + + +def read_abstract_values_property( + energyml_object: Any, + workspace: EnergymlStorageInterface, +) -> np.ndarray: + """ + Read abstract values property from patches. + + Extracts and concatenates arrays from all 'values_for_patch' attributes. + + Args: + energyml_object: The Energyml object containing the property. + workspace: The storage interface for accessing arrays. + + Returns: + np.ndarray: The concatenated array of property values. + """ + arrays = [] + for values_for_patch in search_attribute_matching_name_with_path(energyml_object, "values_for_patch"): + array = read_array( + energyml_array=values_for_patch[1], + root_obj=energyml_object, + path_in_root=".", + workspace=workspace, + ) + if isinstance(array, list): + array = np.array(array) + arrays.append(array) + if len(arrays) == 1: + return arrays[0] + else: + return np.concatenate(arrays) + + +def read_discrete_property( + energyml_object: Any, + workspace: EnergymlStorageInterface, +) -> np.ndarray: + """ + Read a discrete property. + + Delegates to read_abstract_values_property for implementation. + + Args: + energyml_object: The discrete property object. + workspace: The storage interface. + + Returns: + np.ndarray: The property values. + """ + + return read_abstract_values_property(energyml_object, workspace) + + +def read_continuous_property( + energyml_object: Any, + workspace: EnergymlStorageInterface, +) -> np.ndarray: + """ + Read a continuous property. + + Delegates to read_abstract_values_property for implementation. + + Args: + energyml_object: The continuous property object. + workspace: The storage interface. + + Returns: + np.ndarray: The property values. + """ + + return read_abstract_values_property(energyml_object, workspace) + + +def read_categorical_property( + energyml_object: Any, + workspace: EnergymlStorageInterface, +) -> np.ndarray: + """ + Read a categorical property. 
+ + Note: Categorical values are returned as integers. Use the property's + 'code_list' attribute to map to string values. + + Args: + energyml_object: The categorical property object. + workspace: The storage interface. + + Returns: + np.ndarray: The integer-coded property values. + """ + # TODO: the categorical values should be converted to strings using the code list of the property, but for now we keep the integer values and let the user manage the conversion if needed. + logging.warning( + "CategoricalProperty is read as a continuous property, the categorical values are not converted to strings but kept as integers. Use the 'code_list' attribute of the property to get the list of possible string values corresponding to the integer values in the array" + ) + return read_abstract_values_property(energyml_object, workspace) + + +def read_comment_property( + energyml_object: Any, + workspace: EnergymlStorageInterface, +) -> np.ndarray: + """ + Read a comment property. + + Delegates to read_abstract_values_property for implementation. + + Args: + energyml_object: The comment property object. + workspace: The storage interface. + + Returns: + np.ndarray: The comment values. + """ + return read_abstract_values_property(energyml_object, workspace) + + +def read_column_based_table( + energyml_object: Any, + workspace: EnergymlStorageInterface, +) -> Dict[str, np.ndarray]: + """ + Read a column-based table. + + Extracts column data into a dictionary keyed by column titles. + + Args: + energyml_object: The table object with 'column' attributes. + workspace: The storage interface for accessing arrays. + + Returns: + Dict[str, np.ndarray]: Dictionary of column names to arrays. 
+ """ + columns = {} + for column in get_object_attribute(energyml_object, "column"): + column_name = getattr(column, "title", "_") + # print(f"Reading column: {column_name} : {column}") + # print(f"getattr(column_array, 'values', None): {getattr(column, 'values', None)}") + array = read_array( + energyml_array=getattr(column, "values", None), + root_obj=energyml_object, + path_in_root=".", + workspace=workspace, + ) + if isinstance(array, list): + array = np.array(array) + columns[column_name] = array + return columns + + +def read_time_series( + energyml_object: Any, + workspace: EnergymlStorageInterface, +) -> List[Tuple[str, int]]: + """ + Read a time series from an Energyml object. + + Extracts date-time values and time step indices, constructing a normalized + list of (step_index, datetime) tuples for each time step. + + Args: + energyml_object: The Energyml time series object. + workspace: The storage interface for accessing related objects. + + Returns: + List[Tuple[str, int]]: List of tuples containing (step_index, datetime_string). + """ + + # 1. Extraction des DateTime + times_iso = search_attribute_matching_name(energyml_object, "date_time") + + # 2. Extraction des TimeSteps (v2.2+) + steps_indices = [] + time_step_obj = get_object_attribute(energyml_object, "time_step") + if time_step_obj is not None: + steps_indices = read_array(time_step_obj, energyml_object, ".", workspace, sub_indices=None) + else: + # Fallback : on utilise l'index de la liste + steps_indices = list(range(len(times_iso))) + + # 3. 
Construction de la structure normalisée + steps_data = [] + for i in range(len(times_iso)): + steps_data.append( + (steps_indices[i], times_iso[i]) + # {"index": i, "datetime": times_iso[i], "step_val": steps_indices[i]} # L'index utilisé par les propriétés + ) + + return steps_data + + +# __ ______________ __ __ _____ __ ____ __ +# / |/ / ____/ ___// / / / / __(_) /__ _____ / __/___ _________ ___ ____ _/ /_ +# / /|_/ / __/ \__ \/ /_/ / / /_/ / / _ \/ ___/ / /_/ __ \/ ___/ __ `__ \/ __ `/ __/ +# / / / / /___ ___/ / __ / / __/ / / __(__ ) / __/ /_/ / / / / / / / / /_/ / /_ +# /_/ /_/_____//____/_/ /_/ /_/ /_/_/\___/____/ /_/ \____/_/ /_/ /_/ /_/\__,_/\__/ def _recompute_min_max( diff --git a/energyml-utils/src/energyml/utils/data/mesh_numpy.py b/energyml-utils/src/energyml/utils/data/mesh_numpy.py new file mode 100644 index 0000000..bba272c --- /dev/null +++ b/energyml-utils/src/energyml/utils/data/mesh_numpy.py @@ -0,0 +1,2091 @@ +# Copyright (c) 2023-2024 Geosiris. +# SPDX-License-Identifier: Apache-2.0 +"""Optimised, zero-copy-first EPC/HDF5 3-D object reader. + +This module is a high-performance companion to :mod:`mesh.py`. It keeps the +same ``read_(energyml_object, workspace)`` dispatcher philosophy but +always returns :class:`NumpyMultiMesh` containers whose geometry arrays are +:class:`numpy.ndarray` objects (never plain Python lists). + +Design goals +------------ +* **No list conversion** - no ``.tolist()`` calls anywhere. Arrays stay as + numpy throughout. +* **Best-effort zero-copy** - geometry is read via + :meth:`EnergymlStorageInterface.read_array_view`. For contiguous, + uncompressed HDF5 datasets this returns a numpy view backed directly by the + memory-mapped file buffer (no RAM copy). Chunked / compressed datasets fall + back silently to a copy. 
+* **PyVista-ready connectivity** - ``faces`` / ``lines`` / ``cells`` arrays + use the VTK flat-count-prefixed format consumed directly by + ``pyvista.PolyData`` and ``pyvista.UnstructuredGrid`` without additional + allocation. +* **Patch-level control** - every representation is returned as a + :class:`NumpyMultiMesh` container. Each RESQML patch becomes a separate + :class:`NumpyMesh` entry in ``NumpyMultiMesh.patches``, carrying + ``patch_index``, ``patch_label``, ``source_uuid``, and ``source_type`` + metadata. ``RepresentationSetRepresentation`` members are stored as nested + ``NumpyMultiMesh.children`` so visibility can be toggled per-child in + PyVista ``MultiBlock`` viewers. +* **Backward compatible** - :mod:`mesh.py` is untouched; both modules can be + used side by side. + +Usage +----- +>>> from energyml.utils.epc import Epc +>>> from energyml.utils.data.mesh_numpy import read_numpy_mesh_object, numpy_multi_mesh_to_pyvista +>>> epc = Epc.read_file("my_model.epc") +>>> obj = epc.get_object_by_uuid("...")[0] +>>> multi = read_numpy_mesh_object(obj, workspace=epc, use_crs_displacement=True) +>>> block = numpy_multi_mesh_to_pyvista(multi) # pyvista.MultiBlock +>>> block.plot() +""" +from __future__ import annotations + +import inspect +import logging +import re +import sys +import traceback +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np + +from energyml.utils.data.helper import ( + apply_crs_transform, + generate_vertical_well_points, + get_crs_offsets_and_angle, + get_crs_obj, + get_crs_origin_offset, + get_datum_information, + is_z_reversed, + read_array, + read_grid2d_patch, + read_parametric_geometry, + get_wellbore_points, +) +from energyml.utils.data.crs import extract_crs_info, apply_from_crs_info +from energyml.utils.exception import NotSupportedError, ObjectNotFoundNotError +from energyml.utils.introspection import ( + get_obj_uri, + get_obj_uuid, + 
get_object_attribute, + search_attribute_matching_name, + search_attribute_matching_name_with_path, + snake_case, +) +from energyml.utils.storage_interface import EnergymlStorageInterface + +# --------------------------------------------------------------------------- +# Internal helper: thin proxy that makes read_array_view look like read_array +# so that helper.read_array benefits from zero-copy semantics transparently. +# --------------------------------------------------------------------------- + + +class _ViewWorkspace: + """Transparent proxy that routes ``read_array`` → ``read_array_view``. + + ``helper.read_array`` internally calls ``workspace.read_array``. By + wrapping the real workspace with this proxy we redirect those calls to + :meth:`read_array_view` without touching ``helper.py``. All other + attribute accesses are forwarded as-is. + """ + + __slots__ = ("_ws",) + + def __init__(self, ws: EnergymlStorageInterface) -> None: + self._ws = ws + + def __getattr__(self, name: str) -> Any: + return getattr(self._ws, name) + + def read_array( # noqa: D102 - mirrors EnergymlStorageInterface + self, + proxy: Any, + path_in_external: str, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + external_uri: Optional[str] = None, + ) -> Optional[np.ndarray]: + return self._ws.read_array_view(proxy, path_in_external, start_indices, counts, external_uri) + + +def _view_workspace(workspace: Optional[EnergymlStorageInterface]) -> Optional[Any]: + """Wrap *workspace* in ``_ViewWorkspace`` when available, else return as-is.""" + if workspace is None: + return None + if isinstance(workspace, _ViewWorkspace): + return workspace + return _ViewWorkspace(workspace) + + +# --------------------------------------------------------------------------- +# Dataclass hierarchy +# --------------------------------------------------------------------------- + + +@dataclass +class NumpyMesh: + """Base class for all numpy-backed mesh objects. 
+ + Subclasses guarantee: + * ``points`` - shape ``(N, 3)``, dtype ``float64`` + * Connectivity arrays - dtype ``int64``, VTK flat format + """ + + energyml_object: Any = field(default=None) + crs_object: Any = field(default=None) + identifier: str = field(default="") + #: Points array, shape (N, 3), dtype float64. May be a numpy view. + points: np.ndarray = field(default_factory=lambda: np.empty((0, 3), dtype=np.float64)) + #: Index of this patch within the source representation (0-based). + patch_index: Optional[int] = field(default=None) + #: Human-readable label for this patch. + patch_label: Optional[str] = field(default=None) + #: UUID of the source RESQML object that produced this patch. + source_uuid: Optional[str] = field(default=None) + #: Python class name of the source RESQML object. + source_type: Optional[str] = field(default=None) + #: Optional named arrays attached to this mesh (e.g. ``node_time_values``). + extra_arrays: Dict[str, np.ndarray] = field(default_factory=dict) + + def to_pyvista(self) -> Any: # return type: pv.DataSet + """Convert to a PyVista dataset. Requires ``pyvista`` to be installed.""" + return numpy_mesh_to_pyvista(self) + + +@dataclass +class NumpyPointSetMesh(NumpyMesh): + """A cloud of unconnected points.""" + + +@dataclass +class NumpyPolylineMesh(NumpyMesh): + """A set of poly-lines. + + ``lines`` uses the VTK flat format: + ``[n0, i0, i1, …, n1, j0, j1, …]`` where *n* is the vertex count of that + line. Can be passed directly to ``pyvista.PolyData(points, lines=lines)``. + """ + + lines: np.ndarray = field(default_factory=lambda: np.empty(0, dtype=np.int64)) + + +@dataclass +class NumpySurfaceMesh(NumpyMesh): + """A triangulated or quad surface. + + ``faces`` uses the VTK flat format: + ``[nv0, v0, v1, v2, nv1, v0, v1, v2, …]``. Can be passed directly to + ``pyvista.PolyData(points, faces=faces)``. 
+ """ + + faces: np.ndarray = field(default_factory=lambda: np.empty(0, dtype=np.int64)) + + +@dataclass +class NumpyVolumeMesh(NumpyMesh): + """A volumetric mesh (hexahedral, polyhedral, …). + + ``cells`` - VTK flat format, ``cell_types`` - uint8 VTK cell-type codes. + ``pyvista.UnstructuredGrid(cells, cell_types, points)`` accepts them + directly. + """ + + cells: np.ndarray = field(default_factory=lambda: np.empty(0, dtype=np.int64)) + cell_types: np.ndarray = field(default_factory=lambda: np.empty(0, dtype=np.uint8)) + + +@dataclass +class NumpyMultiMesh: + """Container for one or more :class:`NumpyMesh` patches from a single + energyml representation, plus optional nested child containers for + ``RepresentationSetRepresentation``. + + Hierarchy + --------- + * **patches** — flat list of :class:`NumpyMesh` subclass instances + produced directly by this representation (one per RESQML patch). + * **children** — nested :class:`NumpyMultiMesh` instances; populated only + by :func:`read_numpy_representation_set_representation` (one child per + member representation). + + The design is intentionally shallow: at most 2 levels (container → + patches) except for ``RepresentationSet`` which adds one extra level. + """ + + energyml_object: Any = field(default=None) + identifier: str = field(default="") + #: UUID of the source energyml object. + source_uuid: Optional[str] = field(default=None) + #: Python class name of the source energyml object. + source_type: Optional[str] = field(default=None) + #: Ordered list of patches produced by reading this representation. + patches: List["NumpyMesh"] = field(default_factory=list) + #: Child containers (only for RepresentationSetRepresentation). 
+ children: List["NumpyMultiMesh"] = field(default_factory=list) + + # ------------------------------------------------------------------ + # Convenience helpers + # ------------------------------------------------------------------ + + def patch_count(self) -> int: + """Total number of leaf patches (recursive across children).""" + return len(self.patches) + sum(c.patch_count() for c in self.children) + + def flat_patches(self) -> List["NumpyMesh"]: + """Return all leaf patches in depth-first order.""" + result: List[NumpyMesh] = list(self.patches) + for child in self.children: + result.extend(child.flat_patches()) + return result + + def to_pyvista(self) -> Any: # return type: pv.MultiBlock + """Convert to a PyVista ``MultiBlock``. Requires ``pyvista``.""" + return numpy_multi_mesh_to_pyvista(self) + + +# --------------------------------------------------------------------------- +# CRS displacement (vectorised) +# --------------------------------------------------------------------------- + + +def crs_displacement_np( + points: np.ndarray, + crs_obj: Any, + *, + inplace: bool = True, +) -> np.ndarray: + """Apply CRS origin offset and optional Z-axis inversion to *points*. + + Operates on an ``(N, 3)`` numpy array using broadcast arithmetic — no + Python-level loops. Prefer :func:`apply_from_crs_info` for full CRS + transforms (rotation, axis-order swap, etc.). + + Args: + points: Shape ``(N, 3)``, dtype ``float64``. Modified in-place when + *inplace* is ``True`` (default). + crs_obj: CRS object exposing the same attributes as accepted by + :func:`helper.get_crs_origin_offset` and + :func:`helper.is_z_reversed`. + inplace: When ``False`` a copy is returned and *points* is unchanged. + + Returns: + The (possibly same) array with CRS displacement applied. 
+ """ + if crs_obj is None: + return points + + offset = get_crs_origin_offset(crs_obj=crs_obj) + z_reversed = is_z_reversed(crs_obj) + + if not np.any(offset) and not z_reversed: + return points + + if not inplace: + points = points.copy() + + off = np.asarray(offset, dtype=np.float64) # shape (3,) + points += off # broadcast: (N, 3) + (3,) + if z_reversed: + points[:, 2] *= -1.0 + + return points + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + + +def _ensure_float64_points(arr: Any) -> np.ndarray: + """Convert *arr* to ``(N, 3) float64``. + + Accepts numpy arrays (any shape that contains N*3 elements) or nested + Python lists. Returns a 2-D view/cast when possible, copy only when + dtype conversion is required. + """ + a = np.asarray(arr, dtype=np.float64) + if a.ndim == 1: + a = a.reshape(-1, 3) + elif a.ndim == 2 and a.shape[1] == 2: + # 2-D points (e.g. seismic / plan view) — pad Z column with zeros + a = np.column_stack([a, np.zeros(len(a), dtype=np.float64)]) + elif a.ndim == 2 and a.shape[1] != 3: + raise ValueError(f"Expected (N, 2) or (N, 3) points array, got shape {a.shape}") + return a + + +def _ensure_int64(arr: Any) -> np.ndarray: + """Return *arr* as a flat ``int64`` numpy array.""" + a = np.asarray(arr, dtype=np.int64) + return a.ravel() + + +def _build_vtk_faces_from_triangles(tri: np.ndarray) -> np.ndarray: + """Build VTK flat face array from ``(M, 3)`` triangle index array. + + Result: ``[3, a, b, c, 3, a, b, c, …]``. + """ + m = tri.shape[0] + counts = np.full((m, 1), 3, dtype=np.int64) + return np.concatenate([counts, tri], axis=1).ravel() + + +def _build_vtk_faces_from_quads(quad: np.ndarray) -> np.ndarray: + """Build VTK flat face array from ``(M, 4)`` quad index array. + + Result: ``[4, a, b, c, d, 4, a, b, c, d, …]``. 
+ """ + m = quad.shape[0] + counts = np.full((m, 1), 4, dtype=np.int64) + return np.concatenate([counts, quad], axis=1).ravel() + + +def _build_vtk_lines_from_segments(n_points: int) -> np.ndarray: + """Build VTK flat lines array for a single poly-line of *n_points* nodes. + + Segments: (0,1), (1,2), …, (n-2, n-1). + Result: ``[2, 0, 1, 2, 1, 2, …]``. + """ + if n_points < 2: + return np.empty(0, dtype=np.int64) + idx = np.arange(n_points - 1, dtype=np.int64) + pairs = np.column_stack([idx, idx + 1]) # (n-1, 2) + counts = np.full((n_points - 1, 1), 2, dtype=np.int64) + return np.concatenate([counts, pairs], axis=1).ravel() + + +def _build_vtk_lines_from_node_counts(node_counts: np.ndarray) -> np.ndarray: + """Build VTK flat lines array from per-polyline node counts. + + For each polyline of length *n* we emit ``[n, 0, 1, …, n-1]`` with + indices local to the global point array (starting at the correct offset). + + Returns ``(total_entries,)`` int64 array. + """ + result_parts = [] + offset = 0 + for n in node_counts: + n = int(n) + local = np.arange(offset, offset + n, dtype=np.int64) + part = np.empty(n + 1, dtype=np.int64) + part[0] = n + part[1:] = local + result_parts.append(part) + offset += n + if not result_parts: + return np.empty(0, dtype=np.int64) + return np.concatenate(result_parts) + + +def _read_array_np( + energyml_array: Any, + root_obj: Any, + path_in_root: str, + workspace: Optional[Any], # _ViewWorkspace or EnergymlStorageInterface +) -> np.ndarray: + """Thin wrapper around :func:`helper.read_array` that guarantees ndarray output.""" + result = read_array( + energyml_array=energyml_array, + root_obj=root_obj, + path_in_root=path_in_root, + workspace=workspace, + ) + if result is None: + return np.empty(0) + if isinstance(result, np.ndarray): + return result + return np.asarray(result) + + +def _decode_jagged_array( + jagged: Any, + root_obj: Any, + base_path: str, + workspace: Optional[Any], +) -> List[np.ndarray]: + """Decode a RESQML 
``JaggedArray`` into a list of numpy sub-arrays. + + ``JaggedArray`` stores data as: + * ``Elements`` — flat 1-D array of all values concatenated. + * ``CumulativeLength`` — 1-D array of end-offsets; ``CumulativeLength[i]`` + is the exclusive end index of sub-array *i* in ``Elements``. + + Returns an empty list when either component is missing. + """ + elem_list = search_attribute_matching_name_with_path(jagged, "Elements") + cum_list = search_attribute_matching_name_with_path(jagged, "CumulativeLength") + if not elem_list or not cum_list: + return [] + elem_path, elem_obj = elem_list[0] + cum_path, cum_obj = cum_list[0] + elements = _read_array_np(elem_obj, root_obj, f"{base_path}.{elem_path}", workspace) + cum_len = _read_array_np(cum_obj, root_obj, f"{base_path}.{cum_path}", workspace).astype(np.int64) + result: List[np.ndarray] = [] + prev = 0 + for c in cum_len: + c = int(c) + result.append(elements[prev:c]) + prev = c + return result + + +# --------------------------------------------------------------------------- +# Dispatcher machinery (mirrors mesh.py but prefixed with 'numpy_') +# --------------------------------------------------------------------------- + + +def _numpy_mesh_name_mapping(arr_type_name: str) -> str: + """Normalise the energyml type name to match a ``read_numpy_`` function.""" + arr_type_name = arr_type_name.replace("3D", "3d").replace("2D", "2d") + arr_type_name = re.sub(r"^[Oo]bj([A-Z])", r"\1", arr_type_name) + arr_type_name = re.sub(r"(Polyline|Point)Set", r"\1", arr_type_name) + return arr_type_name + + +def get_numpy_reader_function(mesh_type_name: str) -> Optional[Callable]: + """Return the ``read_numpy_`` function for *mesh_type_name*, or ``None``.""" + target = f"read_numpy_{snake_case(mesh_type_name)}" + for name, obj in inspect.getmembers(sys.modules[__name__]): + if name == target: + return obj + return None + + +# --------------------------------------------------------------------------- +# Representation readers +# 
--------------------------------------------------------------------------- + + +def read_numpy_point_representation( + energyml_object: Any, + workspace: Optional[EnergymlStorageInterface] = None, + use_crs_displacement: bool = True, + sub_indices: Optional[Union[List[int], np.ndarray]] = None, +) -> "NumpyMultiMesh": + """Read a ``PointRepresentation`` / ``PointSetRepresentation``.""" + ws = _view_workspace(workspace) + src_uuid = get_obj_uuid(energyml_object) + src_type = type(energyml_object).__name__ + multi = NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(get_obj_uri(energyml_object)), + source_uuid=src_uuid, + source_type=src_type, + ) + patch_idx = 0 + total_size = 0 + + patches_geom = search_attribute_matching_name_with_path( + energyml_object, r"NodePatch.[\d]+.Geometry.Points" + ) + search_attribute_matching_name_with_path(energyml_object, r"NodePatchGeometry.[\d]+.Points") + + for points_path_in_obj, points_obj in patches_geom: + raw = _read_array_np(points_obj, energyml_object, points_path_in_obj, ws) + points = _ensure_float64_points(raw) # (N,3) + + crs = None + try: + crs = get_crs_obj( + context_obj=points_obj, + path_in_root=points_path_in_obj, + root_obj=energyml_object, + workspace=workspace, + ) + except ObjectNotFoundNotError: + pass + + if sub_indices is not None and len(sub_indices) > 0: + t_idx = np.asarray(sub_indices, dtype=np.int64) - total_size + mask = (t_idx >= 0) & (t_idx < len(points)) + points = points[t_idx[mask]] + total_size += len(points) + + # Apply full CRS transform per patch; crs_object kept for reference, + # outer dispatcher is guarded to skip crs_displacement_np for this type. 
+ if use_crs_displacement and crs is not None and len(points) > 0: + apply_from_crs_info(points, extract_crs_info(crs, workspace), inplace=True) + + label = f"{src_type}_patch_{patch_idx}" + multi.patches.append( + NumpyPointSetMesh( + identifier=label, + energyml_object=energyml_object, + crs_object=crs, + points=points, + patch_index=patch_idx, + patch_label=label, + source_uuid=src_uuid, + source_type=src_type, + ) + ) + patch_idx += 1 + + return multi + + +def read_numpy_polyline_representation( + energyml_object: Any, + workspace: Optional[EnergymlStorageInterface] = None, + use_crs_displacement: bool = True, + sub_indices: Optional[Union[List[int], np.ndarray]] = None, +) -> "NumpyMultiMesh": + """Read a ``PolylineRepresentation`` / ``PolylineSetRepresentation``.""" + ws = _view_workspace(workspace) + src_uuid = get_obj_uuid(energyml_object) + src_type = type(energyml_object).__name__ + multi = NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(get_obj_uri(energyml_object)), + source_uuid=src_uuid, + source_type=src_type, + ) + patch_idx = 0 + total_size = 0 + + for patch_path_in_obj, patch in search_attribute_matching_name_with_path( + energyml_object, "NodePatch" + ) + search_attribute_matching_name_with_path(energyml_object, r"LinePatch.[\d]+"): + # --- Points --- + pts_list = search_attribute_matching_name_with_path(patch, "Geometry.Points") + if not pts_list: + pts_list = search_attribute_matching_name_with_path(patch, "Points") + if not pts_list: + logging.error(f"Cannot find points for patch {patch_path_in_obj}") + continue + + points_path, points_obj = pts_list[0] + raw_pts = _read_array_np(points_obj, energyml_object, patch_path_in_obj + "." + points_path, ws) + points = _ensure_float64_points(raw_pts) # (N, 3) + + crs = None + try: + crs = get_crs_obj( + context_obj=points_obj, + path_in_root=patch_path_in_obj + "." 
+ points_path, + root_obj=energyml_object, + workspace=workspace, + ) + except ObjectNotFoundNotError: + pass + + # --- Closed polylines flag (optional) --- + close_poly: Optional[np.ndarray] = None + try: + cp_path, cp_obj = search_attribute_matching_name_with_path(patch, "ClosedPolylines")[0] + close_poly = _read_array_np(cp_obj, energyml_object, patch_path_in_obj + "." + cp_path, ws) + except IndexError: + pass + + # --- Node counts per polyline --- + # nc_arr holds the *original* counts (before closing); used both for + # VTK-array construction and for sub_indices filtering below. + nc_arr: Optional[np.ndarray] = None + lines: np.ndarray + try: + nc_path, nc_obj = search_attribute_matching_name_with_path(patch, "NodeCountPerPolyline")[0] + nc_arr = _read_array_np(nc_obj, energyml_object, patch_path_in_obj + nc_path, ws).astype(np.int64).ravel() + + # Build VTK lines array respecting closed flags + parts: List[np.ndarray] = [] + offset = 0 + for poly_idx, n in enumerate(nc_arr): + n = int(n) + indices = np.arange(offset, offset + n, dtype=np.int64) + if close_poly is not None and poly_idx < len(close_poly) and close_poly[poly_idx]: + indices = np.append(indices, offset) # close the loop + n += 1 + part = np.empty(n + 1, dtype=np.int64) + part[0] = n + part[1:] = indices + parts.append(part) + offset += n if close_poly is None or poly_idx >= len(close_poly) or not close_poly[poly_idx] else n - 1 + lines = np.concatenate(parts) if parts else np.empty(0, dtype=np.int64) + except IndexError: + # Single polyline — all points in sequence + lines = _build_vtk_lines_from_segments(len(points)) + + # --- sub_indices filtering --- + # sub_indices select individual *polylines* (by index within this patch). + # We filter the VTK flat `lines` buffer and also subset `points` to + # keep only the nodes referenced by the surviving polylines. 
+ if sub_indices is not None and len(sub_indices) > 0: + total_polylines = len(nc_arr) if nc_arr is not None else 1 + t_idx = np.asarray(sub_indices, dtype=np.int64) - total_size + _valid = np.sort(t_idx[(t_idx >= 0) & (t_idx < total_polylines)]) + total_size += total_polylines + + if nc_arr is not None and len(_valid) > 0: + # Walk the VTK flat buffer once to record per-polyline slice bounds. + pos = 0 + poly_slices: List[Tuple[int, int]] = [] + for _ in range(total_polylines): + n_vtk = int(lines[pos]) + poly_slices.append((pos, pos + n_vtk + 1)) + pos += n_vtk + 1 + + # Original point ranges per polyline (nc_arr gives node counts). + pt_offsets = np.concatenate([[0], np.cumsum(nc_arr)]) + + # Gather contiguous point ranges for the selected polylines. + keep_ranges = [ + np.arange(int(pt_offsets[i]), int(pt_offsets[i + 1]), dtype=np.int64) + for i in _valid + ] + keep_pts = np.concatenate(keep_ranges) if keep_ranges else np.empty(0, dtype=np.int64) + + # Build a full remapping: old_pt_idx → new_pt_idx (-1 = not kept). + new_pt_idx = np.full(len(points), -1, dtype=np.int64) + new_pt_idx[keep_pts] = np.arange(len(keep_pts), dtype=np.int64) + points = points[keep_pts] + + # Re-index VTK segments for the selected polylines. + rebuilt: List[np.ndarray] = [] + for i in _valid: + s, e = poly_slices[i] + seg = lines[s:e].copy() + seg[1:] = new_pt_idx[seg[1:]] + rebuilt.append(seg) + lines = np.concatenate(rebuilt) if rebuilt else np.empty(0, dtype=np.int64) + elif len(_valid) == 0: + points = np.empty((0, 3), dtype=np.float64) + lines = np.empty(0, dtype=np.int64) + else: + total_size += 1 # at least one polyline + + # Apply full CRS transform per patch; crs_object kept for reference, + # outer dispatcher is guarded to skip crs_displacement_np for this type. 
+ if use_crs_displacement and crs is not None and len(points) > 0: + apply_from_crs_info(points, extract_crs_info(crs, workspace), inplace=True) + + if len(points) > 0: + label = f"{src_type}_patch_{patch_idx}" + multi.patches.append( + NumpyPolylineMesh( + identifier=label, + energyml_object=energyml_object, + crs_object=crs, + points=points, + lines=lines, + patch_index=patch_idx, + patch_label=label, + source_uuid=src_uuid, + source_type=src_type, + ) + ) + patch_idx += 1 + + return multi + + +def read_numpy_triangulated_set_representation( + energyml_object: Any, + workspace: Optional[EnergymlStorageInterface] = None, + use_crs_displacement: bool = True, + sub_indices: Optional[Union[List[int], np.ndarray]] = None, +) -> "NumpyMultiMesh": + """Read a ``TriangulatedSetRepresentation`` as numpy-backed surface meshes. + + Key differences vs :func:`mesh.read_triangulated_set_representation`: + + * No ``.tolist()`` — geometry stays in numpy arrays. + * Point-offset arithmetic is done via in-place numpy broadcast. + * VTK flat face connectivity is built with :func:`numpy.concatenate` and + :func:`numpy.column_stack` — no Python loops over triangles. 
+ """ + ws = _view_workspace(workspace) + src_uuid = get_obj_uuid(energyml_object) + src_type = type(energyml_object).__name__ + multi = NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(get_obj_uri(energyml_object)), + source_uuid=src_uuid, + source_type=src_type, + ) + point_offset = 0 + patch_idx = 0 + total_size = 0 + + patches = search_attribute_matching_name_with_path( + energyml_object, + r"\w*Patch.\d+", + deep_search=False, + search_in_sub_obj=False, + ) + + for patch_path, patch in patches: + crs = None + try: + crs = get_crs_obj( + context_obj=patch, + path_in_root=patch_path, + root_obj=energyml_object, + workspace=workspace, + ) + except ObjectNotFoundNotError: + pass + + # --- Points --- + pts_parts: List[np.ndarray] = [] + for point_path, point_obj in search_attribute_matching_name_with_path(patch, "Geometry.Points"): + raw = _read_array_np(point_obj, energyml_object, patch_path + "." + point_path, ws) + pts_parts.append(_ensure_float64_points(raw)) + + if not pts_parts: + patch_idx += 1 + continue + points = np.concatenate(pts_parts, axis=0) # (N, 3) + + # Apply full CRS transform (rotation + offsets + z-flip + axis-swap) per patch. + # Setting crs_object=None on the resulting mesh prevents the outer + # read_numpy_mesh_object dispatcher from calling crs_displacement_np() again. + if use_crs_displacement and crs is not None and len(points) > 0: + crs_info = extract_crs_info(crs, workspace) + apply_from_crs_info(points, crs_info, inplace=True) + + # --- Triangles --- + tri_parts: List[np.ndarray] = [] + for tri_path, tri_obj in search_attribute_matching_name_with_path(patch, "Triangles"): + raw = _read_array_np(tri_obj, energyml_object, patch_path + "." 
+ tri_path, ws) + tri_parts.append(raw.astype(np.int64).reshape(-1, 3)) + + if not tri_parts: + patch_idx += 1 + continue + triangles = np.concatenate(tri_parts, axis=0) # (M, 3) + + # Apply point offset (in-place broadcast — no copy when dtype matches) + if point_offset != 0: + triangles -= point_offset # local 0-based indices + + # sub_indices face filtering + if sub_indices is not None and len(sub_indices) > 0: + t_idx = np.asarray(sub_indices, dtype=np.int64) - total_size + mask = (t_idx >= 0) & (t_idx < len(triangles)) + triangles = triangles[t_idx[mask]] + total_size += len(triangles) + + # Build VTK flat faces array: [3, v0, v1, v2, 3, v0, v1, v2, …] + faces = _build_vtk_faces_from_triangles(triangles) + + label = f"{src_type}_patch_{patch_idx}" + multi.patches.append( + NumpySurfaceMesh( + identifier=label, + energyml_object=energyml_object, + crs_object=crs, + points=points, + faces=faces, + patch_index=patch_idx, + patch_label=label, + source_uuid=src_uuid, + source_type=src_type, + ) + ) + point_offset += len(points) + patch_idx += 1 + + return multi + + +def read_numpy_grid2d_representation( + energyml_object: Any, + workspace: Optional[EnergymlStorageInterface] = None, + use_crs_displacement: bool = True, + keep_holes: bool = False, + sub_indices: Optional[Union[List[int], np.ndarray]] = None, +) -> "NumpyMultiMesh": + """Read a ``Grid2dRepresentation`` as a numpy quad-surface mesh. + + NaN-hole handling is done with boolean masks and cumsum-based index remapping + (O(N) vs the O(N) dict-based approach in :func:`mesh.gen_surface_grid_geometry`, + but avoids Python dict overhead for large grids). 
+ """ + src_uuid = get_obj_uuid(energyml_object) + src_type = type(energyml_object).__name__ + multi = NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(get_obj_uri(energyml_object)), + source_uuid=src_uuid, + source_type=src_type, + ) + patch_idx = 0 + total_size = 0 + + def _process_patch(patch: Any, patch_path: str, crs: Any) -> Optional[NumpySurfaceMesh]: + nonlocal total_size, patch_idx + # read_grid2d_patch returns List[List[float]] — convert to ndarray + raw_pts = read_grid2d_patch( + patch=patch, + grid2d=energyml_object, + path_in_root=patch_path, + workspace=workspace, + ) + pts = np.asarray(raw_pts, dtype=np.float64) if raw_pts is not None else np.empty((0, 3)) + if pts.size == 0: + return None + + if pts.ndim == 1: + pts = pts.reshape(-1, 3) + + # Grid dimensions + fa_count = search_attribute_matching_name(patch, "FastestAxisCount") or search_attribute_matching_name( + energyml_object, "FastestAxisCount" + ) + sa_count = search_attribute_matching_name(patch, "SlowestAxisCount") or search_attribute_matching_name( + energyml_object, "SlowestAxisCount" + ) + if not fa_count or not sa_count: + return None + fa = int(fa_count[0]) + sa = int(sa_count[0]) + + # Clamp dimensions to actual number of points + total_pts = len(pts) + while sa * fa > total_pts and sa > 0 and fa > 0: + sa -= 1 + fa -= 1 + while sa * fa < total_pts: + sa += 1 + fa += 1 + + z_col = pts[:, 2] + nan_mask = np.isnan(z_col) # True where Z is NaN (hole) + + if keep_holes: + pts[nan_mask, 2] = 0.0 + final_pts = pts + # All original indices are valid + local_idx = np.arange(total_pts, dtype=np.int64) + remap = local_idx # identity + else: + valid_mask = ~nan_mask + final_pts = pts[valid_mask] + # remap[original_index] = final_index (-1 ⟹ invalid/NaN) + remap = np.full(total_pts, -1, dtype=np.int64) + remap[valid_mask] = np.arange(valid_mask.sum(), dtype=np.int64) + + # Build quad face list (vectorised) + quad_rows = [] + for sa_i in range(sa - 1): + for fa_i in range(fa - 
1): + line = sa_i * fa + a = line + fa_i + b = line + fa_i + 1 + c = line + fa + fa_i + 1 + d = line + fa + fa_i + if keep_holes: + quad_rows.append([a, b, c, d]) + else: + ra, rb, rc, rd = remap[a], remap[b], remap[c], remap[d] + if ra >= 0 and rb >= 0 and rc >= 0 and rd >= 0: + quad_rows.append([ra, rb, rc, rd]) + + if not quad_rows: + return None + quads = np.asarray(quad_rows, dtype=np.int64) # (M, 4) + + # sub_indices filtering + if sub_indices is not None and len(sub_indices) > 0: + t_idx = np.asarray(sub_indices, dtype=np.int64) - total_size + mask = (t_idx >= 0) & (t_idx < len(quads)) + quads = quads[t_idx[mask]] + total_size += len(quads) + + faces = _build_vtk_faces_from_quads(quads) + label = f"{src_type}_patch_{patch_idx}" + mesh = NumpySurfaceMesh( + identifier=label, + energyml_object=energyml_object, + crs_object=crs, + points=final_pts, + faces=faces, + patch_index=patch_idx, + patch_label=label, + source_uuid=src_uuid, + source_type=src_type, + ) + patch_idx += 1 + return mesh + + # RESQML 2.0.1 — patches + for patch_path, patch in search_attribute_matching_name_with_path(energyml_object, "Grid2dPatch"): + crs = None + try: + crs = get_crs_obj( + context_obj=patch, + path_in_root=patch_path, + root_obj=energyml_object, + workspace=workspace, + ) + except ObjectNotFoundNotError: + pass + m = _process_patch(patch, patch_path, crs) + if m is not None: + multi.patches.append(m) + + # RESQML 2.2 — geometry directly on the object + if hasattr(energyml_object, "geometry"): + crs = None + try: + crs = get_crs_obj( + context_obj=energyml_object, + path_in_root=".", + root_obj=energyml_object, + workspace=workspace, + ) + except ObjectNotFoundNotError as e: + logging.error(e) + m = _process_patch(energyml_object, "", crs) + if m is not None: + multi.patches.append(m) + + return multi + + +def read_numpy_wellbore_trajectory_representation( + energyml_object: Any, + workspace: Optional[EnergymlStorageInterface] = None, + use_crs_displacement: bool = True, + 
sub_indices: Optional[Union[List[int], np.ndarray]] = None, + wellbore_frame_mds: Optional[Union[List[float], np.ndarray]] = None, + step_meter: float = 5.0, +) -> "NumpyMultiMesh": + """Read a ``WellboreTrajectoryRepresentation`` as a numpy polyline mesh.""" + if energyml_object is None: + return NumpyMultiMesh(identifier="empty_wellbore_trajectory") + + if isinstance(energyml_object, list): + synthetic = NumpyMultiMesh(identifier="WellboreTrajectoryRepresentation_list") + for obj in energyml_object: + synthetic.children.append( + read_numpy_wellbore_trajectory_representation( + obj, workspace, use_crs_displacement, sub_indices, wellbore_frame_mds, step_meter + ) + ) + return synthetic + + crs = None + head_x = head_y = head_z = 0.0 + z_increasing_downward = False + + try: + crs_attr = get_object_attribute(energyml_object, "geometry.LocalCrs") + if crs_attr is not None: + crs = workspace.get_object(get_obj_uri(crs_attr)) + else: + raise ObjectNotFoundNotError("LocalCrs not found") + except Exception: + logging.debug("Could not get CRS from trajectory geometry") + + # MD datum / reference point (fixes always-at-origin bug) + try: + md_datum_dor = None + try: + md_datum_dor = search_attribute_matching_name(obj=energyml_object, name_rgx=r"MdDatum")[0] + except IndexError: + try: + md_datum_dor = search_attribute_matching_name(obj=energyml_object, name_rgx=r"MdInterval.Datum")[0] + except IndexError: + pass + + if md_datum_dor is not None: + md_datum_identifier = get_obj_uri(md_datum_dor) + md_datum_obj = workspace.get_object(md_datum_identifier) if workspace else None + if md_datum_obj is not None: + head_x, head_y, head_z, z_increasing_downward, _, _, crs = get_datum_information( + md_datum_obj, workspace + ) + except Exception as e: + logging.debug(f"Could not resolve MdDatum from trajectory: {e}") + + try: + crs_info = extract_crs_info(crs, workspace) + traj_mds, traj_points, traj_tangents = read_parametric_geometry( + getattr(energyml_object, "geometry", None), 
workspace + ) + well_points_list = get_wellbore_points(wellbore_frame_mds, traj_mds, traj_points, traj_tangents, step_meter) + if use_crs_displacement: + well_points_list = apply_from_crs_info( + np.asarray(well_points_list, dtype=np.float64), + crs_info, + ) + except Exception as e: + if wellbore_frame_mds is not None: + logging.debug(f"Trajectory parametric geometry unavailable, treating as vertical: {e}") + well_points_list = generate_vertical_well_points( + head_x=head_x, + head_y=head_y, + head_z=head_z, + wellbore_mds=wellbore_frame_mds + if isinstance(wellbore_frame_mds, np.ndarray) + else np.asarray(wellbore_frame_mds), + z_increasing_downward=z_increasing_downward, + ) + else: + traceback.print_exc() + raise ValueError( + "Cannot read WellboreTrajectoryRepresentation: " + "no parametric geometry and no measured depth information available." + ) + + if well_points_list is None or len(well_points_list) == 0: + return NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(get_obj_uri(energyml_object)), + source_uuid=get_obj_uuid(energyml_object), + source_type=type(energyml_object).__name__, + ) + + pts = _ensure_float64_points(np.asarray(well_points_list, dtype=np.float64)) + lines = _build_vtk_lines_from_segments(len(pts)) + src_uuid = get_obj_uuid(energyml_object) + src_type = type(energyml_object).__name__ + label = f"{src_type}_patch_0" + return NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(get_obj_uri(energyml_object)), + source_uuid=src_uuid, + source_type=src_type, + patches=[ + NumpyPolylineMesh( + identifier=label, + energyml_object=energyml_object, + crs_object=crs, + points=pts, + lines=lines, + patch_index=0, + patch_label=label, + source_uuid=src_uuid, + source_type=src_type, + ) + ], + ) + + +def read_numpy_wellbore_frame_representation( + energyml_object: Any, + workspace: Optional[EnergymlStorageInterface] = None, + use_crs_displacement: bool = True, + sub_indices: Optional[Union[List[int], np.ndarray]] = 
None, +) -> "NumpyMultiMesh": + """Read a ``WellboreFrameRepresentation`` as a numpy polyline mesh.""" + ws = _view_workspace(workspace) + empty = NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(get_obj_uri(energyml_object)), + source_uuid=get_obj_uuid(energyml_object), + source_type=type(energyml_object).__name__, + ) + + try: + node_md_path, node_md_obj = search_attribute_matching_name_with_path(energyml_object, "NodeMd")[0] + wellbore_frame_mds = _read_array_np(node_md_obj, energyml_object, node_md_path, ws) + if not isinstance(wellbore_frame_mds, np.ndarray): + wellbore_frame_mds = np.asarray(wellbore_frame_mds, dtype=np.float64) + except (IndexError, AttributeError) as e: + logging.warning(f"Could not read NodeMd from wellbore frame: {e}") + return empty + + md_min = float(wellbore_frame_mds.min()) if len(wellbore_frame_mds) > 0 else 0.0 + md_max = float(wellbore_frame_mds.max()) if len(wellbore_frame_mds) > 0 else 0.0 + + try: + _md_min = get_object_attribute(energyml_object, "md_interval.md_min") + if _md_min is not None: + md_min = float(_md_min) + _md_max = get_object_attribute(energyml_object, "md_interval.md_max") + if _md_max is not None: + md_max = float(_md_max) + except AttributeError: + pass + + wellbore_frame_mds = wellbore_frame_mds[(wellbore_frame_mds >= md_min) & (wellbore_frame_mds <= md_max)] + + trajectory_dor = search_attribute_matching_name(obj=energyml_object, name_rgx="Trajectory")[0] + trajectory_obj = workspace.get_object(get_obj_uri(trajectory_dor)) + + result = read_numpy_wellbore_trajectory_representation( + energyml_object=trajectory_obj, + workspace=workspace, + use_crs_displacement=use_crs_displacement, + sub_indices=sub_indices, + wellbore_frame_mds=wellbore_frame_mds, + ) + frame_uri = str(get_obj_uri(energyml_object)) + for m in result.flat_patches(): + m.identifier = frame_uri + result.identifier = frame_uri + result.source_uuid = get_obj_uuid(energyml_object) + result.source_type = 
type(energyml_object).__name__ + return result + + +def read_numpy_sub_representation( + energyml_object: Any, + workspace: Optional[EnergymlStorageInterface] = None, + use_crs_displacement: bool = True, + sub_indices: Optional[Union[List[int], np.ndarray]] = None, +) -> "NumpyMultiMesh": + """Delegate to the supporting representation with filtered indices.""" + ws = _view_workspace(workspace) + supporting_rep_dor = search_attribute_matching_name( + obj=energyml_object, name_rgx=r"(SupportingRepresentation|RepresentedObject)" + )[0] + supporting_rep = workspace.get_object(get_obj_uri(supporting_rep_dor)) + + total_size = 0 + all_indices: Optional[np.ndarray] = None + for patch_path, patch_indices in search_attribute_matching_name_with_path( + obj=energyml_object, + name_rgx=r"SubRepresentationPatch.\d+.ElementIndices.\d+.Indices", + deep_search=False, + search_in_sub_obj=False, + ) + search_attribute_matching_name_with_path( + obj=energyml_object, + name_rgx=r"SubRepresentationPatch.\d+.Indices", + deep_search=False, + search_in_sub_obj=False, + ): + arr = _read_array_np(patch_indices, energyml_object, patch_path, ws).astype(np.int64).ravel() + if sub_indices is not None and len(sub_indices) > 0: + t_idx = np.asarray(sub_indices, dtype=np.int64) - total_size + mask = (t_idx >= 0) & (t_idx < len(arr)) + arr = arr[t_idx[mask]] + total_size += len(arr) + all_indices = np.concatenate([all_indices, arr]) if all_indices is not None else arr + + inner = read_numpy_mesh_object( + energyml_object=supporting_rep, + workspace=workspace, + use_crs_displacement=use_crs_displacement, + sub_indices=all_indices.tolist() if all_indices is not None else None, + ) + sub_uri = str(get_obj_uri(energyml_object)) + for m in inner.flat_patches(): + m.identifier = f"sub_rep_{sub_uri}/{m.identifier}" + return NumpyMultiMesh( + energyml_object=energyml_object, + identifier=sub_uri, + source_uuid=get_obj_uuid(energyml_object), + source_type=type(energyml_object).__name__, + patches=[], + 
children=[inner], + ) + + +def read_numpy_representation_set_representation( + energyml_object: Any, + workspace: Optional[EnergymlStorageInterface] = None, + use_crs_displacement: bool = True, + sub_indices: Optional[Union[List[int], np.ndarray]] = None, +) -> "NumpyMultiMesh": + """Delegate to each child representation; nest results as children.""" + multi = NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(get_obj_uri(energyml_object)), + source_uuid=get_obj_uuid(energyml_object), + source_type=type(energyml_object).__name__, + ) + repr_list = get_object_attribute(energyml_object, "representation") + if repr_list is None or not isinstance(repr_list, list): + return multi + for repr_dor in repr_list: + rpr_uri = get_obj_uri(repr_dor) + repr_obj = workspace.get_object(rpr_uri) + if repr_obj is None: + logging.error(f"Representation {rpr_uri} not found in RepresentationSetRepresentation") + continue + child = read_numpy_mesh_object( + energyml_object=repr_obj, + workspace=workspace, + use_crs_displacement=use_crs_displacement, + ) + multi.children.append(child) + return multi + + +# --------------------------------------------------------------------------- +# VTK cell-type codes (subset used by RESQML readers) +# --------------------------------------------------------------------------- + +_VTK_TETRA = 10 +_VTK_HEXAHEDRON = 12 +_VTK_WEDGE = 13 +_VTK_PYRAMID = 14 +_VTK_POLYHEDRON = 42 + + +# --------------------------------------------------------------------------- +# New representation readers +# --------------------------------------------------------------------------- + + +def read_numpy_plane_set_representation( + energyml_object: Any, + workspace: Optional[EnergymlStorageInterface] = None, + use_crs_displacement: bool = True, + sub_indices: Optional[Union[List[int], np.ndarray]] = None, + horizontal_plane_half_extent: float = 1e5, +) -> "NumpyMultiMesh": + """Read a ``PlaneSetRepresentation`` into numpy surface meshes. 
+ + * ``HorizontalPlaneGeometry`` — synthesises a large finite quad centred at the + CRS origin at the given Z coordinate. The half-extent is controlled by + *horizontal_plane_half_extent* (default 100 km in CRS length units). + * ``TiltedPlaneGeometry`` — each ``ThreePoint3D`` entry becomes a triangle. + + Args: + horizontal_plane_half_extent: Half-width in CRS length units of the + synthesised quad used for ``HorizontalPlaneGeometry`` patches. + """ + src_uuid = get_obj_uuid(energyml_object) + src_type = type(energyml_object).__name__ + multi = NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(get_obj_uri(energyml_object)), + source_uuid=src_uuid, + source_type=src_type, + ) + + crs = None + try: + crs = get_crs_obj( + context_obj=energyml_object, + path_in_root=".", + root_obj=energyml_object, + workspace=workspace, + ) + except (ObjectNotFoundNotError, Exception): + pass + + planes_list = search_attribute_matching_name_with_path(energyml_object, "Planes") + patch_idx = 0 + + for _plane_path, plane_geom in planes_list: + geom_type = type(plane_geom).__name__ + + if geom_type == "HorizontalPlaneGeometry": + z = float(getattr(plane_geom, "coordinate", 0.0)) + hx = hy = float(horizontal_plane_half_extent) + points = np.array( + [[-hx, -hy, z], [hx, -hy, z], [hx, hy, z], [-hx, hy, z]], + dtype=np.float64, + ) + faces = np.array([4, 0, 1, 2, 3], dtype=np.int64) + + elif geom_type == "TiltedPlaneGeometry": + pts_list: List[np.ndarray] = [] + tri_list: List[List[int]] = [] + pt_offset = 0 + for three_pt in getattr(plane_geom, "plane", []): + pts3 = getattr(three_pt, "point3d", []) + if len(pts3) < 3: + continue + tri_pts = np.array( + [[p.coordinate1, p.coordinate2, p.coordinate3] for p in pts3[:3]], + dtype=np.float64, + ) + pts_list.append(tri_pts) + tri_list.append([pt_offset, pt_offset + 1, pt_offset + 2]) + pt_offset += 3 + if not pts_list: + patch_idx += 1 + continue + points = np.concatenate(pts_list, axis=0) + tris = np.array(tri_list, 
dtype=np.int64) # (M, 3) + faces = _build_vtk_faces_from_triangles(tris) + + else: + logging.warning(f"PlaneSetRepresentation: unknown geometry type {geom_type!r} — skipping patch {patch_idx}") + patch_idx += 1 + continue + + if use_crs_displacement and crs is not None and len(points) > 0: + apply_from_crs_info(points, extract_crs_info(crs, workspace), inplace=True) + + label = f"{src_type}_patch_{patch_idx}" + multi.patches.append( + NumpySurfaceMesh( + identifier=label, + energyml_object=energyml_object, + crs_object=crs, + points=points, + faces=faces, + patch_index=patch_idx, + patch_label=label, + source_uuid=src_uuid, + source_type=src_type, + ) + ) + patch_idx += 1 + + return multi + + +def read_numpy_seismic_wellbore_frame_representation( + energyml_object: Any, + workspace: Optional[EnergymlStorageInterface] = None, + use_crs_displacement: bool = True, + sub_indices: Optional[Union[List[int], np.ndarray]] = None, +) -> "NumpyMultiMesh": + """Read a ``SeismicWellboreFrameRepresentation``. + + ``SeismicWellboreFrameRepresentation`` extends ``WellboreFrameRepresentation`` + and adds a ``NodeTimeValues`` array (one time value per frame node). This + reader delegates geometry to :func:`read_numpy_wellbore_frame_representation` + and stores the extra time values in ``patch.extra_arrays["node_time_values"]`` + on every returned patch. 
+ """ + ws = _view_workspace(workspace) + result = read_numpy_wellbore_frame_representation( + energyml_object=energyml_object, + workspace=workspace, + use_crs_displacement=use_crs_displacement, + sub_indices=sub_indices, + ) + # Attach NodeTimeValues to each patch as extra data + try: + ntv_path, ntv_obj = search_attribute_matching_name_with_path(energyml_object, "NodeTimeValues")[0] + node_time_values = _read_array_np(ntv_obj, energyml_object, ntv_path, ws) + for patch in result.flat_patches(): + patch.extra_arrays["node_time_values"] = node_time_values + except (IndexError, Exception) as exc: + logging.warning( + f"SeismicWellboreFrameRepresentation: could not read NodeTimeValues: {exc}" + ) + result.source_type = type(energyml_object).__name__ + return result + + +def read_numpy_sealed_surface_framework_representation( + energyml_object: Any, + workspace: Optional[EnergymlStorageInterface] = None, + use_crs_displacement: bool = True, + sub_indices: Optional[Union[List[int], np.ndarray]] = None, +) -> "NumpyMultiMesh": + """Read a ``SealedSurfaceFrameworkRepresentation``. + + ``SealedSurfaceFrameworkRepresentation`` is a subtype of + ``RepresentationSetRepresentation`` (via ``AbstractSurfaceFrameworkRepresentation``). + Geometry is delegated to :func:`read_numpy_representation_set_representation` + which reads each member representation. + """ + result = read_numpy_representation_set_representation( + energyml_object=energyml_object, + workspace=workspace, + use_crs_displacement=use_crs_displacement, + sub_indices=sub_indices, + ) + result.source_type = type(energyml_object).__name__ + return result + + +# --------------------------------------------------------------------------- +# IJK-grid helpers +# --------------------------------------------------------------------------- + + +def _build_kl_mapping( + nk: int, + gap_after: Optional[np.ndarray], +) -> Tuple[np.ndarray, np.ndarray]: + """Compute bottom and top NKL boundary indices for each K cell. 
+ + Without K-gaps the mapping is trivial: cell k spans NKL nodes [k, k+1]. + When ``gap_after[k]`` is True, the NKL counter is incremented by an extra + step between layers k and k+1, so the affected layers use distinct node + intervals that are geometrically discontinuous. + + Args: + nk: Number of K cells (not layers). + gap_after: Boolean array of length ``nk - 1``; ``True`` at index *k* + means there is a K-gap after layer *k*. + + Returns: + ``(kl_bottom, kl_top)`` — two ``(nk,)`` int64 arrays giving the NKL + index of the bottom and top node boundary for each cell. + """ + kl_bottom = np.zeros(nk, dtype=np.int64) + kl_top = np.zeros(nk, dtype=np.int64) + kl = 0 + for k in range(nk): + kl_bottom[k] = kl + kl += 1 + kl_top[k] = kl + if gap_after is not None and k < len(gap_after) and gap_after[k]: + kl += 1 # skip one NKL slot for the gap + return kl_bottom, kl_top + + +def _build_split_pillar_map( + ni: int, + nj: int, + pillar_indices_arr: np.ndarray, + columns_per_split: List[np.ndarray], + n_splits: int, +) -> np.ndarray: + """Build a per-column corner-pillar remapping for split coordinate lines. + + For each column ``(j, i)`` the four corners are labelled:: + + TL = (j, i) TR = (j, i+1) + BL = (j+1, i) BR = (j+1, i+1) + + Without splits every corner maps to the standard pillar index + ``j*(ni+1)+i``. Split coordinate lines displace this mapping for the + affected columns. + + Args: + ni, nj: Cell counts in I and J. + pillar_indices_arr: ``(n_splits,)`` int64 — original pillar index for + each split coordinate line. + columns_per_split: Length-``n_splits`` list of int arrays — column + indices (flat, ``j*ni+i``) that use each split line. + n_splits: Number of split coordinate lines. + + Returns: + ``pillar_map`` — shape ``(nj, ni, 4)`` int64; corner order is + ``[TL, TR, BL, BR]``. 
+ """ + n_pillars_base = (ni + 1) * (nj + 1) + pillar_map = np.zeros((nj, ni, 4), dtype=np.int64) + for j in range(nj): + for i in range(ni): + pillar_map[j, i, 0] = j * (ni + 1) + i # TL + pillar_map[j, i, 1] = j * (ni + 1) + (i + 1) # TR + pillar_map[j, i, 2] = (j + 1) * (ni + 1) + i # BL + pillar_map[j, i, 3] = (j + 1) * (ni + 1) + (i + 1) # BR + + for split_idx in range(n_splits): + if split_idx >= len(columns_per_split): + break + orig_pillar_idx = int(pillar_indices_arr[split_idx]) + orig_j = orig_pillar_idx // (ni + 1) + orig_i = orig_pillar_idx % (ni + 1) + new_pillar_idx = n_pillars_base + split_idx + for col_flat in columns_per_split[split_idx].astype(np.int64): + col_j = int(col_flat) // ni + col_i = int(col_flat) % ni + if not (0 <= col_j < nj and 0 <= col_i < ni): + continue + # Identify which corner of this column corresponds to (orig_j, orig_i) + if orig_j == col_j and orig_i == col_i: + pillar_map[col_j, col_i, 0] = new_pillar_idx # TL + elif orig_j == col_j and orig_i == col_i + 1: + pillar_map[col_j, col_i, 1] = new_pillar_idx # TR + elif orig_j == col_j + 1 and orig_i == col_i: + pillar_map[col_j, col_i, 2] = new_pillar_idx # BL + elif orig_j == col_j + 1 and orig_i == col_i + 1: + pillar_map[col_j, col_i, 3] = new_pillar_idx # BR + + return pillar_map + + +def read_numpy_ijk_grid_representation( + energyml_object: Any, + workspace: Optional[EnergymlStorageInterface] = None, + use_crs_displacement: bool = True, + sub_indices: Optional[Union[List[int], np.ndarray]] = None, +) -> "NumpyMultiMesh": + """Read an ``IjkGridRepresentation`` as a :class:`NumpyVolumeMesh`. + + Geometry is reconstructed from the pillar (coordinate-line) nodes stored in + ``geometry.Points``. The cells returned are always ``VTK_HEXAHEDRON`` + (type 12), which is the correct topology for RESQML IJK corner-point grids. 
+ + Full-fidelity features + ---------------------- + * **K-Gaps** — ``kgaps.gap_after_layer`` is decoded so that K-gap-separated + layers use the correct NKL node-boundary interval. + * **Split coordinate lines (faults)** — ``column_layer_split_coordinate_lines`` + is decoded to remap per-column corner pillars to their fault-split + equivalents. The faulted case uses a Python loop (not fully vectorised) + because the remapping is column-specific; for large grids prefer the + unfaulted vectorised path when possible. + * **Degenerate cells** — pillars with co-located nodes (e.g. wedge columns) + are preserved; PyVista tolerates degenerate hex nodes. + + Known limitation + ---------------- + ``Point3DParametricArray`` pillar geometry is not yet supported (only + ``Point3DExternalArray`` — direct HDF5 XYZ coordinates — is handled). A + :exc:`~energyml.utils.exception.NotSupportedError` is raised for parametric + grids. + """ + ws = _view_workspace(workspace) + src_uuid = get_obj_uuid(energyml_object) + src_type = type(energyml_object).__name__ + + ni = getattr(energyml_object, "ni", None) + nj = getattr(energyml_object, "nj", None) + nk = getattr(energyml_object, "nk", None) + if ni is None or nj is None or nk is None: + logging.warning("IjkGridRepresentation: ni/nj/nk not set — returning empty mesh") + return NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(src_uuid), + source_uuid=src_uuid, + source_type=src_type, + ) + ni, nj, nk = int(ni), int(nj), int(nk) + + geom = getattr(energyml_object, "geometry", None) + if geom is None: + logging.warning("IjkGridRepresentation has no geometry — returning empty mesh") + return NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(src_uuid), + source_uuid=src_uuid, + source_type=src_type, + ) + + try: + _obj_identifier = str(get_obj_uri(energyml_object)) + except Exception: + _obj_identifier = str(src_uuid) + empty = NumpyMultiMesh( + energyml_object=energyml_object, + 
identifier=_obj_identifier, + source_uuid=src_uuid, + source_type=src_type, + ) + + # --- K-GAPS --- + kgaps_obj = getattr(energyml_object, "kgaps", None) + gap_after: Optional[np.ndarray] = None + n_kgaps = 0 + if kgaps_obj is not None: + n_kgaps = int(getattr(kgaps_obj, "count", 0) or 0) + gap_attr_list = search_attribute_matching_name_with_path(kgaps_obj, "GapAfterLayer") + if gap_attr_list: + gap_path, gap_obj = gap_attr_list[0] + gap_after = _read_array_np(gap_obj, energyml_object, f"kgaps.{gap_path}", ws).astype(bool) + nkl = nk + n_kgaps + 1 # total number of K-boundary layers + + kl_bottom, kl_top = _build_kl_mapping(nk, gap_after) + + # --- SPLIT COORDINATE LINES --- + split_cl = getattr(geom, "column_layer_split_coordinate_lines", None) + n_splits = 0 + pillar_indices_arr: Optional[np.ndarray] = None + columns_per_split: List[np.ndarray] = [] + if split_cl is not None: + n_splits = int(getattr(split_cl, "count", 0) or 0) + if n_splits > 0: + pi_list = search_attribute_matching_name_with_path(split_cl, "PillarIndices") + if pi_list: + pi_path, pi_obj = pi_list[0] + pillar_indices_arr = _read_array_np( + pi_obj, energyml_object, + f"geometry.column_layer_split_coordinate_lines.{pi_path}", ws, + ) + cps_obj = getattr(split_cl, "columns_per_split_coordinate_line", None) + if cps_obj is not None: + columns_per_split = _decode_jagged_array( + cps_obj, energyml_object, + "geometry.column_layer_split_coordinate_lines.columns_per_split_coordinate_line", + ws, + ) + + n_pillars_base = (ni + 1) * (nj + 1) + n_pillars_total = n_pillars_base + n_splits + + # --- POINTS --- + pts_results = search_attribute_matching_name_with_path(geom, "Points") + if not pts_results: + logging.warning("IjkGridRepresentation: cannot find Points in geometry") + return empty + pts_path, pts_obj = pts_results[0] + + # Reject parametric arrays (not yet supported) + if "Parametric" in type(pts_obj).__name__: + raise NotSupportedError( + f"IjkGridRepresentation with parametric-pillar geometry 
" + f"({type(pts_obj).__name__}) is not yet supported. " + "Only direct HDF5 coordinate arrays (Point3DExternalArray) are handled." + ) + + raw_pts = _read_array_np(pts_obj, energyml_object, f"geometry.{pts_path}", ws) + + # Reshape raw points: HDF5 layout is (NKL, NJ+1, NI+1, 3) for unfaulted + # grids and (NKL, n_pillars_total, 3) when split lines are present. + expected_3d = nkl * n_pillars_total * 3 + expected_4d = nkl * (nj + 1) * (ni + 1) * 3 + if n_splits > 0 or raw_pts.size == expected_3d: + # Split lines present (or data already in 3-D layout) + pts_3d = raw_pts.reshape(nkl, n_pillars_total, 3) + points = pts_3d.reshape(-1, 3).astype(np.float64, copy=False) + elif raw_pts.size == expected_4d: + # Standard 4-D unfaulted layout: (NKL, NJ+1, NI+1, 3) + pts_4d = raw_pts.reshape(nkl, nj + 1, ni + 1, 3) + # Reorder to (NKL, n_pillars_base, 3) for uniform node indexing + # Pillar index: j*(ni+1)+i → this matches C-order of the last two dims + points = pts_4d.reshape(nkl, n_pillars_base, 3).reshape(-1, 3).astype(np.float64, copy=False) + else: + raise ValueError( + f"IjkGridRepresentation: unexpected points array size {raw_pts.size}. " + f"Expected {expected_3d} (3-D layout, nkl={nkl}, n_pillars={n_pillars_total}) " + f"or {expected_4d} (4-D layout, nkl={nkl}, nj+1={nj+1}, ni+1={ni+1})." 
+ ) + + # --- CRS --- + crs = None + try: + crs = get_crs_obj( + context_obj=geom, + path_in_root="geometry", + root_obj=energyml_object, + workspace=workspace, + ) + except (ObjectNotFoundNotError, Exception): + pass + + # --- PILLAR MAP for faulted grids --- + use_pillar_map = n_splits > 0 and pillar_indices_arr is not None + pillar_map: Optional[np.ndarray] = None + if use_pillar_map: + pillar_map = _build_split_pillar_map(ni, nj, pillar_indices_arr, columns_per_split, n_splits) + + # --- BUILD HEXAHEDRAL CELL CONNECTIVITY --- + if pillar_map is None: + # Fully vectorised path for unfaulted grids + ii_arr, ij_arr, ik_arr = np.meshgrid( + np.arange(ni, dtype=np.int64), + np.arange(nj, dtype=np.int64), + np.arange(nk, dtype=np.int64), + indexing="ij", + ) # each shape (ni, nj, nk) + + kl_b = kl_bottom[ik_arr] # (ni, nj, nk) + kl_t = kl_top[ik_arr] + p_tl = ij_arr * (ni + 1) + ii_arr # pillar TL + p_tr = ij_arr * (ni + 1) + (ii_arr + 1) # pillar TR + p_bl = (ij_arr + 1) * (ni + 1) + ii_arr # pillar BL + p_br = (ij_arr + 1) * (ni + 1) + (ii_arr + 1) # pillar BR + + def _nidx(kl, pl): + return kl * n_pillars_total + pl + + # VTK_HEXAHEDRON node ordering (bottom face ccw, top face aligned) + n0 = _nidx(kl_b, p_tl).ravel() + n1 = _nidx(kl_b, p_tr).ravel() + n2 = _nidx(kl_b, p_br).ravel() + n3 = _nidx(kl_b, p_bl).ravel() + n4 = _nidx(kl_t, p_tl).ravel() + n5 = _nidx(kl_t, p_tr).ravel() + n6 = _nidx(kl_t, p_br).ravel() + n7 = _nidx(kl_t, p_bl).ravel() + + n_cells = ni * nj * nk + count_col = np.full(n_cells, 8, dtype=np.int64) + cells = np.column_stack([count_col, n0, n1, n2, n3, n4, n5, n6, n7]).ravel() + cell_types = np.full(n_cells, _VTK_HEXAHEDRON, dtype=np.uint8) + + else: + # Per-column loop for faulted grids (pillar_map resolved) + cells_parts: List[int] = [] + for ij_idx in range(nj): + for ii_idx in range(ni): + p_tl = int(pillar_map[ij_idx, ii_idx, 0]) + p_tr = int(pillar_map[ij_idx, ii_idx, 1]) + p_bl = int(pillar_map[ij_idx, ii_idx, 2]) + p_br = 
int(pillar_map[ij_idx, ii_idx, 3]) + for ik_idx in range(nk): + kl_b = int(kl_bottom[ik_idx]) + kl_t = int(kl_top[ik_idx]) + n0 = kl_b * n_pillars_total + p_tl + n1 = kl_b * n_pillars_total + p_tr + n2 = kl_b * n_pillars_total + p_br + n3 = kl_b * n_pillars_total + p_bl + n4 = kl_t * n_pillars_total + p_tl + n5 = kl_t * n_pillars_total + p_tr + n6 = kl_t * n_pillars_total + p_br + n7 = kl_t * n_pillars_total + p_bl + cells_parts.extend([8, n0, n1, n2, n3, n4, n5, n6, n7]) + cells = np.array(cells_parts, dtype=np.int64) + n_cells = ni * nj * nk + cell_types = np.full(n_cells, _VTK_HEXAHEDRON, dtype=np.uint8) + + if use_crs_displacement and crs is not None and len(points) > 0: + apply_from_crs_info(points, extract_crs_info(crs, workspace), inplace=True) + + label = f"{src_type}_patch_0" + multi = NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(get_obj_uri(energyml_object)), + source_uuid=src_uuid, + source_type=src_type, + ) + multi.patches.append( + NumpyVolumeMesh( + identifier=label, + energyml_object=energyml_object, + crs_object=crs, + points=points, + cells=cells, + cell_types=cell_types, + patch_index=0, + patch_label=label, + source_uuid=src_uuid, + source_type=src_type, + ) + ) + return multi + + +def read_numpy_unstructured_grid_representation( + energyml_object: Any, + workspace: Optional[EnergymlStorageInterface] = None, + use_crs_displacement: bool = True, + sub_indices: Optional[Union[List[int], np.ndarray]] = None, +) -> "NumpyMultiMesh": + """Read an ``UnstructuredGridRepresentation`` as a :class:`NumpyVolumeMesh`. + + All cells are emitted as ``VTK_POLYHEDRON`` (type 42) regardless of the + ``cell_shape`` metadata. This avoids the complex winding-order reconstruction + required to convert RESQML's face-based topology to VTK's fixed-topology node + lists (TETRA/PYRAMID/WEDGE/HEX). The polyhedron format is lossless and + PyVista can display and process these cells natively. 
+ + The ``cell_face_is_right_handed`` boolean array is respected: faces whose flag + is ``False`` have their node ordering reversed so that all face normals point + outward from the cell. + """ + ws = _view_workspace(workspace) + src_uuid = get_obj_uuid(energyml_object) + src_type = type(energyml_object).__name__ + + geom = getattr(energyml_object, "geometry", None) + if geom is None: + logging.warning("UnstructuredGridRepresentation has no geometry — returning empty mesh") + return NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(src_uuid), + source_uuid=src_uuid, + source_type=src_type, + ) + + try: + _obj_identifier = str(get_obj_uri(energyml_object)) + except Exception: + _obj_identifier = str(src_uuid) + empty = NumpyMultiMesh( + energyml_object=energyml_object, + identifier=_obj_identifier, + source_uuid=src_uuid, + source_type=src_type, + ) + + # --- POINTS --- + pts_results = search_attribute_matching_name_with_path(geom, "Points") + if not pts_results: + logging.warning("UnstructuredGridRepresentation: cannot find Points in geometry") + return empty + pts_path, pts_obj = pts_results[0] + raw_pts = _read_array_np(pts_obj, energyml_object, pts_path, ws) + points = _ensure_float64_points(raw_pts) # (N, 3) + + # --- CRS --- + crs = None + try: + crs = get_crs_obj( + context_obj=geom, + path_in_root="geometry", + root_obj=energyml_object, + workspace=workspace, + ) + except (ObjectNotFoundNotError, Exception): + pass + + # --- JAGGED ARRAYS --- + npf_obj = getattr(geom, "nodes_per_face", None) + fpc_obj = getattr(geom, "faces_per_cell", None) + if npf_obj is None or fpc_obj is None: + logging.warning( + "UnstructuredGridRepresentation: missing nodes_per_face or faces_per_cell " + "— returning point-set mesh" + ) + label = f"{src_type}_patch_0" + multi = NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(get_obj_uri(energyml_object)), + source_uuid=src_uuid, + source_type=src_type, + ) + multi.patches.append( + 
NumpyPointSetMesh( + identifier=label, + energyml_object=energyml_object, + crs_object=crs, + points=points, + patch_index=0, + patch_label=label, + source_uuid=src_uuid, + source_type=src_type, + ) + ) + return multi + + nodes_per_face = _decode_jagged_array(npf_obj, energyml_object, "geometry.nodes_per_face", ws) + faces_per_cell = _decode_jagged_array(fpc_obj, energyml_object, "geometry.faces_per_cell", ws) + cell_count = len(faces_per_cell) + if cell_count == 0: + return empty + + # --- RIGHT-HANDED BOOLEAN ARRAY --- + rh_arr: Optional[np.ndarray] = None + try: + rh_path, rh_obj = search_attribute_matching_name_with_path(geom, "CellFaceIsRightHanded")[0] + rh_arr = _read_array_np(rh_obj, energyml_object, f"geometry.{rh_path}", ws).astype(bool) + except (IndexError, Exception) as exc: + logging.debug(f"UnstructuredGridRepresentation: CellFaceIsRightHanded not readable: {exc}") + + # --- BUILD VTK_POLYHEDRON CELL ARRAY --- + # VTK polyhedron flat format per cell: + # [total_vals, n_faces, n_pts_f0, p0, p1, ..., n_pts_f1, p0, ...] + # where total_vals = 1 + n_faces + sum(1 + n_pts_fi for each face). 
+ cells_flat: List[int] = [] + rh_global_idx = 0 + + for face_idxs in faces_per_cell: + face_idxs = face_idxs.astype(np.int64) + cell_inner: List[int] = [int(len(face_idxs))] # n_faces + for fi in face_idxs: + fi = int(fi) + if fi >= len(nodes_per_face): + rh_global_idx += 1 + continue + node_idxs = nodes_per_face[fi].astype(np.int64) + if rh_arr is not None and rh_global_idx < len(rh_arr) and not rh_arr[rh_global_idx]: + node_idxs = node_idxs[::-1] # flip to outward normal + rh_global_idx += 1 + cell_inner.append(int(len(node_idxs))) + cell_inner.extend(int(x) for x in node_idxs) + cells_flat.append(len(cell_inner)) # total size of this cell entry + cells_flat.extend(cell_inner) + + cells = np.array(cells_flat, dtype=np.int64) + cell_types = np.full(cell_count, _VTK_POLYHEDRON, dtype=np.uint8) + + if use_crs_displacement and crs is not None and len(points) > 0: + apply_from_crs_info(points, extract_crs_info(crs, workspace), inplace=True) + + label = f"{src_type}_patch_0" + multi = NumpyMultiMesh( + energyml_object=energyml_object, + identifier=str(get_obj_uri(energyml_object)), + source_uuid=src_uuid, + source_type=src_type, + ) + multi.patches.append( + NumpyVolumeMesh( + identifier=label, + energyml_object=energyml_object, + crs_object=crs, + points=points, + cells=cells, + cell_types=cell_types, + patch_index=0, + patch_label=label, + source_uuid=src_uuid, + source_type=src_type, + ) + ) + return multi + + +# --------------------------------------------------------------------------- +# Main dispatcher +# --------------------------------------------------------------------------- + + +def read_numpy_mesh_object( + energyml_object: Any, + workspace: Optional[EnergymlStorageInterface] = None, + use_crs_displacement: bool = True, + sub_indices: Optional[Union[List[int], np.ndarray]] = None, +) -> "NumpyMultiMesh": + """Dispatcher — equivalent to :func:`mesh.read_mesh_object` but returns + a :class:`NumpyMultiMesh` container. 
+ + Args: + energyml_object: Any supported RESQML/EnergyML geometry/representation object. + workspace: Storage interface (``Epc`` or ``EpcStreamReader``). + use_crs_displacement: When ``True`` (default), applies + :func:`crs_displacement_np` to the points of every + returned mesh (excluding wellbore representations + which apply the transform internally). + sub_indices: Optional list of face/line/point indices to include. + + Returns: + :class:`NumpyMultiMesh` containing one or more :class:`NumpyMesh` patches + (and/or nested children for ``RepresentationSetRepresentation``). + + Raises: + :exc:`energyml.utils.exception.NotSupportedError`: if the object type + has no registered reader. + """ + if isinstance(energyml_object, list): + # Synthetic container aggregating multiple top-level objects. + synthetic = NumpyMultiMesh(identifier="multi_object_list") + for obj in energyml_object: + synthetic.children.append( + read_numpy_mesh_object( + energyml_object=obj, + workspace=workspace, + use_crs_displacement=use_crs_displacement, + sub_indices=sub_indices, + ) + ) + return synthetic + + type_name = _numpy_mesh_name_mapping(type(energyml_object).__name__) + reader_func = get_numpy_reader_function(type_name) + + if reader_func is None: + from energyml.utils.exception import NotSupportedError as _NSE + + raise _NSE( + f"No numpy mesh reader found for type '{type_name}'. " + f"Expected function 'read_numpy_{snake_case(type_name)}' in {__name__}." + ) + + result: NumpyMultiMesh = reader_func( + energyml_object=energyml_object, + workspace=workspace, + sub_indices=sub_indices, + use_crs_displacement=use_crs_displacement, + ) + + # Apply fallback CRS displacement for readers that do NOT handle it + # internally (e.g. Grid2d which has no per-patch CRS apply call yet). 
def numpy_mesh_to_pyvista(mesh: NumpyMesh) -> Any:
    """Build the PyVista dataset matching the concrete type of *mesh*.

    The point and connectivity arrays of *mesh* are handed to the PyVista
    constructors as-is; this function performs no copy of its own (whether
    PyVista copies internally is up to PyVista).

    Dispatch, checked in this order:
        * :class:`NumpyVolumeMesh`   → ``pyvista.UnstructuredGrid(cells, cell_types, points)``
        * :class:`NumpySurfaceMesh`  → ``pyvista.PolyData(points, faces=faces)``
        * :class:`NumpyPolylineMesh` → ``pyvista.PolyData(points, lines=lines)``
        * :class:`NumpyPointSetMesh` → ``pyvista.PolyData(points)``

    Any other mesh type is exported as a bare point cloud after a warning.

    Raises:
        ImportError: if ``pyvista`` is not installed
            (``pip install pyvista``).
    """
    try:
        import pyvista as pv  # type: ignore[import]
    except ImportError as exc:
        raise ImportError("pyvista is not installed. Install it with: pip install pyvista") from exc

    points = mesh.points

    if isinstance(mesh, NumpyVolumeMesh):
        dataset = pv.UnstructuredGrid(mesh.cells, mesh.cell_types, points)
    elif isinstance(mesh, NumpySurfaceMesh):
        dataset = pv.PolyData(points, faces=mesh.faces)
    elif isinstance(mesh, NumpyPolylineMesh):
        dataset = pv.PolyData(points, lines=mesh.lines)
    else:
        if not isinstance(mesh, NumpyPointSetMesh):
            # Unknown subclass: degrade to a point cloud instead of failing.
            logging.warning(f"numpy_mesh_to_pyvista: unknown mesh type {type(mesh).__name__}, exporting points only.")
        dataset = pv.PolyData(points)
    return dataset


def numpy_multi_mesh_to_pyvista(multi: "NumpyMultiMesh") -> Any:
    """Turn a :class:`NumpyMultiMesh` into a ``pyvista.MultiBlock``.

    The returned block reproduces the container hierarchy:

    * every entry of ``multi.children`` is converted recursively and added as
      a nested ``MultiBlock`` named after its ``identifier`` (``"child"``
      when the identifier is empty);
    * every entry of ``multi.patches`` is converted with
      :func:`numpy_mesh_to_pyvista` and added as a leaf named after its
      ``patch_label``, else ``"patch_{patch_index}"``, else ``"patch"``.

    Children are appended before patches.

    Raises:
        ImportError: if ``pyvista`` is not installed
            (``pip install pyvista``).
    """
    try:
        import pyvista as pv  # type: ignore[import]
    except ImportError as exc:
        raise ImportError("pyvista is not installed. Install it with: pip install pyvista") from exc

    root: pv.MultiBlock = pv.MultiBlock()

    for sub_container in multi.children:
        root.append(numpy_multi_mesh_to_pyvista(sub_container), sub_container.identifier or "child")

    for patch in multi.patches:
        dataset = numpy_mesh_to_pyvista(patch)
        if dataset is None:
            continue
        label = patch.patch_label
        if not label:
            label = f"patch_{patch.patch_index}" if patch.patch_index is not None else "patch"
        root.append(dataset, label)

    return root


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

__all__ = [
    # Dataclasses
    "NumpyMesh",
    "NumpyPointSetMesh",
    "NumpyPolylineMesh",
    "NumpySurfaceMesh",
    "NumpyVolumeMesh",
    "NumpyMultiMesh",
    # CRS
    "crs_displacement_np",
    # Readers
    "read_numpy_mesh_object",
    "read_numpy_point_representation",
    "read_numpy_polyline_representation",
    "read_numpy_triangulated_set_representation",
    "read_numpy_grid2d_representation",
    "read_numpy_wellbore_trajectory_representation",
    "read_numpy_wellbore_frame_representation",
    "read_numpy_sub_representation",
    "read_numpy_representation_set_representation",
    "read_numpy_plane_set_representation",
    "read_numpy_seismic_wellbore_frame_representation",
    "read_numpy_sealed_surface_framework_representation",
    "read_numpy_ijk_grid_representation",
    "read_numpy_unstructured_grid_representation",
    # Converters
    "numpy_mesh_to_pyvista",
    "numpy_multi_mesh_to_pyvista",
]
class FileCacheManager:
    """
    Keep a small LRU cache of open file handles to avoid reopening overhead.

    Up to ``max_open_files`` (default 3) handles stay open. Every access moves
    the entry to the most-recently-used end; when the cache grows past its
    limit the least recently used handle is closed and dropped.

    Features:
    - Automatic cleanup of least-recently-used files
    - Works with any handle type, opened through the supplied handler
    - Explicit ``remove()`` / ``close_all()`` for cleanup

    Note:
        No locking is performed here. Callers that share one manager between
        threads must serialize access themselves.
    """

    def __init__(self, max_open_files: int = 3):
        """
        Initialize file cache manager.

        Args:
            max_open_files: Maximum number of files to keep open simultaneously.
                Values below 1 behave like 1: the handle returned by the most
                recent ``get_or_open`` is never closed by the eviction that the
                same call triggers.
        """
        self.max_open_files = max_open_files
        # cache key (absolute path) -> (file handle, mode it was opened with)
        self._cache: OrderedDict[str, tuple[Any, str]] = OrderedDict()
        self._handlers: Dict[str, "ExternalArrayHandler"] = {}  # cache key -> handler instance

    @staticmethod
    def _cache_key(file_path: str) -> str:
        """Return the cache key for *file_path*.

        The key is always the absolute path, whether or not the file exists
        yet. Keying on the raw string for missing files would make a file that
        is created by the first open unreachable under the same relative path
        afterwards.
        """
        return os.path.abspath(file_path)

    @staticmethod
    def _close_handle(file_path: str, file_handle: Any) -> None:
        """Close *file_handle* if it can be closed; closing is best-effort."""
        try:
            if hasattr(file_handle, "close"):
                file_handle.close()
        except Exception as e:
            logging.debug(f"Error closing cached file {file_path}: {e}")

    def _discard(self, key: str) -> None:
        """Drop the entry stored under *key* (if any) and close its handle."""
        entry = self._cache.pop(key, None)
        if entry is not None:
            self._close_handle(key, entry[0])
        self._handlers.pop(key, None)

    def get_or_open(self, file_path: str, handler: "ExternalArrayHandler", mode: str = "r") -> Optional[Any]:
        """
        Get an open file handle from cache, or open it if not cached.

        A cached handle is reused only when the mode it was opened with is
        compatible with ``mode`` (see ``_is_mode_compatible``); otherwise it is
        closed and the file is reopened with the requested mode.

        Args:
            file_path: Path to the file
            handler: Handler instance that knows how to open this file type
            mode: File open mode ('r', 'a', etc.)

        Returns:
            Open file handle, or None if opening failed
        """
        key = self._cache_key(file_path)
        # The handler receives the absolute path for existing files and the
        # caller's original path otherwise.
        open_path = key if os.path.exists(file_path) else file_path

        cached = self._cache.get(key)
        if cached is not None:
            cached_handle, cached_mode = cached
            if self._is_mode_compatible(cached_mode, mode):
                self._cache.move_to_end(key)
                return cached_handle
            # Incompatible mode: close the stale handle and reopen below.
            self._discard(key)

        try:
            file_handle = handler.open_file_no_cache(open_path, mode)
        except Exception as e:
            logging.debug(f"Failed to open file {open_path}: {e}")
            return None
        if file_handle is None:
            return None

        # A new key is inserted at the most-recently-used end.
        self._cache[key] = (file_handle, mode)
        self._handlers[key] = handler

        # Evict from the LRU end, but never the handle we are about to return.
        while len(self._cache) > max(1, self.max_open_files):
            self._evict_oldest()

        return file_handle

    def _evict_oldest(self) -> None:
        """Remove the least recently used file from cache and close it."""
        if not self._cache:
            return
        oldest_path, (oldest_handle, _) = self._cache.popitem(last=False)
        self._close_handle(oldest_path, oldest_handle)
        self._handlers.pop(oldest_path, None)

    def close_all(self) -> None:
        """Close all cached file handles and empty the cache."""
        for file_path, (file_handle, _) in list(self._cache.items()):
            self._close_handle(file_path, file_handle)
        self._cache.clear()
        self._handlers.clear()

    def remove(self, file_path: str) -> None:
        """
        Remove a specific file from cache and close it.

        Does nothing when the file is not cached.

        Args:
            file_path: Path to the file to remove
        """
        self._discard(self._cache_key(file_path))

    def __len__(self) -> int:
        """Return number of cached files."""
        return len(self._cache)

    def __contains__(self, file_path: str) -> bool:
        """Check if a file is in cache."""
        return self._cache_key(file_path) in self._cache

    @staticmethod
    def _is_mode_compatible(cached_mode: str, requested_mode: str) -> bool:
        """
        Determine if a handle opened with ``cached_mode`` can serve ``requested_mode``.

        - 'r' can only serve 'r'.
        - 'r+' and 'a' can serve each other and 'r'.
        - 'w', 'w+' and 'x' are destructive and never compatible, on either side.
        - Any other mode string is treated as incompatible, which forces a reopen.
        """
        readonly_modes = {"r"}
        rw_modes = {"r+", "a"}
        destructive_modes = {"w", "w+", "x"}

        if cached_mode in destructive_modes or requested_mode in destructive_modes:
            return False
        if cached_mode in readonly_modes:
            return requested_mode in readonly_modes
        if cached_mode in rw_modes:
            return requested_mode in rw_modes or requested_mode in readonly_modes
        return False


class ExternalArrayHandler(ABC):
    """
    Base class for handling external array storage (HDF5, Parquet, CSV, etc.).

    This abstract interface defines the contract for reading, writing, and querying
    metadata from external array files. Implementations for specific formats extend
    this class and handle format-specific details.

    Key features:
    - Format-agnostic interface
    - Support for file paths, BytesIO, or already-opened file handles
    - Metadata queries without loading full arrays
    - Support for sub-array selection via start_indices and counts (RESQML v2.2)
    """

    def __init__(self, max_open_files: int = 3):
        """Create the handler together with its own :class:`FileCacheManager`.

        Args:
            max_open_files: Passed through to the file cache.
        """
        self.file_cache = FileCacheManager(max_open_files=max_open_files)

    @abstractmethod
    def read_array(
        self,
        source: Union[BytesIO, str, Any],
        path_in_external_file: Optional[str] = None,
        start_indices: Optional[List[int]] = None,
        counts: Optional[List[int]] = None,
    ) -> Optional[np.ndarray]:
        """
        Read array data from external storage with optional sub-selection.

        Args:
            source: File path, BytesIO, or already-opened file handle
            path_in_external_file: Path/dataset name within the file (format-specific)
            start_indices: Optional start index for each dimension (RESQML v2.2 StartIndex)
            counts: Optional count of elements for each dimension (RESQML v2.2 Count)

        Returns:
            Numpy array if successful, None otherwise. If start_indices and counts are
            provided, returns the sub-selected portion of the array.
        """
        pass

    @abstractmethod
    def write_array(
        self,
        target: Union[str, BytesIO, Any],
        array: Union[list, np.ndarray],
        path_in_external_file: Optional[str] = None,
        start_indices: Optional[List[int]] = None,
        **kwargs,
    ) -> bool:
        """
        Write array data to external storage with optional offset.

        Args:
            target: File path, BytesIO, or already-opened file handle
            array: Data to write
            path_in_external_file: Path/dataset name within the file (format-specific)
            start_indices: Optional start index for each dimension for partial writes
            **kwargs: Additional format-specific parameters

        Returns:
            True if successful, False otherwise
        """
        pass

    @abstractmethod
    def get_array_metadata(
        self,
        source: Union[BytesIO, str, Any],
        path_in_external_file: Optional[str] = None,
        start_indices: Optional[List[int]] = None,
        counts: Optional[List[int]] = None,
    ) -> Optional[Union[dict, List[dict]]]:
        """
        Get metadata about arrays in external storage without loading the data.

        Args:
            source: File path, BytesIO, or already-opened file handle
            path_in_external_file: Specific array path, or None to get all arrays
            start_indices: Optional start index for each dimension
            counts: Optional count of elements for each dimension

        Returns:
            Dict with keys: 'path', 'dtype', 'shape', 'size' for single array.
            If start_indices and counts provided, 'shape' reflects the sub-selection.
            List of such dicts if path_in_external_file is None.
            None if not found or error.
        """
        pass

    @abstractmethod
    def list_arrays(self, source: Union[BytesIO, str, Any]) -> List[str]:
        """
        List all array paths/dataset names in the external file.

        Args:
            source: File path, BytesIO, or already-opened file handle

        Returns:
            List of array path strings
        """
        pass

    @abstractmethod
    def can_handle_file(self, file_path: str) -> bool:
        """
        Check if this handler can process the given file based on extension.

        Args:
            file_path: Path to the file

        Returns:
            True if this handler supports the file format
        """
        pass

    @abstractmethod
    def open_file_no_cache(self, file_path: str, mode: str = "r") -> Optional[Any]:
        """
        Open a file without going through the cache.

        This is the hook :class:`FileCacheManager` calls on a cache miss.

        Args:
            file_path: Path to the file
            mode: File open mode

        Returns:
            Open file handle, or None if opening failed
        """
        pass
NO_KIND = "NO_KIND"


class RepresentationContext(BaseModel):
    """Everything the workspace knows about one representation, gathered once.

    On construction (and on every :meth:`update`) the context collects, for
    the wrapped object: its relationships, the related property objects, the
    CRS objects it references, the graphical-information entries that target
    it, and the time series it references.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    obj: Any = Field(...)
    workspace: EnergymlStorageInterface = Field(...)
    uri: Uri = Field(default="")

    crs: List[Any] = Field(default_factory=list)
    rels: List[Relationship] = Field(default_factory=list)

    # Property uuid → property object. The attribute name says "kind" but the
    # key is the property's ``uuid`` (``NO_KIND`` when it has none).
    properties_by_kind: dict = Field(default_factory=dict)

    # GraphicalInformationSet uri → list of entries targeting this representation
    graphical_info: dict = Field(default_factory=dict)

    time_series: list = Field(default_factory=list)

    def __init__(self, obj: Any, workspace: EnergymlStorageInterface, **data):
        """Wrap *obj* and immediately collect its context from *workspace*."""
        super().__init__(obj=obj, workspace=workspace, uri=get_obj_uri(obj), **data)
        self.update()

    def update(self):
        """Re-read relationships and rebuild every derived collection."""
        self.rels = self.workspace.get_obj_rels(self.obj)
        self._collect_properties(self.rels)
        self._collect_crs()
        self._collect_graphical_info(self.rels)
        self.collect_time_series()

    def collect_time_series(self):
        """Resolve the ``time_series`` references of the object into ``self.time_series``."""
        self.time_series = []
        time_series_dors = search_attribute_matching_name(self.obj, r"time_series")
        if time_series_dors is not None:
            for ts_dor in time_series_dors:
                ts_obj = self.workspace.get_object(get_obj_uri(ts_dor))
                if ts_obj is not None:
                    self.time_series.append(ts_obj)
                else:
                    logging.warning(f"TimeSeries {get_obj_uri(ts_dor)} not found in workspace")

    def _collect_properties(self, rels: List[Relationship]):
        """Load every relationship target whose path contains ``Property``.

        Args:
            rels: Relationships to scan (the caller passes ``self.rels``).
        """
        self.properties_by_kind = {}
        for r in rels:
            if "Property" in r.target:
                uuid, version = extract_uuid_and_version_from_obj_path(r.target)
                prop = self.workspace.get_object_by_uuid_versioned(uuid, version)
                if prop is None:
                    logging.warning(f"Property {r.target} not found in workspace")
                    continue
                prop_uuid = getattr(prop, "uuid", NO_KIND)
                self.properties_by_kind[prop_uuid] = prop

    def _collect_crs(self):
        """Resolve the CRS references found on the representation into ``self.crs``."""
        self.crs = []
        crs_dors = search_attribute_matching_name(self.obj, r"\.*Crs", search_in_sub_obj=True, deep_search=False)
        if crs_dors is not None and len(crs_dors) > 0:
            for crs_ref in crs_dors:
                if crs_ref is not None:
                    crs = self.workspace.get_object(get_obj_uri(crs_ref))
                    if crs is not None:
                        self.crs.append(crs)
                    else:
                        logging.warning(f"CRS {get_obj_uri(crs_ref)} not found in workspace")

    def _collect_graphical_info(self, rels: List[Relationship]):
        """Collect graphical-information entries that target this representation.

        Args:
            rels: Relationships to scan (the caller passes ``self.rels``).
        """
        self.graphical_info = {}
        for r in rels:
            if "GraphicalInformationSet" in r.target:
                uuid, version = extract_uuid_and_version_from_obj_path(r.target)
                graphical_info_set = self.workspace.get_object_by_uuid_versioned(uuid, version)
                if graphical_info_set is None:
                    logging.warning(f"GraphicalInformationSet {r.target} not found in workspace")
                    continue
                graphical_info_set_uri = get_obj_uri(graphical_info_set)
                for graphical_info in getattr(graphical_info_set, "graphical_information", []):
                    target_dors = getattr(graphical_info, "target_object", None)
                    if target_dors is not None:
                        if not isinstance(target_dors, list):
                            target_dors = [target_dors]
                        for target_dor in target_dors:
                            target_dor_uuid = get_obj_uuid(target_dor)
                            if target_dor_uuid == self.uri.uuid:
                                if graphical_info_set_uri not in self.graphical_info:
                                    self.graphical_info[graphical_info_set_uri] = []
                                self.graphical_info[graphical_info_set_uri].append(graphical_info)
                                # One match is enough: record the entry once.
                                break

    def get_default_color(self) -> ScalarRenderingInfo:
        """Return the first rendering info that can be read for this representation.

        When no graphical entry yields one, a :class:`ScalarRenderingInfo`
        with a constant color derived from the representation uuid is returned.
        """
        for entries in self.graphical_info.values():
            for entry in entries:
                try:
                    rendering_info = read_graphical_rendering_info(entry, self.workspace)
                    if rendering_info is not None:
                        return rendering_info
                except Exception as exc:
                    logging.debug(f"Error reading graphical rendering info for entry {entry}: {exc}")
        # No color information found: derive a color from the uuid
        return ScalarRenderingInfo(constant_color=RgbaColor.from_uuid(self.uri.uuid))

    def get_property(self, property_uuid: str) -> Optional[Any]:
        """Return the property object with the given uuid, or None."""
        return self.properties_by_kind.get(property_uuid)

    def get_properties_time_series(self, property_uuid: str) -> Dict[str, List[Any]]:
        """
        Return a time-indexed dict {time_step_str: [property_values, ...]} for
        the given property uuid. Returns an empty dict when the property is
        unknown, has no time series reference, or the series is not in the
        workspace.
        """
        # Imported here rather than at module level (presumably to avoid a
        # circular import with the mesh module — TODO confirm).
        from energyml.utils.data.mesh import read_time_series, read_property

        prop = self.get_property(property_uuid)
        if prop is None:
            logging.warning(f"Property {property_uuid} not found in context")
            return {}

        time_series_dor = search_attribute_matching_name(prop, r"TimeSeries")
        if not time_series_dor:
            return {}

        ts_obj = self.workspace.get_object(get_obj_uri(time_series_dor[0]))
        if ts_obj is None:
            return {}

        steps = read_time_series(ts_obj, self.workspace)
        values = read_property(prop, self.workspace)

        result: Dict[str, List[Any]] = {}
        for step_idx, dt in steps:
            # Steps without a matching value slot map to an empty list.
            result[str(dt)] = values[step_idx] if step_idx < len(values) else []
        return result

    def search_same_representation_in_other_time_step(self) -> List[Uri]:
        """Find sibling representations that share a TimeSeries with this one.

        Candidates are the objects related to this representation's
        ``represented_interpretation`` whose relationship target contains this
        representation's object type but not its uuid. A candidate is kept
        when it references at least one of the same TimeSeries uuids.

        Returns:
            URIs of the matching representations; an empty list when this
            representation has no time series or no represented interpretation.
        """
        if self.time_series is None or len(self.time_series) == 0:
            logging.debug(
                f"Representation {self.uri} has no TimeSeries reference, skipping search for same representation in other time step"
            )
            return []
        interpretation_dor = getattr(self.obj, "represented_interpretation", None)
        if interpretation_dor is None:
            # Honour the declared return type: callers iterate the result.
            return []

        obj_time_series_uuids = {get_obj_uuid(ts) for ts in self.time_series}

        similar_representations = []

        interp_rels = self.workspace.get_obj_rels(get_obj_uri(interpretation_dor))
        for r in interp_rels:
            if self.uri.object_type in r.target and self.uri.uuid not in r.target:
                candidate_uuid, candidate_version = extract_uuid_and_version_from_obj_path(r.target)
                candidate = self.workspace.get_object_by_uuid_versioned(candidate_uuid, candidate_version)

                if candidate is not None:
                    candidate_time_series_dor = search_attribute_matching_name(candidate, r"time_series")
                    candidate_time_series_uuids = (
                        {get_obj_uuid(ts) for ts in candidate_time_series_dor} if candidate_time_series_dor else set()
                    )
                    # Keep the candidate when it shares at least one TimeSeries
                    if obj_time_series_uuids & candidate_time_series_uuids:
                        similar_representations.append(get_obj_uri(candidate))

        return similar_representations

    def seach_same_representation_in_other_time_step(self) -> List[Uri]:
        """Misspelled alias kept for existing callers.

        Use :meth:`search_same_representation_in_other_time_step`.
        """
        return self.search_same_representation_in_other_time_step()

    def dump(self) -> str:
        """Return a human-readable summary of the context for debugging."""
        lines: List[str] = []
        lines.append("=" * 60)
        lines.append("RepresentationContext")
        lines.append(f"  URI      : {self.uri}")
        lines.append(f"  Type     : {type(self.obj).__name__}")
        lines.append("")

        lines.append(f"  CRS ({len(self.crs)}):")
        for c in self.crs:
            lines.append(f"    - {type(c).__name__}  {get_obj_uri(c)}")

        lines.append("")
        lines.append(f"  Relationships ({len(self.rels)}):")
        for r in self.rels:
            lines.append(f"    - [{r.type_ if hasattr(r, 'type_') else getattr(r, 'type', '?')}] {r.target}")

        lines.append("")
        lines.append(f"  Properties ({len(self.properties_by_kind)}):")
        for uuid, prop in self.properties_by_kind.items():
            kind = getattr(prop, "property_kind", "?")
            lines.append(f"    - {type(prop).__name__}  uuid={uuid}  kind={kind}")

        lines.append("")
        lines.append(f"  Graphical info ({len(self.graphical_info)} set(s)):")
        for uri, entries in self.graphical_info.items():
            lines.append(f"    - Set {uri}  ({len(entries)} entr{'y' if len(entries)==1 else 'ies'})")

        lines.append("=" * 60)
        return "\n".join(lines)
+ # representation_uri = "eml:///resqml20.obj_Grid2dRepresentation(030a82f6-10a7-4ecf-af03-54749e098624)" + + from energyml.utils.epc import Epc + + epc = Epc.read_file(epc_path) + workspace = epc # Epc extends EnergymlStorageInterface directly + + repr_obj = workspace.get_object(representation_uri) + if repr_obj is None: + print(f"ERROR: object not found for URI {representation_uri}") + sys.exit(1) + + repr_ctx = RepresentationContext(repr_obj, workspace) + + # --- dump of values --- + print(repr_ctx.dump()) + + # Detail: CRS info + if repr_ctx.crs: + from energyml.utils.data.crs import extract_crs_info + + print("\nCRS details:") + for c in repr_ctx.crs: + info = extract_crs_info(c, workspace) + print(f" {type(c).__name__}:") + print(f" x_offset={info.x_offset}, y_offset={info.y_offset}, z_offset={info.z_offset}") + print(f" z_increasing_downward={info.z_increasing_downward}") + print(f" projected_epsg={info.projected_epsg_code}, vertical_epsg={info.vertical_epsg_code}") + print(f" areal_rotation={info.areal_rotation_value} {info.areal_rotation_uom}") + print(f" axis_order={info.projected_axis_order}") + + # Detail: property arrays (truncated) + if repr_ctx.properties_by_kind: + from energyml.utils.data.mesh import read_property + + print("\nProperty arrays (first 10 values):") + for uuid, prop in repr_ctx.properties_by_kind.items(): + try: + arr = read_property(prop, workspace) + print(f" {type(prop).__name__} [{uuid}]: shape={getattr(arr, 'shape', len(arr))} sample={arr[:10]}") + except Exception as exc: + print(f" {type(prop).__name__} [{uuid}]: ERROR reading — {exc}") + + # print property time series values + if repr_ctx.properties_by_kind: + print("\nProperty time series values:") + for uuid, prop in repr_ctx.properties_by_kind.items(): + try: + ts_values = repr_ctx.get_properties_time_series(uuid) + if ts_values: + print(f" {type(prop).__name__} [{uuid}]:") + for time_step, values in ts_values.items(): + print(f" - Time {time_step}: sample={values[:10]}") + 
except Exception as exc: + print(f" {type(prop).__name__} [{uuid}]: ERROR reading time series — {exc}") diff --git a/energyml-utils/src/energyml/utils/epc.py b/energyml-utils/src/energyml/utils/epc.py index e44fe22..c68dcd1 100644 --- a/energyml-utils/src/energyml/utils/epc.py +++ b/energyml-utils/src/energyml/utils/epc.py @@ -5,82 +5,51 @@ """ import datetime -import json +import threading import logging import os from pathlib import Path import random -import re +import time import traceback import zipfile from dataclasses import dataclass, field +from functools import wraps from io import BytesIO -from typing import List, Any, Union, Dict, Callable, Optional, Tuple - -from energyml.opc.opc import ( - CoreProperties, - Relationships, - Types, - Default, - Relationship, - Override, - Created, - Creator, - Identifier, - Keywords1, - TargetMode, -) -from energyml.utils.storage_interface import DataArrayMetadata, EnergymlStorageInterface, ResourceMetadata +from typing import List, Any, Set, Tuple, Union, Dict, Optional import numpy as np -from .uri import Uri, parse_uri + +from enum import Enum from xsdata.formats.dataclass.models.generics import DerivedElement -from .constants import ( - RELS_CONTENT_TYPE, - RELS_FOLDER_NAME, +from energyml.opc.opc import CoreProperties, Relationships, Types, Relationship, Override +from energyml.utils.storage_interface import DataArrayMetadata, EnergymlStorageInterface, ResourceMetadata +from energyml.utils.uri import Uri, parse_uri + +from energyml.utils.constants import ( EpcExportVersion, RawFile, EPCRelsRelationshipType, - MimeType, - content_type_to_qualified_type, - qualified_type_to_content_type, - split_identifier, - get_property_kind_dict_path_as_dict, - OptimizedRegex, ) -from .data.datasets_io import ( - HDF5FileReader, - HDF5FileWriter, +from energyml.utils.data.datasets_io import ( + get_handler_registry, read_external_dataset_array, ) -from .exception import UnparsableFile -from .introspection import ( +from 
class EnergymlObjectCollection:
    """
    Ordered container of energyml objects with dictionary indices on the side.

    The collection behaves like a list (``append``, ``remove``, iteration,
    indexing, ``len``, truthiness) so existing list-based code keeps working,
    while lookups by identifier, URI and uuid go through dictionaries instead
    of scanning the list.
    """

    def __init__(self, objects: Optional[List[Any]] = None):
        self._by_identifier: Dict[str, Any] = {}
        self._by_uri: Dict[str, Any] = {}
        self._by_uuid: Dict[str, List[Any]] = {}
        self._objects_list: List[Any] = []

        for initial in objects or []:
            self.append(initial)

    def _drop_secondary_indices(self, stored: Any) -> None:
        """Forget *stored* in the URI index and in its uuid bucket."""
        self._by_uri.pop(str(get_obj_uri(stored)), None)

        stored_uuid = get_obj_uuid(stored)
        bucket = self._by_uuid.get(stored_uuid)
        if bucket is not None and stored in bucket:
            bucket.remove(stored)
            if not bucket:
                # Last version for this uuid is gone: drop the empty bucket.
                del self._by_uuid[stored_uuid]

    def append(self, obj: Any) -> None:
        """Add *obj*; an object with the same identifier is replaced in place."""
        identifier = get_obj_identifier(obj)
        uri = str(get_obj_uri(obj))
        uuid = get_obj_uuid(obj)

        previous = self._by_identifier.get(identifier) if identifier in self._by_identifier else None
        if identifier in self._by_identifier:
            # Same identifier already stored: keep its list position.
            position = self._objects_list.index(previous)
            self._objects_list[position] = obj
            self._drop_secondary_indices(previous)
        else:
            self._objects_list.append(obj)

        # (Re)register the object in every index.
        self._by_identifier[identifier] = obj
        self._by_uri[uri] = obj

        same_uuid = self._by_uuid.setdefault(uuid, [])
        if obj not in same_uuid:
            same_uuid.append(obj)

    def remove(self, obj: Any) -> None:
        """Remove the stored object sharing *obj*'s identifier; no-op if absent."""
        identifier = get_obj_identifier(obj)
        if identifier not in self._by_identifier:
            return

        stored = self._by_identifier[identifier]
        self._objects_list.remove(stored)
        del self._by_identifier[identifier]
        self._drop_secondary_indices(stored)

    def get_by_identifier(self, identifier: Union[str, Uri]) -> Optional[Any]:
        """Look an object up by identifier, falling back to its URI string."""
        found = self._by_identifier.get(as_identifier(identifier))
        if found is None:
            found = self._by_uri.get(str(identifier))
        return found

    def get_by_uuid(self, uuid: str) -> List[Any]:
        """Return every stored object carrying this uuid (empty list if none)."""
        return self._by_uuid.get(uuid, [])

    def __iter__(self):
        """Iterate over the objects in list order."""
        return iter(self._objects_list)

    def __len__(self) -> int:
        """Number of stored objects."""
        return len(self._objects_list)

    def __getitem__(self, index: int) -> Any:
        """Positional access, e.g. ``energyml_objects[0]``."""
        return self._objects_list[index]

    def __bool__(self) -> bool:
        """True when at least one object is stored."""
        return len(self._objects_list) > 0


class EpcRelsCacheErrorPolicy(Enum):
    """How relationship-cache errors are handled: log, raise, or skip."""

    LOG = "log"
    RAISE = "raise"
    SKIP = "skip"
+ + - add_supplemental_rels(obj: Union[str, Uri, Any], rels: Union[Relationship, List[Relationship]]) -> None + Add supplemental relationships for an object. These persist across cache clears and are merged lazily. + + - get_object_rels(obj: Union[str, Uri, Any]) -> List[Relationship] + Return the effective relationships for an object, merging computed and supplemental rels, deduplicated. + + - get_object_relationships(obj: Union[str, Uri, Any]) -> Relationships + Return RelationShips(get_object_rels(obj)) for the given object. + + - compute_rels(parallel: bool = False, recompute_all: bool = False) -> Dict[Uri, List[Relationship]] + Recompute all relationships. If parallel=True, use a thread/process pool for the map phase. + Returns a mapping of Uri to deduplicated relationships. Export logic (rels path, target) is handled by Epc. + + - update_cache_for_object(obj: Union[str, Uri, Any]) -> None + Incrementally update relationships for a single object add, remove, or modification. + + - clear_cache() and recompute_cache(parallel=False) + Clear or fully recompute the internal cache. + + - clean_rels(obj: Union[str, Uri, Any] = None) -> None + Deduplicate relationships for a given object or all objects. Called after full recompute. + + - validate_rels() -> Dict[str, Any] + Run validation checks: duplicate rels, missing reverse links, circular references, etc. + Returns a report of issues found. + + Implementation Notes + ------------------- + - All public methods accept identifier, Uri, str(Uri), or object; internally, always convert to Uri with a specific function to avoid code duplication. + - Internal caches: {Uri: List[Relationship]} for computed, {Uri: List[Relationship]} for supplemental, {Uri: Set[Uri]} for reverse index (target -> sources). + - Reverse index enables O(1) lookup of which objects reference a given target, critical for incremental updates. + - No rels path management; Epc class is responsible for rels file path and target attribute generation. 
+ - Relationship IDs must be deterministic (e.g., UUIDv5 or hash of source+target+type). + - On exception, log/skip/raise according to error_policy. Omitted objects do not block the pipeline. + - clean_rels() can be parallelized, as deduplication is per-object. + - Use threading.Lock or RLock to protect cache updates. Only lock during writes. + + Behavioural Invariants + --------------------- + - Canonical in-memory key: Uri. Never mix identifier and Uri in the same map. + - Supplemental rels are preserved and merged lazily; not lost on clear/recompute unless explicitly removed. + - All deduplication and validation is performed on the in-memory Uri-keyed data. + + Validation & Testing + ------------------- + - clean_rels() ensures no duplicate (type, target) relationships per object. + - validate_rels() checks for missing reverse links, circular references, and other edge cases. + - Unit tests should cover all input types, deduplication, error handling, and validation. + - Use EnergymlObjectCollection for initial tests. + + Migration/Integration + -------------------- + - This class is standalone. Once implemented and tested, integrate into Epc, replacing legacy rels handling. + - No migration needed until integration. + + """ + + def __init__(self, epc_or_collection, export_version=None, error_policy=EpcRelsCacheErrorPolicy.LOG): + """ + Initialize the EpcRelsCache. + :param epc_or_collection: Epc instance or EnergymlObjectCollection + :param export_version: EPC export version. 
If None and epc_or_collection is Epc, uses epc.export_version + (used for rels path/target generation; when an Epc is given, the Epc's own export_version is always the one used; a plain collection defaults to EpcExportVersion.CLASSIC) + :param error_policy: EpcRelsCacheErrorPolicy enum value for error handling + """ + self._lock = threading.RLock() + if isinstance(error_policy, str): + # Allow legacy string for backward compatibility + error_policy = EpcRelsCacheErrorPolicy(error_policy.lower()) + self._error_policy = error_policy + # Accept Epc or EnergymlObjectCollection + if isinstance(epc_or_collection, Epc): + self._objects = epc_or_collection.energyml_objects + self._epc = epc_or_collection + self._export_version_fallback = export_version or epc_or_collection.export_version + else: + self._objects = epc_or_collection + self._epc = None + self._export_version_fallback = export_version or EpcExportVersion.CLASSIC + # Internal caches + self._computed_rels = {} # {Uri: List[Relationship]} + self._supplemental_rels = {} # {Uri: List[Relationship]} + self._reverse_index: Dict[Uri, Set[Uri]] = {} # {target_uri: {source_uris}} + + @property + def export_version(self) -> EpcExportVersion: + """Get the current export version, using Epc's version if available.""" + if self._epc is not None: + return self._epc.export_version + return self._export_version_fallback + + def _uri_from_any(self, obj_or_id: Any) -> "Uri": + """ + Normalize input to canonical Uri. + Accepts identifier, Uri, str(Uri), or object.
+ """ + if isinstance(obj_or_id, Uri): + return obj_or_id + if hasattr(obj_or_id, "object_version") or hasattr(obj_or_id, "__dict__"): + # Likely an energyml object + return get_obj_uri(obj_or_id) + if isinstance(obj_or_id, str): + # Try parse as Uri + uri = parse_uri(obj_or_id) + if uri: + return uri + # Try as identifier + obj = None + if self._epc and hasattr(self._epc, "get_object_by_identifier"): + obj = self._epc.get_object_by_identifier(obj_or_id) + elif hasattr(self._objects, "get_by_identifier"): + obj = self._objects.get_by_identifier(obj_or_id) + if obj: + return get_obj_uri(obj) + raise ValueError(f"Cannot resolve to Uri: {obj_or_id}") + + def set_rels_from_file(self, obj: Any, rels: "Relationships") -> None: + """Attach relationships loaded from a .rels file to the given object.""" + uri = self._uri_from_any(obj) + with self._lock: + self._computed_rels[uri] = list(rels.relationship) if hasattr(rels, "relationship") else list(rels) + + # check supplemental to keep : + for r in rels.relationship or []: + if r.type_value not in ( + str(EPCRelsRelationshipType.DESTINATION_OBJECT), + str(EPCRelsRelationshipType.SOURCE_OBJECT), + str(EPCRelsRelationshipType.ML_TO_EXTERNAL_PART_PROXY), + str(EPCRelsRelationshipType.EXTERNAL_PART_PROXY_TO_ML), + ): + if uri not in self._supplemental_rels: + self._supplemental_rels[uri] = [] + self._supplemental_rels[uri].append(r) + + # self._supplemental_rels[uri] = list(rels.relationship) if hasattr(rels, "relationship") else list(rels) + + def add_supplemental_rels(self, obj: Any, rels: Union["Relationship", List["Relationship"]]) -> None: + """Add supplemental relationships for an object.""" + uri = self._uri_from_any(obj) + with self._lock: + if uri not in self._supplemental_rels: + self._supplemental_rels[uri] = [] + if isinstance(rels, list): + self._supplemental_rels[uri].extend(rels) + else: + self._supplemental_rels[uri].append(rels) + + def get_supplemental_rels(self, obj: Any, default=None) -> 
List["Relationship"]: + """Get supplemental relationships for an object.""" + uri = self._uri_from_any(obj) + with self._lock: + return self._supplemental_rels.get(uri, default if default is not None else []) + + def get_object_rels(self, obj: Any) -> List["Relationship"]: + """Return the effective relationships for an object, merging computed and supplemental rels, deduplicated.""" + uri = self._uri_from_any(obj) + with self._lock: + rels = list(self._computed_rels.get(uri, [])) + rels.extend(self._supplemental_rels.get(uri, [])) + return self._deduplicate_rels(rels) + + def compute_rels(self, parallel: bool = False) -> Dict["Uri", List["Relationship"]]: + """ + Recompute all relationships, including reverse relationships. If parallel=True, use a thread/process pool for the map phase. + Returns a mapping of Uri to deduplicated relationships. + """ + import collections + import concurrent.futures + + with self._lock: + self._computed_rels.clear() + objects = list(self._objects) + + # First pass: collect direct DORs for each object + def map_func(obj) -> Optional[Tuple[Uri, Set[Uri], Set[Tuple[str, str]]]]: + try: + uri = get_obj_uri(obj) + dor_uris, external_uris = self._get_direct_dor_uris(obj) + return (uri, dor_uris, external_uris) + except Exception as e: + self._handle_error(f"Failed to compute DORs for {obj}: {e}") + return None + + results = [] + if parallel: + with concurrent.futures.ThreadPoolExecutor() as executor: + for res in executor.map(map_func, objects): + if res: + results.append(res) + else: + for obj in objects: + res = map_func(obj) + if res: + results.append(res) + + # Second pass: build forward and reverse relationships + rels_map = collections.defaultdict(list) # {Uri: List[Relationship]} + for src_uri, dor_uris, external_uris in results: + src_path = gen_energyml_object_path(src_uri, export_version=self.export_version) + for tgt_uri in dor_uris: + tgt_path = gen_energyml_object_path(tgt_uri, export_version=self.export_version) + # Forward 
rel (src -> tgt) + rels_map[src_uri].append( + Relationship( + target=tgt_path, + type_value=get_rels_dor_type(dor_target=tgt_path, in_dor_owner_rels_file=True), + id=f"_{gen_uuid()}", + ) + ) + # Reverse rel (tgt -> src) + rels_map[tgt_uri].append( + Relationship( + target=src_path, + type_value=get_rels_dor_type(dor_target=tgt_path, in_dor_owner_rels_file=False), + id=f"_{gen_uuid()}", + ) + ) + for ext_uri, _ in external_uris: + rels_map[src_uri].append(create_external_relationship(ext_uri)) + + # Build reverse index from results + reverse_idx = collections.defaultdict(set) + for src_uri, dor_uris, external_uris in results: + for tgt_uri in dor_uris: + reverse_idx[tgt_uri].add(src_uri) + + with self._lock: + self._computed_rels = dict(rels_map) + self._reverse_index = {k: v for k, v in reverse_idx.items()} + self.clean_rels() + return {uri: self.get_object_rels(uri) for uri in self._computed_rels} + + def update_cache_for_object(self, obj: Any) -> None: + """Incrementally update relationships for a single object, including reverse relationships.""" + uri = self._uri_from_any(obj) + dor_uris, external_uris = self._get_direct_dor_uris(obj) + + with self._lock: + # Remove old reverse index entries for this object + if uri in self._computed_rels: + # Find old DOR targets and clean them up + old_rels = self._computed_rels.get(uri, []) + for old_rel in old_rels: + # Extract target URI from path (approximate - we'll rebuild from scratch) + pass + + # Clean up old reverse index entries where this object was the source + for tgt_uri, sources in list(self._reverse_index.items()): + if uri in sources: + sources.discard(uri) + if not sources: + del self._reverse_index[tgt_uri] + + # Compute forward relationships for this object + forward_rels = [] + src_path = gen_energyml_object_path(uri, export_version=self.export_version) + + for tgt_uri in dor_uris: + tgt_path = gen_energyml_object_path(tgt_uri, export_version=self.export_version) + # Forward rel (this object -> target) 
+ forward_rels.append( + Relationship( + target=tgt_path, + type_value=get_rels_dor_type(dor_target=tgt_path, in_dor_owner_rels_file=True), + id=f"_{gen_uuid()}", + ) + ) + + # Update reverse index: target is now referenced by this object + if tgt_uri not in self._reverse_index: + self._reverse_index[tgt_uri] = set() + self._reverse_index[tgt_uri].add(uri) + + # Add reverse rel to target if target exists in cache + if tgt_uri in self._computed_rels: + reverse_rel = Relationship( + target=src_path, + type_value=get_rels_dor_type(dor_target=tgt_path, in_dor_owner_rels_file=False), + id=f"_{gen_uuid()}", + ) + self._computed_rels[tgt_uri].append(reverse_rel) + + # Compute reverse relationships from index (who references me?) + reverse_rels = [] + for src_uri in self._reverse_index.get(uri, set()): + if src_uri != uri: # Avoid self-references + src_path = gen_energyml_object_path(src_uri, export_version=self.export_version) + tgt_path = gen_energyml_object_path(uri, export_version=self.export_version) + reverse_rels.append( + Relationship( + target=src_path, + type_value=get_rels_dor_type(dor_target=tgt_path, in_dor_owner_rels_file=False), + id=f"_{gen_uuid()}", + ) + ) + + for ext_uri, _ in external_uris: + forward_rels.append(create_external_relationship(ext_uri)) + + # Store combined relationships + self._computed_rels[uri] = forward_rels + reverse_rels + + def clear_cache(self) -> None: + """Clear the internal caches and reverse index.""" + with self._lock: + self._computed_rels.clear() + self._reverse_index.clear() + + def recompute_cache(self, parallel: bool = False) -> Dict["Uri", List["Relationship"]]: + """Fully recompute the internal cache.""" + return self.compute_rels(parallel=parallel) + + def clean_rels(self, obj: Optional[Any] = None) -> None: + """ + Deduplicate relationships for a given object or all objects. + Removes duplicates by (target, type_value). 
+ """ + with self._lock: + if obj is not None: + uri = self._uri_from_any(obj) + rels = self._computed_rels.get(uri, []) + self._supplemental_rels.get(uri, []) + deduped = self._deduplicate_rels(rels) + self._computed_rels[uri] = deduped + else: + for uri in set(list(self._computed_rels.keys()) + list(self._supplemental_rels.keys())): + rels = self._computed_rels.get(uri, []) + self._supplemental_rels.get(uri, []) + deduped = self._deduplicate_rels(rels) + self._computed_rels[uri] = deduped + + def validate_rels(self) -> Dict[str, Any]: + """ + Run validation checks: duplicate rels, orphaned references, circular references, etc. + Returns a report of issues found. + """ + report = {"duplicates": [], "orphaned_references": [], "circular": [], "index_inconsistency": []} + + with self._lock: + # Check for duplicates + for uri, rels in self._computed_rels.items(): + seen = set() + for rel in rels: + key = (getattr(rel, "target", None), getattr(rel, "type_value", None)) + if key in seen: + report["duplicates"].append((str(uri), key)) + else: + seen.add(key) + + # Check for orphaned references (references to non-existent objects) + all_object_uris = set() + if self._epc: + all_object_uris = {get_obj_uri(obj) for obj in self._epc.energyml_objects} + elif self._objects: + all_object_uris = {get_obj_uri(obj) for obj in self._objects} + + for target_uri, sources in self._reverse_index.items(): + # An object is orphaned if it's referenced but doesn't exist in the collection + # Note: target_uri may be in _computed_rels due to reverse relationships, + # but that doesn't mean the object actually exists in the collection + if target_uri not in all_object_uris: + report["orphaned_references"].append( + {"target": str(target_uri), "referenced_by": [str(s) for s in sources]} + ) + + # Check reverse index consistency + for src_uri, rels in self._computed_rels.items(): + for rel in rels: + # Check if forward relationships are properly indexed + # This is a sanity check for the index 
+ pass + + return report + + def get_reverse_index_stats(self) -> Dict[str, Any]: + """ + Get statistics about the reverse reference index for debugging and validation. + Returns a dictionary with index statistics. + """ + with self._lock: + stats = { + "total_targets": len(self._reverse_index), + "total_references": sum(len(sources) for sources in self._reverse_index.values()), + "max_references_to_single_target": max( + (len(sources) for sources in self._reverse_index.values()), default=0 + ), + "targets_by_reference_count": {}, + } + + # Group targets by how many sources reference them + for target_uri, sources in self._reverse_index.items(): + count = len(sources) + if count not in stats["targets_by_reference_count"]: + stats["targets_by_reference_count"][count] = 0 + stats["targets_by_reference_count"][count] += 1 + + return stats + + def _handle_error(self, msg: str) -> None: + if self._error_policy == EpcRelsCacheErrorPolicy.LOG: + import logging + + logging.error(msg) + elif self._error_policy == EpcRelsCacheErrorPolicy.RAISE: + raise RuntimeError(msg) + # else: SKIP + + def _deduplicate_rels(self, rels: List["Relationship"]) -> List["Relationship"]: + """Remove duplicate relationships by (target, type_value).""" + seen = set() + result = [] + for rel in rels: + key = (getattr(rel, "target", None), getattr(rel, "type_value", None)) + if key not in seen: + seen.add(key) + result.append(rel) + return result + + def _remove_object_from_cache(self, obj: Any) -> None: + """ + Remove an object from the cache, cleaning up all references and reverse index entries. 
+ """ + uri = self._uri_from_any(obj) + + with self._lock: + # Remove from computed rels + if uri in self._computed_rels: + del self._computed_rels[uri] + + # Remove from supplemental rels + if uri in self._supplemental_rels: + del self._supplemental_rels[uri] + + # Remove from reverse index (as target) + if uri in self._reverse_index: + del self._reverse_index[uri] + + # Remove from reverse index (as source) + for target_uri, sources in list(self._reverse_index.items()): + if uri in sources: + sources.discard(uri) + if not sources: + del self._reverse_index[target_uri] + + # Remove reverse rels from targets' computed rels + for other_uri, other_rels in self._computed_rels.items(): + if other_uri != uri: + # Filter out relationships targeting the removed object + uri_path = gen_energyml_object_path(uri, export_version=self.export_version) + self._computed_rels[other_uri] = [ + rel for rel in other_rels if getattr(rel, "target", None) != uri_path + ] + + def _get_direct_dor_uris(self, obj: Any) -> Tuple[Set[Uri], Set[Tuple[str, str]]]: + """ + Return the set of direct DOR target Uris for the given object and Tuple[filepath, mimetype] for external references. 
+ """ + try: + return get_dor_or_external_uris_from_obj(obj) + except Exception as e: + self._handle_error(f"Error getting direct DOR URIs: {e}") + return set(), set() + + +def log_timestamp(func): + """Decorator to log timestamps for function execution.""" + + @wraps(func) + def wrapper(*args, **kwargs): + func_name = func.__name__ + start_time = time.perf_counter() + timestamp_start = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + + # Get file path from arguments if available + file_path = None + if args: + if isinstance(args[0], str) and (args[0].endswith(".epc") or "/" in args[0] or "\\" in args[0]): + file_path = args[0] + elif hasattr(args[0], "epc_file_path"): + file_path = args[0].epc_file_path + if "path" in kwargs: + file_path = kwargs["path"] + elif "epc_file_path" in kwargs: + file_path = kwargs["epc_file_path"] + + path_info = f" [{file_path}]" if file_path else "" + print(f"⏱️ [{timestamp_start}] Starting {func_name}{path_info}") + + try: + result = func(*args, **kwargs) + elapsed = time.perf_counter() - start_time + timestamp_end = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + print(f"✅ [{timestamp_end}] Completed {func_name} in {elapsed:.3f}s{path_info}") + return result + except Exception as e: + elapsed = time.perf_counter() - start_time + timestamp_end = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + print(f"❌ [{timestamp_end}] Failed {func_name} after {elapsed:.3f}s{path_info}: {e}") + raise + + return wrapper + + +# --- HELPER FUNCTIONS (HORS CLASSE) --- + + +def _parallel_xml_read(xml_data: bytes, content_type: str): + try: + + target_class = get_class_from_content_type(content_type) + obj = read_energyml_xml_bytes(xml_data, target_class) + + if isinstance(obj, DerivedElement): + obj = obj.value + return obj + except Exception as e: + return e + + +def _parallel_rels_read(rels_bytes: bytes): + try: + return read_energyml_xml_bytes(rels_bytes, Relationships) + except Exception as e: + return e 
+ + +def _parallel_xml_serialize(obj): + """Serialize a Python object to XML bytes (meant to run in a separate process); returns the exception instead of raising on failure.""" + try: + return serialize_xml(obj) + except Exception as e: + return e @dataclass class Epc(EnergymlStorageInterface): """ - A class that represent an EPC file content + A class that represents an EPC file content. Creating an instance of this class with a file path will not directly load the file content if it exists. + To read an existing file, use the @read_file or @read_stream functions. + Moreover, you must explicitly call @export_file or @export_io functions to save the content of the instance. """ # content_type: List[str] = field( @@ -103,11 +752,11 @@ class Epc(EnergymlStorageInterface): export_version: EpcExportVersion = field(default=EpcExportVersion.CLASSIC) - core_props: CoreProperties = field(default=None) + core_props: Optional[CoreProperties] = field(default=None) """ xml files referred in the [Content_Types].xml """ - energyml_objects: List = field( - default_factory=list, + energyml_objects: EnergymlObjectCollection = field( + default_factory=EnergymlObjectCollection, ) """ Other files content like pdf etc """ @@ -127,18 +776,27 @@ class Epc(EnergymlStorageInterface): force_h5_path: Optional[str] = field(default=None) + """ Relationships cache for efficient rels computation and management """ + _rels_cache: Optional[EpcRelsCache] = field(default=None, init=False, repr=False) + """ - Additional rels for objects. Key is the object (same than in @energyml_objects) and value is a list of + Additional rels for objects (DEPRECATED - use _rels_cache.add_supplemental_rels instead). + Key is the object (same than in @energyml_objects) and value is a list of RelationShip.
This can be used to link an HDF5 to an ExternalPartReference in resqml 2.0.1 Key is a value returned by @get_obj_identifier """ - additional_rels: Dict[str, List[Relationship]] = field(default_factory=lambda: {}) + # additional_rels: Dict[str, List[Relationship]] = field(default_factory=lambda: {}) """ Epc file path. Used when loaded from a local file or for export """ epc_file_path: Optional[str] = field(default=None) + def __post_init__(self): + """Initialize the relationships cache after dataclass initialization.""" + if self._rels_cache is None: + self._rels_cache = EpcRelsCache(self, export_version=self.export_version) + def __str__(self): return ( "EPC file (" @@ -213,450 +871,772 @@ def add_file(self, obj: Union[List, bytes, BytesIO, str, RawFile]): else: logging.error(f"unsupported type {str(type(obj))}") - # EXPORT functions + # === Relationships management functions === - def gen_opc_content_type(self) -> Types: + def add_rels_for_object( + self, + obj: Any, + relationships: List[Relationship], + ) -> None: """ - Generates a :class:`Types` instance and fill it with energyml objects :class:`Override` values + Add relationships to an object in the EPC stream + :param obj: + :param relationships: :return: """ - ct = Types() - rels_default = Default() - rels_default.content_type = RELS_CONTENT_TYPE - rels_default.extension = "rels" - - ct.default = [rels_default] - ct.override = [] - for e_obj in self.energyml_objects: - ct.override.append( - Override( - content_type=get_content_type_from_class(type(e_obj)), - part_name=gen_energyml_object_path(e_obj, self.export_version), - ) - ) + self._rels_cache.add_supplemental_rels(obj, relationships) - if self.core_props is not None: - ct.override.append( - Override( - content_type=get_content_type_from_class(self.core_props), - part_name=gen_core_props_path(self.export_version), - ) - ) + # if isinstance(obj, str) or isinstance(obj, Uri): + # obj = self.get_object_by_identifier(obj) + # obj_ident = 
get_obj_identifier(obj) + # else: + # obj_ident = get_obj_identifier(obj) + # if obj_ident not in self.additional_rels: + # self.additional_rels[obj_ident] = [] - return ct + # self.additional_rels[obj_ident] = self.additional_rels[obj_ident] + relationships - def export_file(self, path: Optional[str] = None) -> None: + def rels_to_h5_file(self, obj: Any, h5_path: str) -> Relationship: """ - Export the epc file. If :param:`path` is None, the epc 'self.epc_file_path' is used - :param path: - :return: + Creates in the epc file, a Relation (in the object .rels file) to link a h5 external file. + Usually this function is used to link an ExternalPartReference to a h5 file. + :param obj: + :param h5_path: + :return: the Relationship added to the rels cache """ - if path is None: - path = self.epc_file_path + # obj_ident = get_obj_identifier(obj) + # if obj_ident not in self.additional_rels: + # self.additional_rels[obj_ident] = [] - # Ensure directory exists - if path is not None: - Path(path).parent.mkdir(parents=True, exist_ok=True) - epc_io = self.export_io() - with open(path, "wb") as f: - f.write(epc_io.getbuffer()) + nb_current_file = len(self.get_h5_file_paths(obj)) - def export_io(self) -> BytesIO: + rel = create_h5_external_relationship(h5_path=h5_path, current_idx=nb_current_file) + # self.additional_rels[obj_ident].append(rel) + self._rels_cache.add_supplemental_rels(obj, rel) + return rel + + def get_obj_rels(self, obj: Union[str, Uri, Any]) -> List[Relationship]: """ - Export the epc file into a :class:`BytesIO` instance. The result is an 'in-memory' zip file. - :return: + Get the relationships for a given energyml object using the cache. 
+ :param obj: The object identifier/URI or the object itself + :return: List of Relationship objects """ - zip_buffer = BytesIO() + # Ensure cache is initialized + if self._rels_cache is None: + self._rels_cache = EpcRelsCache(self, export_version=self.export_version) - with zipfile.ZipFile(zip_buffer, "a", zipfile.ZIP_DEFLATED, False) as zip_file: - # CoreProps - if self.core_props is None: - self.core_props = CoreProperties( - created=Created(any_element=epoch_to_date(epoch())), - creator=Creator(any_element="energyml-utils python module (Geosiris)"), - identifier=Identifier(any_element=f"urn:uuid:{gen_uuid()}"), - keywords=Keywords1( - lang="en", - content=["generated;Geosiris;python;energyml-utils"], - ), - version="1.0", - ) + # Convert identifier to object if needed + if isinstance(obj, str) or isinstance(obj, Uri): + obj = self.get_object_by_identifier(obj) + if obj is None: + return [] - zip_info_core = zipfile.ZipInfo( - filename=gen_core_props_path(self.export_version), - date_time=datetime.datetime.now().timetuple()[:6], - ) - data = serialize_xml(self.core_props) - zip_file.writestr(zip_info_core, data) + # Get relationships from cache (includes computed + supplemental rels) + return self._rels_cache.get_object_rels(obj) - # Energyml objects - for e_obj in self.energyml_objects: - e_path = gen_energyml_object_path(e_obj, self.export_version) - zip_info = zipfile.ZipInfo( - filename=e_path, - date_time=datetime.datetime.now().timetuple()[:6], - ) - data = serialize_xml(e_obj) - zip_file.writestr(zip_info, data) - - # Rels - for rels_path, rels in self.compute_rels().items(): - zip_info = zipfile.ZipInfo( - filename=rels_path, - date_time=datetime.datetime.now().timetuple()[:6], - ) - data = serialize_xml(rels) - zip_file.writestr(zip_info, data) - - # Other files: - for raw in self.raw_files: - zip_info = zipfile.ZipInfo( - filename=raw.path, - date_time=datetime.datetime.now().timetuple()[:6], - ) - zip_file.writestr(zip_info, raw.content.read()) + def 
update_rels_cache(self) -> None: + """Update the relationships cache for all objects. This should be called after any modification to the energyml objects to keep the cache consistent.""" + if self._rels_cache is None: + self._rels_cache = EpcRelsCache(self, export_version=self.export_version) + self._rels_cache.recompute_cache() - # ContentType - zip_info_ct = zipfile.ZipInfo( - filename=get_epc_content_type_path(), - date_time=datetime.datetime.now().timetuple()[:6], - ) - data = serialize_xml(self.gen_opc_content_type()) - zip_file.writestr(zip_info_ct, data) + def clean_rels_cache(self, obj: Any = None) -> None: + """Clean relationships for a specific object in the cache. If no object is provided, clean all relationships in the cache. This will remove duplicates and ensure consistency between computed and supplemental relationships.""" + if self._rels_cache is not None: + self._rels_cache.clean_rels(obj) - return zip_buffer + def clear_rels_cache(self) -> None: + """Clear the relationships cache. This removes the computed relationships and the reverse index; supplemental relationships are kept. Call update_rels_cache() to recompute the computed relationships.""" + if self._rels_cache is not None: + self._rels_cache.clear_cache() - def get_obj_rels(self, obj: Any) -> Optional[Relationships]: + def compute_rels(self, force_recompute_object_rels: bool = False) -> Dict[str, Relationships]: """ - Get the Relationships object for a given energyml object - :param obj: - :return: + Compute all relationships in the EPC file.
+ :param force_recompute_object_rels: If True, recompute all object relationships from scratch + :return: Dictionary mapping rels file paths to Relationships objects """ - rels_path = gen_rels_path( - energyml_object=obj, - export_version=self.export_version, - ) - all_rels = self.compute_rels() - if rels_path in all_rels: - return all_rels[rels_path] - return None + # Ensure cache is initialized + if self._rels_cache is None: + self._rels_cache = EpcRelsCache(self, export_version=self.export_version) - def compute_rels(self) -> Dict[str, Relationships]: - """ - Returns a dict containing for each objet, the rels xml file path as key and the RelationShips object as value - :return: - """ - dor_relation = get_reverse_dor_list(self.energyml_objects) + # Recompute cache if requested + if force_recompute_object_rels: + self._rels_cache.recompute_cache() - # destObject - rels = { - obj_id: [ - Relationship( - target=gen_energyml_object_path(target_obj, self.export_version), - type_value=EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(), - id=f"_{obj_id}_{get_obj_type(get_obj_usable_class(target_obj))}_{get_obj_identifier(target_obj)}", - ) - for target_obj in target_obj_list - ] - for obj_id, target_obj_list in dor_relation.items() - } - # sourceObject - for obj in self.energyml_objects: - obj_id = get_obj_identifier(obj) - if obj_id not in rels: - rels[obj_id] = [] - for target_obj in get_direct_dor_list(obj): - try: - rels[obj_id].append( - Relationship( - target=gen_energyml_object_path(target_obj, self.export_version), - type_value=EPCRelsRelationshipType.SOURCE_OBJECT.get_type(), - id=f"_{obj_id}_{get_obj_type(get_obj_usable_class(target_obj))}_{get_obj_identifier(target_obj)}", - ) - ) - except Exception: - logging.error(f'Failed to create rels for "{obj_id}" with target {target_obj}') + result = {} - # filtering non-accessible objects from DOR - rels = {k: v for k, v in rels.items() if self.get_object_by_identifier(k) is not None} + # all energyml objects - 
get relationships from cache + for obj in self.energyml_objects: + obj_file_rels_path = gen_rels_path(obj, export_version=self.export_version) - map_obj_id_to_obj = {get_obj_identifier(obj): obj for obj in self.energyml_objects} + # Get relationships from cache (includes computed + supplemental) + cached_rels = self._rels_cache.get_object_rels(obj) - obj_rels = { - gen_rels_path( - energyml_object=map_obj_id_to_obj.get(obj_id), - export_version=self.export_version, - ): Relationships( - relationship=obj_rels + (self.additional_rels[obj_id] if obj_id in self.additional_rels else []), - ) - for obj_id, obj_rels in rels.items() - } + result[obj_file_rels_path] = Relationships(relationship=cached_rels) # CoreProps - if self.core_props is not None: - obj_rels[gen_rels_path(self.core_props)] = Relationships( - relationship=[ + core_props = self.core_props or create_default_core_properties() + core_props_rels_path = gen_rels_path(core_props, self.export_version) + result[core_props_rels_path] = Relationships(relationship=[]) + for rf in self.raw_files: + if is_core_prop_or_extension_path(rf.path): + result[core_props_rels_path].relationship.append( Relationship( - target=gen_core_props_path(), - type_value=EPCRelsRelationshipType.CORE_PROPERTIES.get_type(), - id="CoreProperties", + target=rf.path, + type_value=str(EPCRelsRelationshipType.EXTENDED_CORE_PROPERTIES), + id=f"_{gen_uuid()}", ) - ] - ) + ) - return obj_rels + # ContentType + content_type_path_rels = get_epc_content_type_rels_path() + result[content_type_path_rels] = Relationships( + relationship=[ + Relationship( + id="CoreProperties", + type_value=str(EPCRelsRelationshipType.CORE_PROPERTIES), + target=gen_core_props_path(), + ) + ] + ) - def rels_to_h5_file(self, obj: Any, h5_path: str) -> Relationship: - """ - Creates in the epc file, a Relation (in the object .rels file) to link a h5 external file. - Usually this function is used to link an ExternalPartReference to a h5 file. 
def get_epc_file_folder(self) -> Optional[str]:
    """
    Give the folder of the EPC file this instance is bound to.

    :return: the result of ``get_file_folder`` applied to ``self.epc_file_path``,
        or None when no EPC path is set (None or empty string)
    """
    if not self.epc_file_path:
        return None
    return get_file_folder(self.epc_file_path)


def read_external_array(
    self,
    energyml_array: Any,
    root_obj: Optional[Any] = None,
    path_in_root: Optional[str] = None,
    use_epc_io_h5: bool = True,
) -> List[Any]:
    """
    Read the values of an external array referenced by an energyml array object.

    The reading itself is delegated to ``read_external_dataset_array``; this method only
    decides whether the in-memory HDF5 files held by the EPC are offered as extra sources.

    :param energyml_array: the energyml array object (e.g. FloatingPointExternalArray)
    :param root_obj: the root object containing the energyml_array
    :param path_in_root: the path in the root object to the energyml_array
    :param use_epc_io_h5: if True, the in-memory HDF5 files stored in ``self.h5_io_files``
        are passed as additional sources
    :return: the array read from the external datasets
    """
    in_memory_available = (
        self is not None and use_epc_io_h5 and self.h5_io_files is not None and len(self.h5_io_files)
    )
    # Concatenate onto a fresh list so self.h5_io_files is never handed out directly.
    additional_sources = ([] + self.h5_io_files) if in_memory_available else []

    return read_external_dataset_array(
        energyml_array=energyml_array,
        root_obj=root_obj,
        path_in_root=path_in_root,
        additional_sources=additional_sources,
        epc=self,
    )
def read_array(
    self,
    proxy: Union[str, Uri, Any],
    path_in_external: str,
    start_indices: Optional[List[int]] = None,
    counts: Optional[List[int]] = None,
    external_uri: Optional[str] = None,
) -> Optional[np.ndarray]:
    """
    Read a data array from external storage, with optional sub-selection.

    Candidate files are tried in order; the first handler that returns a non-None
    array wins. A handler that raises is logged at debug level and skipped.

    :param proxy: The object identifier/URI or the object itself that references the array
    :param path_in_external: Path within the external file (e.g., 'values/0')
    :param start_indices: Optional start index for each dimension (RESQML v2.2 StartIndex)
    :param counts: Optional count of elements for each dimension (RESQML v2.2 Count)
    :param external_uri: Optional URI tried before the default file paths (RESQML v2.2 URI)
    :return: The array returned by the file handler, or None if nothing could be read
    """
    # Identifiers and URIs are resolved to the actual object first.
    if isinstance(proxy, str) or isinstance(proxy, Uri):
        resolved = self.get_object_by_identifier(proxy)
    else:
        resolved = proxy

    file_paths = self.get_h5_file_paths(resolved)
    if external_uri:
        # An explicit URI takes precedence over the paths found for the object.
        file_paths.insert(0, make_path_relative_to_other_file(external_uri, self.epc_file_path))

    if not file_paths:
        file_paths = self.external_files_path

    if not file_paths:
        logging.warning(f"No external file paths found for proxy: {proxy}")
        return None

    registry = get_handler_registry()

    for file_path in file_paths:
        handler = registry.get_handler_for_file(file_path)
        if handler is None:
            logging.debug(f"No handler found for file: {file_path}")
            continue

        try:
            data = handler.read_array(file_path, path_in_external, start_indices, counts)
        except Exception as e:
            logging.debug(f"Failed to read dataset from {file_path}: {e}")
            continue

        if data is not None:
            return data

    logging.error(f"Failed to read array from any available file paths: {file_paths}")
    return None
def read_array_view(
    self,
    proxy: Union[str, Uri, Any],
    path_in_external: str,
    start_indices: Optional[List[int]] = None,
    counts: Optional[List[int]] = None,
    external_uri: Optional[str] = None,
) -> Optional[np.ndarray]:
    """
    Best-effort zero-copy variant of :meth:`read_array`.

    For each candidate file, the handler's ``read_array_view`` is used when the handler
    exposes one; otherwise the call falls back to the handler's ``read_array``.
    Unlike :meth:`read_array`, nothing is logged when no file path is available.

    :param proxy: The object identifier/URI or the object itself that references the array
    :param path_in_external: Path within the external file
    :param start_indices: Optional start index for each dimension
    :param counts: Optional count of elements for each dimension
    :param external_uri: Optional URI tried before the default file paths
    :return: The first non-None array returned by a handler, or None
    """
    if isinstance(proxy, str) or isinstance(proxy, Uri):
        resolved = self.get_object_by_identifier(proxy)
    else:
        resolved = proxy

    file_paths = self.get_h5_file_paths(resolved)
    if external_uri:
        file_paths.insert(0, make_path_relative_to_other_file(external_uri, self.epc_file_path))
    if not file_paths:
        file_paths = self.external_files_path
    if not file_paths:
        return None

    registry = get_handler_registry()
    for file_path in file_paths:
        handler = registry.get_handler_for_file(file_path)
        if handler is None:
            continue
        try:
            view_reader = getattr(handler, "read_array_view", None)
            # Handlers without a view API are read through their regular read_array.
            reader = handler.read_array if view_reader is None else view_reader
            data = reader(file_path, path_in_external, start_indices, counts)
        except Exception as e:
            logging.debug(f"Failed to read_array_view from {file_path}: {e}")
            continue
        if data is not None:
            return data
    return None
def write_array(
    self,
    proxy: Union[str, Uri, Any],
    path_in_external: str,
    array: np.ndarray,
    start_indices: Optional[List[int]] = None,
    external_uri: Optional[str] = None,
    **kwargs,
) -> bool:
    """
    Write a data array to external storage, with optional offset.

    Candidate files are tried in order and the method stops at the first handler
    reporting success. A handler that raises is logged as an error and the next
    candidate is tried.

    :param proxy: The object identifier/URI or the object itself that references the array
    :param path_in_external: Path within the external file (e.g., 'values/0')
    :param array: The numpy array to write
    :param start_indices: Optional start index for each dimension for partial writes
    :param external_uri: Optional URI tried before the default file paths (RESQML v2.2 URI)
    :param kwargs: Additional parameters forwarded unchanged to the handler's ``write_array``
    :return: True if a handler reported success, False otherwise
    """
    if isinstance(proxy, str) or isinstance(proxy, Uri):
        resolved = self.get_object_by_identifier(proxy)
    else:
        resolved = proxy

    file_paths = self.get_h5_file_paths(resolved)
    if external_uri:
        file_paths.insert(0, make_path_relative_to_other_file(external_uri, self.epc_file_path))

    if not file_paths:
        file_paths = self.external_files_path

    if not file_paths:
        logging.warning(f"No external file paths found for proxy: {proxy}")
        return False

    registry = get_handler_registry()

    for file_path in file_paths:
        handler = registry.get_handler_for_file(file_path)
        if handler is None:
            logging.debug(f"No handler found for file: {file_path}")
            continue

        try:
            written = handler.write_array(file_path, array, path_in_external, start_indices, **kwargs)
        except Exception as e:
            logging.error(f"Failed to write dataset to {file_path}: {e}")
            continue

        if written:
            return True

    logging.error(f"Failed to write array to any available file paths: {file_paths}")
    return False
def get_array_metadata(
    self,
    proxy: Union[str, Uri, Any],
    path_in_external: Optional[str] = None,
    start_indices: Optional[List[int]] = None,
    counts: Optional[List[int]] = None,
) -> Union[DataArrayMetadata, List[DataArrayMetadata], None]:
    """
    Get metadata for data array(s) through the file handlers, without calling their array readers.

    The first candidate file whose handler returns metadata decides the result.
    ``start_indices`` is copied into every returned metadata object; ``counts`` is only
    forwarded to the handler.

    :param proxy: The object identifier/URI or the object itself that references the array
    :param path_in_external: Optional specific path, forwarded to the handler
    :param start_indices: Optional start index for each dimension (RESQML v2.2 StartIndex)
    :param counts: Optional count of elements for each dimension (RESQML v2.2 Count)
    :return: a DataArrayMetadata when the handler returns a single dict, a list of them when
        it returns a list, or None if no handler produced metadata
    """

    def _as_metadata(raw):
        # One handler metadata dict -> one DataArrayMetadata.
        return DataArrayMetadata(
            path_in_resource=raw.get("path"),
            array_type=raw.get("dtype", "unknown"),
            dimensions=raw.get("shape", []),
            start_indices=start_indices,
            custom_data={"size": raw.get("size", 0)},
        )

    if isinstance(proxy, str) or isinstance(proxy, Uri):
        resolved = self.get_object_by_identifier(proxy)
    else:
        resolved = proxy

    file_paths = self.get_h5_file_paths(resolved)
    if not file_paths:
        file_paths = self.external_files_path

    if not file_paths:
        logging.warning(f"No external file paths found for proxy: {proxy}")
        return None

    registry = get_handler_registry()

    for file_path in file_paths:
        handler = registry.get_handler_for_file(file_path)
        if handler is None:
            logging.debug(f"No handler found for file: {file_path}")
            continue

        try:
            metadata_dict = handler.get_array_metadata(file_path, path_in_external, start_indices, counts)
            if metadata_dict is None:
                continue
            if isinstance(metadata_dict, list):
                return [_as_metadata(m) for m in metadata_dict]
            return _as_metadata(metadata_dict)
        except Exception as e:
            logging.debug(f"Failed to get metadata from file {file_path}: {e}")

    return None
def get_h5_file_paths(self, obj_or_id: Optional[Any] = None) -> List[str]:
    """
    Get the HDF5 file paths that may hold the external arrays of an object.

    Resolution order:

    1. ``self.force_h5_path`` when it is set (returned alone);
    2. when ``obj_or_id`` is None: the EPC file path with its extension replaced by ``.h5``
       (empty list when no EPC path is set);
    3. otherwise: the targets of the object's EXTERNAL_RESOURCE supplemental relationships
       (made relative through ``make_path_relative_to_filepath_list``), followed by every
       ``.h5`` file found in the EPC file's folder, in sorted order.

    :param obj_or_id: an energyml object, its identifier/URI, or None
    :return: list of HDF5 file paths without duplicates; a new list on every call
    """
    if self.force_h5_path is not None:
        return [self.force_h5_path]

    if obj_or_id is None:
        if not self.epc_file_path:
            return []
        # Swap only the extension: str.replace(".epc", ".h5") would also rewrite ".epc"
        # inside a folder name and would leave an upper-case ".EPC" untouched.
        return [os.path.splitext(self.epc_file_path)[0] + ".h5"]

    obj = self.get_object(obj_or_id) if isinstance(obj_or_id, (str, Uri)) else obj_or_id

    # Same lazy initialisation as compute_rels / put_object / delete_object.
    if self._rels_cache is None:
        self._rels_cache = EpcRelsCache(self, export_version=self.export_version)

    external_type = EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type()
    referenced = dict.fromkeys(
        rel.target for rel in self._rels_cache.get_supplemental_rels(obj) if rel.type_value == external_type
    )

    # A dict is used as an ordered set: explicitly referenced files stay ahead of the
    # files that are merely discovered next to the EPC.
    h5_paths = dict.fromkeys(make_path_relative_to_filepath_list(list(referenced), self.epc_file_path))

    epc_folder = self.get_epc_file_folder()
    if epc_folder is not None and os.path.isdir(epc_folder):
        for fname in sorted(os.listdir(epc_folder)):
            if fname.lower().endswith(".h5"):
                h5_paths.setdefault(os.path.join(epc_folder, fname))

    return list(h5_paths)
def get_object_as_dor(self, identifier: str, dor_qualified_type) -> Optional[Any]:
    """
    Build a DOR (Data Object Reference) for the object matching an identifier.

    When no object matches, the identifier itself is handed to ``as_dor``.

    :param identifier: identifier of the object to reference
    :param dor_qualified_type: the qualified type of the DOR (e.g. resqml22.DataObjectReference)
    :return: the value returned by ``as_dor``
    """
    found = self.get_object_by_identifier(identifier=identifier)
    return as_dor(obj_or_identifier=found or identifier, dor_qualified_type=dor_qualified_type)


def get_object_by_uuid(self, uuid: str) -> List[Any]:
    """
    Search all objects sharing a uuid.

    :param uuid: the uuid to look for
    :return: the result of ``get_by_uuid`` on the object collection
    """
    matches = self.energyml_objects.get_by_uuid(uuid)
    return matches


def get_object_by_identifier(self, identifier: Union[str, Uri]) -> Optional[Any]:
    """
    Search an object by its identifier.

    :param identifier: given by the function :func:`get_obj_identifier`, or a URI (or its str representation)
    :return: the result of ``get_by_identifier`` on the object collection
    """
    found = self.energyml_objects.get_by_identifier(identifier)
    return found


def get_object(self, identifier: Union[str, Uri]) -> Optional[Any]:
    """Alias of :meth:`get_object_by_identifier`."""
    return self.get_object_by_identifier(identifier)


def add_object(self, obj: Any) -> bool:
    """
    Add an energyml object to the EPC (delegates to ``put_object``).

    :param obj: the object to add
    :return: True when ``put_object`` returned something other than None
    """
    added_uri = self.put_object(obj)
    return added_uri is not None


def remove_object(self, identifier: Union[str, Uri]) -> None:
    """
    Remove an energyml object from the EPC by its identifier (delegates to ``delete_object``).

    :param identifier: identifier of the object to remove
    :return: None; the boolean returned by ``delete_object`` is discarded
    """
    self.delete_object(identifier)


def __len__(self) -> int:
    """Return the number of energyml objects held by the EPC."""
    return len(self.energyml_objects)


def list_objects(self, dataspace: str | None = None, object_type: str | None = None) -> List[ResourceMetadata]:
    """
    List metadata of the objects held by the EPC, optionally filtered.

    :param dataspace: optional filter. NOTE(review): the value is compared with
        ``get_obj_type(get_obj_usable_class(obj))``, which is derived from the object's class;
        confirm this is the intended meaning of "dataspace".
    :param object_type: optional filter on ``get_qualified_type_from_class(type(obj))``
    :return: one ResourceMetadata per matching object, in collection order
    """
    selected = []
    for obj in self.energyml_objects:
        if dataspace is not None and get_obj_type(get_obj_usable_class(obj)) != dataspace:
            continue
        if object_type is not None and get_qualified_type_from_class(type(obj)) != object_type:
            continue
        selected.append(
            ResourceMetadata(
                uri=str(get_obj_uri(obj)),
                uuid=get_obj_uuid(obj),
                title=get_object_attribute(obj, "citation.title") or "",
                object_type=type(obj).__name__,
                version=get_obj_version(obj),
                content_type=get_content_type_from_class(type(obj)) or "",
            )
        )
    return selected
def put_object(self, obj: Any, dataspace: str | None = None) -> str | None:
    """
    Add an energyml object to the EPC and refresh its relationships in the cache.

    The object is appended to the collection; whether an object with the same identifier
    is replaced depends on the collection's ``append`` (not visible here).

    :param obj: The energyml object to add
    :param dataspace: Unused; kept for interface compatibility
    :return: The URI of the added object, as a string
    """
    self.energyml_objects.append(obj)

    # The relationships cache is created lazily on first use.
    if self._rels_cache is None:
        self._rels_cache = EpcRelsCache(self, export_version=self.export_version)
    self._rels_cache.update_cache_for_object(obj)

    uri = get_obj_uri(obj)
    return str(uri)


def delete_object(self, identifier: Union[str, Any]) -> bool:
    """
    Delete an energyml object from the EPC and drop it from the relationships cache.

    :param identifier: value handed to :meth:`get_object_by_identifier` to find the object
    :return: True if an object was found and deleted, False otherwise
    """
    target = self.get_object_by_identifier(identifier)
    if target is None:
        return False

    self.energyml_objects.remove(target)

    if self._rels_cache is None:
        self._rels_cache = EpcRelsCache(self, export_version=self.export_version)
    self._rels_cache._remove_object_from_cache(target)
    return True


def dumps_epc_content_and_files_lists(self) -> str:
    """
    Dump the EPC content and raw files lists for debugging purposes.

    :return: a text with one "identifier (qualified type)" line per object, followed by
        one line per raw file path
    """
    content_lines = "\n".join(
        f"{get_obj_identifier(obj)} ({get_qualified_type_from_class(type(obj))})" for obj in self.energyml_objects
    )
    raw_lines = "\n".join(raw_file.path for raw_file in self.raw_files)

    return "EPC Content:\n" + content_lines + "\n\nRaw Files:\n" + raw_lines
def close(self) -> None:
    """
    Close the EPC file and release any resources.

    Nothing is held open by this implementation, so this is a no-op.
    """
    return None


# EXPORT functions


def gen_opc_content_type(self) -> Types:
    """
    Generate the :class:`Types` instance describing the content of the EPC.

    Starting from ``create_default_types()``, one :class:`Override` is appended per energyml
    object, then one per raw file for which ``in_epc_file_path_to_mime_type`` returns a
    non-empty MIME type.

    :return: the filled :class:`Types` instance
    """
    content_types = create_default_types()

    content_types.override.extend(
        Override(
            content_type=get_content_type_from_class(type(e_obj)),
            part_name=gen_energyml_object_path(e_obj, self.export_version),
        )
        for e_obj in self.energyml_objects
    )

    for rf in self.raw_files:
        mime_type = in_epc_file_path_to_mime_type(rf.path)
        if not mime_type:
            # Raw files without a known MIME type get no Override entry.
            continue
        content_types.override.append(Override(content_type=mime_type, part_name=f"{rf.path}"))

    return content_types
def export_file(
    self,
    path: Optional[str] = None,
    allowZip64: bool = True,
    force_recompute_object_rels: bool = True,
    parallel: bool = False,
) -> None:
    """
    Export the EPC to a zip file on disk, creating the parent directory when needed.

    :param path: destination file; when None, ``self.epc_file_path`` is used
    :param allowZip64: forwarded to :class:`zipfile.ZipFile`
    :param force_recompute_object_rels: forwarded to the export routine
    :param parallel: if True, export through ``_export_io_ultra_fast`` instead of ``_export_io``
    :raises ValueError: if neither ``path`` nor ``self.epc_file_path`` is set
    """
    target = self.epc_file_path if path is None else path
    if target is None:
        raise ValueError("No path provided and epc_file_path is not set")

    Path(target).parent.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(target, "w", zipfile.ZIP_DEFLATED, allowZip64=allowZip64) as zip_file:
        if not parallel:
            self._export_io(
                zip_file=zip_file, allowZip64=allowZip64, force_recompute_object_rels=force_recompute_object_rels
            )
            return
        self._export_io_ultra_fast(zip_file=zip_file, force_recompute_object_rels=force_recompute_object_rels)
def export_io(self, allowZip64: bool = True, force_recompute_object_rels: bool = True) -> BytesIO:
    """
    Export the EPC into a :class:`BytesIO` instance. The result is an 'in-memory' zip file.

    :param allowZip64: forwarded to :class:`zipfile.ZipFile`
    :param force_recompute_object_rels: forwarded to ``_export_io``
    :return: the buffer holding the zip archive, rewound to position 0
    """
    zip_buffer = BytesIO()

    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED, allowZip64=allowZip64) as zip_file:
        self._export_io(
            zip_file=zip_file, allowZip64=allowZip64, force_recompute_object_rels=force_recompute_object_rels
        )

    # Writing leaves the cursor at the end of the buffer; rewind so a plain read() by the
    # caller returns the archive instead of an empty bytes object.
    zip_buffer.seek(0)
    return zip_buffer


def _export_io(
    self, zip_file: zipfile.ZipFile, allowZip64: bool = True, force_recompute_object_rels: bool = True
) -> None:
    """
    Write every part of the EPC into an already opened zip archive.

    Parts are written in this order: core properties, energyml objects, relationships,
    raw files, content types. ``self.core_props`` is set to the default core properties
    when it is None.

    :param zip_file: the zip archive, opened for writing by the caller
    :param allowZip64: unused here (the archive is already configured by the caller);
        kept for signature compatibility
    :param force_recompute_object_rels: forwarded to ``compute_rels``
    """
    # CoreProps
    if self.core_props is None:
        self.core_props = create_default_core_properties()

    zip_file.writestr(gen_core_props_path(self.export_version), serialize_xml(self.core_props))

    # Energyml objects
    for e_obj in self.energyml_objects:
        e_path = gen_energyml_object_path(e_obj, self.export_version)
        zip_file.writestr(e_path, serialize_xml(e_obj))

    # Rels
    for rels_path, rels in self.compute_rels(force_recompute_object_rels=force_recompute_object_rels).items():
        zip_file.writestr(rels_path, serialize_xml(rels))

    # Other files
    for raw in self.raw_files:
        # Rewind first (as _export_io_ultra_fast does): a stream already consumed by a
        # previous export would otherwise be written as an empty file.
        raw.content.seek(0)
        zip_file.writestr(raw.path, raw.content.read())

    # ContentType
    zip_file.writestr(get_epc_content_type_path(), serialize_xml(self.gen_opc_content_type()))
def _export_io_ultra_fast(self, zip_file: zipfile.ZipFile, force_recompute_object_rels: bool = True) -> None:
    """
    Write every part of the EPC into an opened zip archive, serializing XML in worker processes.

    Energyml objects, relationships, core properties and content types are serialized
    through ``_parallel_xml_serialize`` in a process pool; the parent process writes each
    result into the archive as soon as it is available, then appends the raw files.
    ``self.core_props`` is set to the default core properties when it is None.

    A worker result that is an ``Exception`` instance is logged and its part is skipped.
    An exception raised by ``future.result()`` itself is not caught and aborts the export.

    :param zip_file: the zip archive, opened for writing by the caller
    :param force_recompute_object_rels: forwarded to ``compute_rels``
    """
    from concurrent.futures import ProcessPoolExecutor, as_completed

    # 1. Data preparation
    if self.core_props is None:
        self.core_props = create_default_core_properties()

    # Maps each pending serialization to its destination path: {future: path_in_zip}
    serialization_tasks = {}

    # No explicit max_workers: the executor sizes itself from the CPU count and applies the
    # Windows limit of 61 workers on its own, whereas passing cpu_count() explicitly raises
    # ValueError on Windows machines exposing more than 61 logical CPUs.
    with ProcessPoolExecutor() as executor:
        # A. Energyml objects
        for e_obj in self.energyml_objects:
            e_path = gen_energyml_object_path(e_obj, self.export_version)
            serialization_tasks[executor.submit(_parallel_xml_serialize, e_obj)] = e_path

        # B. Relationships. They are computed in the parent process (the computation works
        # on the object graph of this instance); only their serialization is parallelized.
        computed_rels = self.compute_rels(force_recompute_object_rels=force_recompute_object_rels)
        for rels_path, rels_obj in computed_rels.items():
            serialization_tasks[executor.submit(_parallel_xml_serialize, rels_obj)] = rels_path

        # C. Special parts (usually quick, but parallelized as well)
        core_path = gen_core_props_path(self.export_version)
        serialization_tasks[executor.submit(_parallel_xml_serialize, self.core_props)] = core_path

        ct_path = get_epc_content_type_path()
        serialization_tasks[executor.submit(_parallel_xml_serialize, self.gen_opc_content_type())] = ct_path

        # 2. Collect results and write them into the zip. Writing is sequential and happens
        # as serializations complete, so the entry order in the archive is not fixed.
        for future in as_completed(serialization_tasks):
            path = serialization_tasks[future]
            xml_bytes = future.result()

            if isinstance(xml_bytes, Exception):
                logging.error(f"Erreur sérialisation sur {path}: {xml_bytes}")
            else:
                zip_file.writestr(path, xml_bytes)

    # 3. Raw files: their content is already available as a stream, nothing to parallelize.
    for raw in self.raw_files:
        raw.content.seek(0)  # rewind in case the stream was already read
        zip_file.writestr(raw.path, raw.content.read())
# ==============
# Class methods
# ==============


@classmethod
def read_file(
    cls,
    epc_file_path: str,
    read_rels_from_files: bool = True,
    recompute_rels: bool = False,
    read_parallel: bool = False,
) -> "Epc":
    """
    Read an EPC file from disk.

    The whole file is loaded in memory and handed to one of the stream readers:
    ``read_stream`` by default, ``read_stream_ultra_fast`` when ``read_parallel`` is True, or
    ``read_stream_ultra_fast_v2`` when ``read_parallel`` is True and the environment variable
    ``EPC_FAST_V2`` equals "1".

    :param epc_file_path: Path to the EPC file
    :param read_rels_from_files: forwarded to the stream reader (use the .rels files of the EPC)
    :param recompute_rels: forwarded to the stream reader (recompute relationships after loading)
    :param read_parallel: If True, use a parallel stream reader
    :return: Epc instance, with ``epc_file_path`` set to the file that was read
    :raises IOError: if the stream reader returned None
    """
    with open(epc_file_path, "rb") as f:
        stream = BytesIO(f.read())

    if not read_parallel:
        reader = cls.read_stream
    elif os.environ.get("EPC_FAST_V2", "0") == "1":
        reader = cls.read_stream_ultra_fast_v2
    else:
        reader = cls.read_stream_ultra_fast

    epc = reader(stream, read_rels_from_files=read_rels_from_files, recompute_rels=recompute_rels)
    if epc is None:
        raise IOError(f"Failed to open EPC file {epc_file_path}")

    epc.epc_file_path = epc_file_path
    return epc
open EPC file {epc_file_path}") @classmethod - def read_stream(cls, epc_file_io: BytesIO): # returns an Epc instance + def read_stream( + cls, epc_file_io: BytesIO, read_rels_from_files: bool = True, recompute_rels: bool = False + ) -> Optional["Epc"]: # returns an Epc instance """ - :param epc_file_io: + Read an EPC file from a BytesIO stream. + :param epc_file_io: BytesIO containing the EPC file + :param read_rels_from_files: If True, populate cache from energyml.utils.rels files in the EPC + :param recompute_rels: If True, recompute all relationships after loading :return: an :class:`EPC` instance """ + print("Reading EPC file seq...") try: _read_files = [] obj_list = [] raw_file_list = [] - additional_rels = {} + # additional_rels = {} core_props = None + # Store rels files separately for potential cache population + rels_files_to_load = {} # {obj_path: Relationships} + path_to_obj = {} + with zipfile.ZipFile(epc_file_io, "r", zipfile.ZIP_DEFLATED) as epc_file: content_type_file_name = get_epc_content_type_path() content_type_info = None @@ -674,7 +1654,6 @@ def read_stream(cls, epc_file_io: BytesIO): # returns an Epc instance logging.error(f"No {content_type_file_name} file found") else: content_type_obj: Types = read_energyml_xml_bytes(epc_file.read(content_type_file_name)) - path_to_obj = {} for ov in content_type_obj.override: ov_ct = ov.content_type ov_path = ov.part_name @@ -724,14 +1703,11 @@ def read_stream(cls, epc_file_io: BytesIO): # returns an Epc instance # RELS FILES READING START # logging.debug(f"reading rels {f_info.filename}") - ( - rels_folder, - rels_file_name, - ) = get_file_folder_and_name_from_path(f_info.filename) - while rels_folder.endswith("/"): - rels_folder = rels_folder[:-1] - obj_folder = rels_folder[: rels_folder.rindex("/") + 1] if "/" in rels_folder else "" - obj_file_name = rels_file_name[:-5] # removing the ".rels" + rels_path = Path(f_info.filename) + obj_folder = ( + str(rels_path.parent.parent) + "/" if 
str(rels_path.parent.parent) != "." else "" + ) + obj_file_name = rels_path.stem # removing the ".rels" rels_file: Relationships = read_energyml_xml_bytes( epc_file.read(f_info.filename), Relationships, @@ -739,18 +1715,24 @@ def read_stream(cls, epc_file_io: BytesIO): # returns an Epc instance obj_path = obj_folder + obj_file_name if obj_path in path_to_obj: try: - additional_rels_key = get_obj_identifier(path_to_obj[obj_path]) - for rel in rels_file.relationship: - # logging.debug(f"\t\t{rel.type_value}") - if ( - rel.type_value != EPCRelsRelationshipType.DESTINATION_OBJECT.get_type() - and rel.type_value != EPCRelsRelationshipType.SOURCE_OBJECT.get_type() - and rel.type_value - != EPCRelsRelationshipType.EXTENDED_CORE_PROPERTIES.get_type() - ): # not a computable relation - if additional_rels_key not in additional_rels: - additional_rels[additional_rels_key] = [] - additional_rels[additional_rels_key].append(rel) + + # Store all rels for potential cache population + if read_rels_from_files: + rels_files_to_load[obj_path] = rels_file + + # additional_rels_key = get_obj_identifier(path_to_obj[obj_path]) + # # Keep only non-computable rels in additional_rels (legacy support) + # for rel in rels_file.relationship: + # # logging.debug(f"\t\t{rel.type_value}") + # if ( + # rel.type_value != EPCRelsRelationshipType.DESTINATION_OBJECT.get_type() + # and rel.type_value != EPCRelsRelationshipType.SOURCE_OBJECT.get_type() + # and rel.type_value + # != EPCRelsRelationshipType.EXTENDED_CORE_PROPERTIES.get_type() + # ): # not a computable relation + # if additional_rels_key not in additional_rels: + # additional_rels[additional_rels_key] = [] + # additional_rels[additional_rels_key].append(rel) except AttributeError: logging.error(traceback.format_exc()) pass # 'CoreProperties' object has no attribute 'object_version' @@ -764,74 +1746,207 @@ def read_stream(cls, epc_file_io: BytesIO): # returns an Epc instance f" of a lack of a dependency module) " ) - return Epc( - 
energyml_objects=obj_list, + epc = Epc( + energyml_objects=EnergymlObjectCollection(obj_list), raw_files=raw_file_list, core_props=core_props, - additional_rels=additional_rels, + # additional_rels=additional_rels, ) + + # Populate rels cache from loaded rels files if requested + if read_rels_from_files and rels_files_to_load: + for obj_path, rels_file in rels_files_to_load.items(): + if obj_path in path_to_obj: + obj = path_to_obj[obj_path] + # Only set rels for energyml objects (skip CoreProperties and other OPC objects) + if obj in obj_list: + epc._rels_cache.set_rels_from_file(obj, rels_file) + + # Recompute relationships if requested + if recompute_rels: + epc._rels_cache.recompute_cache() + + return epc except zipfile.BadZipFile as error: logging.error(error) return None - def list_objects(self, dataspace: str | None = None, object_type: str | None = None) -> List[ResourceMetadata]: - result = [] - for obj in self.energyml_objects: - if (dataspace is None or get_obj_type(get_obj_usable_class(obj)) == dataspace) and ( - object_type is None or get_qualified_type_from_class(type(obj)) == object_type - ): - res_meta = ResourceMetadata( - uri=str(get_obj_uri(obj)), - uuid=get_obj_uuid(obj), - title=get_object_attribute(obj, "citation.title") or "", - object_type=type(obj).__name__, - version=get_obj_version(obj), - content_type=get_content_type_from_class(type(obj)) or "", - ) - result.append(res_meta) - return result + @classmethod + def read_stream_ultra_fast( + cls, epc_file_io: BytesIO, read_rels_from_files: bool = True, recompute_rels: bool = False + ) -> Optional["Epc"]: + from concurrent.futures import ProcessPoolExecutor, as_completed + import multiprocessing + + print("Reading EPC file parrallel v1...") + + obj_to_process = {} + rels_to_process = {} + raw_files = [] + core_props = None + + # 1. 
Lecture rapide et extraction des bytes du ZIP + with zipfile.ZipFile(epc_file_io, "r") as epc_file: + ct_path = get_epc_content_type_path() + content_type_obj = read_energyml_xml_bytes(epc_file.read(ct_path)) + + # Identification des types via le ContentTypes + energyml_paths = {} + for ov in content_type_obj.override: + path = ov.part_name.lstrip("/\\") + if is_energyml_content_type(ov.content_type): + energyml_paths[path] = ov.content_type + elif get_class_from_content_type(ov.content_type) == CoreProperties: + core_props = read_energyml_xml_bytes(epc_file.read(path), CoreProperties) + + # Extraction des contenus bruts + for info in epc_file.infolist(): + fname = info.filename + if fname in energyml_paths: + obj_to_process[fname] = (epc_file.read(fname), energyml_paths[fname]) + elif read_rels_from_files and fname.lower().endswith(".rels") and fname != "_rels/.rels": + rels_to_process[fname] = epc_file.read(fname) + elif ( + not fname.lower().endswith(".rels") + and not fname.lower().endswith(gen_core_props_path().lower()) + and fname not in energyml_paths + and fname != ct_path + ): + raw_files.append(RawFile(path=fname, content=BytesIO(epc_file.read(fname)))) + + # 2. Exécution Parallèle (Objets ET Rels) + path_to_obj = {} + obj_list = [] + rels_content_map = {} # {obj_path: Relationships_Object} + + cpus = multiprocessing.cpu_count() + with ProcessPoolExecutor(max_workers=cpus) as executor: + # A. On lance les objets + obj_futures = { + executor.submit(_parallel_xml_read, data, ct): path for path, (data, ct) in obj_to_process.items() + } + + # B. On lance les rels + rel_futures = { + executor.submit(_parallel_rels_read, r_data): r_path for r_path, r_data in rels_to_process.items() + } + + # C. Récupération des objets + for future in as_completed(obj_futures): + path = obj_futures[future] + res = future.result() + if not isinstance(res, Exception): + path_to_obj[path] = res + obj_list.append(res) + else: + logging.error(f"Erreur objet {path}: {res}") + + # D. 
Récupération des rels + for future in as_completed(rel_futures): + r_path = rel_futures[future] + res = future.result() + if not isinstance(res, Exception): + # Mapping rel_path -> obj_path + o_path = str(Path(r_path).parent.parent / Path(r_path).stem).replace("\\", "/") + rels_content_map[o_path] = res + else: + logging.error(f"Erreur rels {r_path}: {res}") - def put_object(self, obj: Any, dataspace: str | None = None) -> str | None: - if self.add_object(obj): - return str(get_obj_uri(obj)) - return None + # 3. Assemblage final dans le processus parent + epc = Epc(energyml_objects=EnergymlObjectCollection(obj_list), raw_files=raw_files, core_props=core_props) - def delete_object(self, identifier: Union[str, Any]) -> bool: - obj = self.get_object_by_identifier(identifier) - if obj is not None: - self.remove_object(identifier) - return True - return False + if read_rels_from_files: + for obj_path, rels_obj in rels_content_map.items(): + if obj_path in path_to_obj: + target_obj = path_to_obj[obj_path] + epc._rels_cache.set_rels_from_file(target_obj, rels_obj) - def get_array_metadata( - self, proxy: str | Uri | Any, path_in_external: str | None = None - ) -> DataArrayMetadata | List[DataArrayMetadata] | None: - array = self.read_array(proxy=proxy, path_in_external=path_in_external) - if array is not None: - if isinstance(array, np.ndarray): - return DataArrayMetadata.from_numpy_array(path_in_resource=path_in_external, array=array) - elif isinstance(array, list): - return DataArrayMetadata.from_list(path_in_resource=path_in_external, data=array) + if recompute_rels: + epc._rels_cache.recompute_cache() - def dumps_epc_content_and_files_lists(self) -> str: - """ - Dumps the EPC content and files lists for debugging purposes. - :return: A string representation of the EPC content and files lists. 
- """ - content_list = [ - f"{get_obj_identifier(obj)} ({get_qualified_type_from_class(type(obj))})" for obj in self.energyml_objects - ] - raw_files_list = [raw_file.path for raw_file in self.raw_files] + return epc - return "EPC Content:\n" + "\n".join(content_list) + "\n\nRaw Files:\n" + "\n".join(raw_files_list) + @classmethod + def read_stream_ultra_fast_v2( + cls, epc_file_io: BytesIO, read_rels_from_files: bool = True, recompute_rels: bool = False + ) -> Optional["Epc"]: + from concurrent.futures import ThreadPoolExecutor # Passage au ThreadPool + + print("Reading EPC file parrallel v2...") + + obj_list = [] + path_to_obj = {} + rels_content_map = {} + raw_files = [] + core_props = None + + # On utilise un ThreadPool pour éviter le coût de sérialisation Pickle + # lxml libère le GIL, donc c'est très efficace + with ThreadPoolExecutor() as executor: + futures = [] + + with zipfile.ZipFile(epc_file_io, "r") as epc_file: + # On récupère l'index d'abord + ct_path = get_epc_content_type_path() + content_type_obj = read_energyml_xml_bytes(epc_file.read(ct_path)) + + # Identification des types via le ContentTypes + energyml_paths = {} + for ov in content_type_obj.override: + path = ov.part_name.lstrip("/\\") + if is_energyml_content_type(ov.content_type): + energyml_paths[path] = ov.content_type + elif get_class_from_content_type(ov.content_type) == CoreProperties: + core_props = read_energyml_xml_bytes(epc_file.read(path), CoreProperties) + + for info in epc_file.infolist(): + fname = info.filename + + # STREAMING : On lance la tâche dès qu'on a les bytes + if fname in energyml_paths: + data = epc_file.read(fname) + f = executor.submit(_parallel_xml_read, data, energyml_paths[fname]) + futures.append((f, "OBJ", fname)) + + elif read_rels_from_files and fname.lower().endswith(".rels"): + data = epc_file.read(fname) + f = executor.submit(_parallel_rels_read, data) + futures.append((f, "REL", fname)) + elif ( + not fname.lower().endswith(".rels") + and not 
fname.lower().endswith(gen_core_props_path().lower()) + and fname not in energyml_paths + and fname != ct_path + ): + raw_files.append(RawFile(path=fname, content=BytesIO(epc_file.read(fname)))) + + # 2. Récupération des résultats (pendant que le ZIP continue d'être lu si possible) + for future, kind, path in futures: + res = future.result() + if isinstance(res, Exception): + continue + + if kind == "OBJ": + path_to_obj[path] = res + obj_list.append(res) + else: + o_path = str(Path(path).parent.parent / Path(path).stem).replace("\\", "/") + rels_content_map[o_path] = res - def close(self) -> None: - """ - Close the EPC file and release any resources. - :return: - """ - pass + # 3. Assemblage final dans le processus parent + epc = Epc(energyml_objects=EnergymlObjectCollection(obj_list), raw_files=raw_files, core_props=core_props) + + if read_rels_from_files: + for obj_path, rels_obj in rels_content_map.items(): + if obj_path in path_to_obj: + target_obj = path_to_obj[obj_path] + epc._rels_cache.set_rels_from_file(target_obj, rels_obj) # type: ignore + + if recompute_rels: + epc._rels_cache.recompute_cache() # type: ignore + + return epc # ______ __ ____ __ _ @@ -841,343 +1956,44 @@ def close(self) -> None: # /_____/_/ /_/\___/_/ \__, /\__, /_/ /_/ /_/_/ /_/ \__,_/_/ /_/\___/\__/_/\____/_/ /_/____/ # /____//____/ -""" -PropertyKind list: a list of Pre-defined properties -""" -__CACHE_PROP_KIND_DICT__ = {} - - -def update_prop_kind_dict_cache(): - prop_kind = get_property_kind_dict_path_as_dict() - - for prop in prop_kind["PropertyKind"]: - __CACHE_PROP_KIND_DICT__[prop["Uuid"]] = read_energyml_json_str(json.dumps(prop))[0] - - -def get_property_kind_by_uuid(uuid: str) -> Optional[Any]: - """ - Get a property kind by its uuid. 
- :param uuid: the uuid of the property kind - :return: the property kind or None if not found - """ - if len(__CACHE_PROP_KIND_DICT__) == 0: - # update the cache to check if it is a - try: - update_prop_kind_dict_cache() - except FileNotFoundError as e: - logging.error(f"Failed to parse propertykind dict {e}") - return __CACHE_PROP_KIND_DICT__.get(uuid, None) - - -def get_property_kind_and_parents(uuids: list) -> Dict[str, Any]: - """Get PropertyKind objects and their parents from a list of UUIDs. - - Args: - uuids (list): List of PropertyKind UUIDs. - - Returns: - Dict[str, Any]: A dictionary mapping UUIDs to PropertyKind objects and their parents. - """ - dict_props: Dict[str, Any] = {} - - for prop_uuid in uuids: - prop = get_property_kind_by_uuid(prop_uuid) - if prop is not None: - dict_props[prop_uuid] = prop - parent_uuid = get_object_attribute(prop, "parent.uuid") - if parent_uuid is not None and parent_uuid not in dict_props: - dict_props = get_property_kind_and_parents([parent_uuid]) | dict_props - else: - logging.warning(f"PropertyKind with UUID {prop_uuid} not found.") - continue - return dict_props - - -def as_dor(obj_or_identifier: Any, dor_qualified_type: str = "eml23.DataObjectReference"): - """ - Create an DOR from an object to target the latter. - :param obj_or_identifier: - :param dor_qualified_type: the qualified type of the DOR (e.g. 
"eml23.DataObjectReference" is the default value) - :return: - """ - dor = None - if obj_or_identifier is not None: - cls = get_class_from_qualified_type(dor_qualified_type) - dor = cls() - if isinstance(obj_or_identifier, str): # is an identifier or uri - parsed_uri = parse_uri(obj_or_identifier) - if parsed_uri is not None: - print(f"====> parsed uri {parsed_uri} : uuid is {parsed_uri.uuid}") - if hasattr(dor, "qualified_type"): - set_attribute_from_path(dor, "qualified_type", parsed_uri.get_qualified_type()) - if hasattr(dor, "content_type"): - set_attribute_from_path( - dor, "content_type", qualified_type_to_content_type(parsed_uri.get_qualified_type()) - ) - set_attribute_from_path(dor, "uuid", parsed_uri.uuid) - set_attribute_from_path(dor, "uid", parsed_uri.uuid) - if hasattr(dor, "object_version"): - set_attribute_from_path(dor, "object_version", parsed_uri.version) - if hasattr(dor, "version_string"): - set_attribute_from_path(dor, "version_string", parsed_uri.version) - if hasattr(dor, "energistics_uri"): - set_attribute_from_path(dor, "energistics_uri", obj_or_identifier) - - else: # identifier - if len(__CACHE_PROP_KIND_DICT__) == 0: - # update the cache to check if it is a - try: - update_prop_kind_dict_cache() - except FileNotFoundError as e: - logging.error(f"Failed to parse propertykind dict {e}") - try: - uuid, version = split_identifier(obj_or_identifier) - if uuid in __CACHE_PROP_KIND_DICT__: - return as_dor(__CACHE_PROP_KIND_DICT__[uuid]) - else: - set_attribute_from_path(dor, "uuid", uuid) - set_attribute_from_path(dor, "uid", uuid) - set_attribute_from_path(dor, "ObjectVersion", version) - except AttributeError: - logging.error(f"Failed to parse identifier {obj_or_identifier}. 
DOR will be empty") - else: - if is_dor(obj_or_identifier): - # If it is a dor, we create a dor conversionif hasattr(dor, "qualified_type"): - if hasattr(dor, "qualified_type"): - if hasattr(obj_or_identifier, "qualified_type"): - dor.qualified_type = get_object_attribute(obj_or_identifier, "qualified_type") - elif hasattr(obj_or_identifier, "content_type"): - dor.qualified_type = content_type_to_qualified_type( - get_object_attribute(obj_or_identifier, "content_type") - ) - - if hasattr(dor, "content_type"): - if hasattr(obj_or_identifier, "qualified_type"): - dor.content_type = qualified_type_to_content_type( - get_object_attribute(obj_or_identifier, "qualified_type") - ) - elif hasattr(obj_or_identifier, "content_type"): - dor.content_type = get_object_attribute(obj_or_identifier, "content_type") - - set_attribute_from_path(dor, "title", get_object_attribute(obj_or_identifier, "Title")) - set_attribute_from_path(dor, "uuid", get_obj_uuid(obj_or_identifier)) - set_attribute_from_path(dor, "uid", get_obj_uuid(obj_or_identifier)) - if hasattr(dor, "object_version"): - set_attribute_from_path(dor, "object_version", get_obj_version(obj_or_identifier)) - if hasattr(dor, "version_string"): - set_attribute_from_path(dor, "version_string", get_obj_version(obj_or_identifier)) - - else: - - # for etp Resource object: - if hasattr(obj_or_identifier, "uri"): - dor = as_dor(obj_or_identifier.uri, dor_qualified_type) - if hasattr(obj_or_identifier, "name"): - set_attribute_from_path(dor, "title", getattr(obj_or_identifier, "name")) - else: - if hasattr(dor, "qualified_type"): - try: - set_attribute_from_path( - dor, "qualified_type", get_qualified_type_from_class(obj_or_identifier) - ) - except Exception as e: - logging.error(f"Failed to set qualified_type for DOR {e}") - if hasattr(dor, "content_type"): - try: - set_attribute_from_path(dor, "content_type", get_content_type_from_class(obj_or_identifier)) - except Exception as e: - logging.error(f"Failed to set content_type for 
DOR {e}") - - set_attribute_from_path(dor, "title", get_object_attribute(obj_or_identifier, "Citation.Title")) - - set_attribute_from_path(dor, "uuid", get_obj_uuid(obj_or_identifier)) - set_attribute_from_path(dor, "uid", get_obj_uuid(obj_or_identifier)) - if hasattr(dor, "object_version"): - set_attribute_from_path(dor, "object_version", get_obj_version(obj_or_identifier)) - if hasattr(dor, "version_string"): - set_attribute_from_path(dor, "version_string", get_obj_version(obj_or_identifier)) - - return dor - - -def create_energyml_object( - content_or_qualified_type: str, - citation: Optional[Any] = None, - uuid: Optional[str] = None, -): - """ - Create an energyml object instance depending on the content-type or qualified-type given in parameter. - The SchemaVersion is automatically assigned. - If no citation is given default one will be used. - If no uuid is given, a random uuid will be used. - :param content_or_qualified_type: - :param citation: - :param uuid: - :return: - """ - if citation is None: - citation = { - "title": "New_Object", - "Creation": epoch_to_date(epoch()), - "LastUpdate": epoch_to_date(epoch()), - "Format": "energyml-utils", - "Originator": "energyml-utils python module", - } - cls = get_class_from_qualified_type(content_or_qualified_type) - obj = cls() - cit = get_obj_attribute_class(cls, "citation")() - copy_attributes( - obj_in=citation, - obj_out=cit, - only_existing_attributes=True, - ignore_case=True, - ) - set_attribute_from_path(obj, "citation", cit) - set_attribute_value(obj, "uuid", uuid or gen_uuid()) - set_attribute_value(obj, "SchemaVersion", get_class_pkg_version(obj)) - - return obj - - -def create_external_part_reference( - eml_version: str, - h5_file_path: str, - citation: Optional[Any] = None, - uuid: Optional[str] = None, -): - """ - Create an EpcExternalPartReference depending on the energyml version (should be ["2.0", "2.1", "2.2"]). - The MimeType, ExistenceKind and Filename will be automatically filled. 
- :param eml_version: - :param h5_file_path: - :param citation: - :param uuid: - :return: - """ - version_flat = OptimizedRegex.DOMAIN_VERSION.findall(eml_version)[0][0].replace(".", "").replace("_", "") - obj = create_energyml_object( - content_or_qualified_type="eml" + version_flat + ".EpcExternalPartReference", - citation=citation, - uuid=uuid, - ) - set_attribute_value(obj, "MimeType", MimeType.HDF5.value) - set_attribute_value(obj, "ExistenceKind", "Actual") - set_attribute_value(obj, "Filename", h5_file_path) - - return obj - - -def get_reverse_dor_list(obj_list: List[Any], key_func: Callable = get_obj_identifier) -> Dict[str, List[Any]]: - """ - Compute a dict with 'OBJ_UUID.OBJ_VERSION' as Key, and list of DOR that reference it. - If the object version is None, key is 'OBJ_UUID.' - :param obj_list: - :param key_func: a callable to create the key of the dict from the object instance - :return: str - """ - rels = {} - for obj in obj_list: - for dor in search_attribute_matching_type(obj, "DataObjectReference", return_self=False): - key = key_func(dor) - if key not in rels: - rels[key] = [] - rels[key] = rels.get(key, []) + [obj] - return rels - - -# PATHS - - -def gen_core_props_path( - export_version: EpcExportVersion = EpcExportVersion.CLASSIC, -): - return "docProps/core.xml" - - -def gen_energyml_object_path( - energyml_object: Union[str, Any], - export_version: EpcExportVersion = EpcExportVersion.CLASSIC, -): - """ - Generate a path to store the :param:`energyml_object` into an epc file (depending on the :param:`export_version`) - :param energyml_object: - :param export_version: - :return: - """ - if isinstance(energyml_object, str): - energyml_object = read_energyml_xml_str(energyml_object) - - obj_type = get_object_type_for_file_path_from_class(energyml_object.__class__) - # logging.debug("is_dor: ", str(is_dor(energyml_object)), "object type : " + str(obj_type)) - - if is_dor(energyml_object): - uuid, pkg, pkg_version, obj_cls, object_version = 
get_dor_obj_info(energyml_object) - obj_type = get_object_type_for_file_path_from_class(obj_cls) - else: - pkg = get_class_pkg(energyml_object) - pkg_version = get_class_pkg_version(energyml_object) - object_version = get_obj_version(energyml_object) - uuid = get_obj_uuid(energyml_object) - - if export_version == EpcExportVersion.EXPANDED: - return f"namespace_{pkg}{pkg_version.replace('.', '')}/{(('version_' + object_version + '/') if object_version is not None and len(object_version) > 0 else '')}{obj_type}_{uuid}.xml" - else: - return obj_type + "_" + uuid + ".xml" - - -def get_file_folder_and_name_from_path(path: str) -> Tuple[str, str]: - """ - Returns a tuple (FOLDER_PATH, FILE_NAME) - :param path: - :return: - """ - obj_folder = path[: path.rindex("/") + 1] if "/" in path else "" - obj_file_name = path[path.rindex("/") + 1 :] if "/" in path else path - return obj_folder, obj_file_name +# Backward compatibility: re-export functions that were moved to epc_utils +# This allows existing code that imports these functions from epc.py to continue working +from energyml.utils.epc_utils import ( + create_default_core_properties, + create_default_types, + create_external_relationship, + gen_rels_path_from_obj_path, + get_dor_or_external_uris_from_obj, + get_dor_uris_from_obj, + get_epc_content_type_rels_path, + get_rels_dor_type, + in_epc_file_path_to_mime_type, + is_core_prop_or_extension_path, + update_prop_kind_dict_cache, + get_property_kind_by_uuid, + get_property_kind_and_parents, + as_dor, + create_energyml_object, + create_external_part_reference, + get_reverse_dor_list, + get_file_folder_and_name_from_path, +) +# Also export the cache dict for backward compatibility +from energyml.utils.epc_utils import __CACHE_PROP_KIND_DICT__ -def gen_rels_path( - energyml_object: Any, - export_version: EpcExportVersion = EpcExportVersion.CLASSIC, -) -> str: - """ - Generate a path to store the :param:`energyml_object` rels file into an epc file - (depending on the 
:param:`export_version`) - :param energyml_object: - :param export_version: - :return: - """ - if isinstance(energyml_object, CoreProperties): - return f"{RELS_FOLDER_NAME}/.rels" - else: - obj_path = gen_energyml_object_path(energyml_object, export_version) - obj_folder, obj_file_name = get_file_folder_and_name_from_path(obj_path) - return f"{obj_folder}{RELS_FOLDER_NAME}/{obj_file_name}.rels" +__all__ = [ + "Epc", + "update_prop_kind_dict_cache", + "get_property_kind_by_uuid", + "get_property_kind_and_parents", + "as_dor", + "create_energyml_object", + "create_external_part_reference", + "get_reverse_dor_list", + "get_file_folder_and_name_from_path", + "__CACHE_PROP_KIND_DICT__", +] # def gen_rels_path_from_dor(dor: Any, export_version: EpcExportVersion = EpcExportVersion.CLASSIC) -> str: - - -def get_epc_content_type_path( - export_version: EpcExportVersion = EpcExportVersion.CLASSIC, -) -> str: - """ - Generate a path to store the "[Content_Types].xml" file into an epc file - (depending on the :param:`export_version`) - :return: - """ - return "[Content_Types].xml" - - -def create_h5_external_relationship(h5_path: str, current_idx: int = 0) -> Relationship: - """ - Create a Relationship object to link an external HDF5 file. - :param h5_path: - :return: - """ - return Relationship( - target=h5_path, - type_value=EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(), - id=f"Hdf5File{current_idx + 1 if current_idx > 0 else ''}", - target_mode=TargetMode.EXTERNAL, - ) diff --git a/energyml-utils/src/energyml/utils/epc_stream.py b/energyml-utils/src/energyml/utils/epc_stream.py index 6c8686a..5d17dd0 100644 --- a/energyml-utils/src/energyml/utils/epc_stream.py +++ b/energyml-utils/src/energyml/utils/epc_stream.py @@ -8,44 +8,104 @@ content into memory at once. 
""" +import atexit +from datetime import datetime import tempfile +import traceback +import numpy as np import shutil import logging import os +import re import zipfile +from enum import Enum from contextlib import contextmanager from dataclasses import dataclass from pathlib import Path -from typing import Dict, List, Optional, Any, Iterator, Union, Tuple, TypedDict +from typing import Dict, List, Optional, Any, Iterator, Set, Union, Tuple, TypedDict from weakref import WeakValueDictionary -from energyml.opc.opc import Types, Override, CoreProperties, Relationships, Relationship -from energyml.utils.data.datasets_io import HDF5FileReader, HDF5FileWriter -from energyml.utils.storage_interface import DataArrayMetadata, EnergymlStorageInterface, ResourceMetadata -from energyml.utils.uri import Uri, parse_uri -import h5py -import numpy as np +from energyml.opc.opc import ( + Types, + Override, + CoreProperties, + Relationships, + Relationship, +) +from energyml.utils.data.datasets_io import ( + # FileCacheManager, + get_handler_registry, +) +from energyml.utils.epc_utils import ( + EXPANDED_EXPORT_FOLDER_PREFIX, + create_default_core_properties, + create_default_types, + create_mandatory_structure_epc, + extract_uuid_and_version_from_obj_path, + gen_core_props_rels_path, + gen_rels_path_from_obj_path, + get_dor_uris_from_obj, + get_rels_dor_type, + in_epc_file_path_to_mime_type, + is_core_prop_or_extension_path, + repair_epc_structure_if_not_valid, + get_file_folder, +) +from energyml.utils.storage_interface import ( + DataArrayMetadata, + EnergymlStorageInterface, + ResourceMetadata, + create_resource_metadata_from_uri, +) +from energyml.utils.uri import Uri, create_uri_from_content_type_or_qualified_type from energyml.utils.constants import ( EPCRelsRelationshipType, - OptimizedRegex, EpcExportVersion, - content_type_to_qualified_type, + MimeType, + OptimizedRegex, + date_to_datetime, +) +from energyml.utils.epc_utils import ( + gen_energyml_object_path, + 
get_epc_content_type_path, + gen_core_props_path, + make_path_relative_to_filepath_list, + make_path_relative_to_other_file, + as_identifier, ) -from energyml.utils.epc import Epc, gen_energyml_object_path, gen_rels_path, get_epc_content_type_path from energyml.utils.introspection import ( get_class_from_content_type, - get_obj_content_type, + get_content_type_from_class, get_obj_identifier, - get_obj_uuid, - get_object_type_for_file_path_from_class, + get_obj_title, + get_obj_uri, + get_object_attribute_advanced, get_direct_dor_list, get_obj_type, get_obj_usable_class, + gen_uuid, ) from energyml.utils.serialization import read_energyml_xml_bytes, serialize_xml -from .xml import is_energyml_content_type -from enum import Enum + +from energyml.utils.xml_utils import is_energyml_content_type + + +def get_dor_identifiers_from_obj(obj: Any) -> Set[str]: + """Get identifiers of all Data Object References (DORs) directly referenced by the given object.""" + identifiers = set() + try: + dor_list = get_direct_dor_list(obj) + for dor in dor_list: + try: + identifier = get_obj_identifier(dor) + if identifier: + identifiers.add(identifier) + except Exception as e: + logging.warning(f"Failed to extract identifier from DOR: {e}") + except Exception as e: + logging.warning(f"Failed to get DOR list from object: {e}") + return identifiers class RelsUpdateMode(Enum): @@ -72,17 +132,53 @@ class RelsUpdateMode(Enum): class EpcObjectMetadata: """Metadata for an object in the EPC file.""" - uuid: str - object_type: str - content_type: str - file_path: str - identifier: Optional[str] = None - version: Optional[str] = None + uri: Uri + + title: Optional[str] = None + custom_data: Optional[Dict[str, Any]] = None + last_changed: Optional[datetime] = None def __post_init__(self): - if self.identifier is None: - # Generate identifier if not provided - object.__setattr__(self, "identifier", f"{self.uuid}.{self.version or ''}") + if not self.uri.is_object_uri(): + raise ValueError(f"URI must 
be an object URI: {self.uri}") + + @property + def uuid(self) -> str: + return self.uri.uuid # type: ignore Guaranteed to be non-None for object URIs due to __post_init__ validation + + @property + def object_type(self) -> str: + return self.uri.object_type # type: ignore Guaranteed to be non-None for object URIs due to __post_init__ validation + + @property + def content_type(self) -> str: + return self.uri.get_content_type() + + @property + def qualified_type(self) -> str: + return self.uri.get_qualified_type() + + @property + def version(self) -> Optional[str]: + return self.uri.version + + @property + def identifier(self) -> str: + return self.uri.as_identifier() + + def file_path(self, export_version: EpcExportVersion) -> str: + return gen_energyml_object_path(self.uri, export_version=export_version) + + def rels_path(self, export_version: EpcExportVersion) -> str: + return gen_rels_path_from_obj_path(self.file_path(export_version=export_version)) + + def __str__(self): + return str(self.uri) + + def to_resource_metadata(self) -> ResourceMetadata: + return create_resource_metadata_from_uri( + self.uri, title=self.title, custom_data=self.custom_data, last_changed=self.last_changed + ) @dataclass @@ -120,16 +216,18 @@ class _WorkerResult(TypedDict): """Type definition for parallel worker function return value.""" identifier: str + file_path: str object_type: str - source_rels: List[Dict[str, str]] - dor_targets: List[Tuple[str, str]] + referenced_objects: List[Tuple[str, str]] # List of (target_identifier, target_type) -def _process_object_for_rels_worker(args: Tuple[str, str, Dict[str, EpcObjectMetadata]]) -> Optional[_WorkerResult]: +def process_object_for_rels_worker( + args: Tuple[str, str, Dict[str, EpcObjectMetadata]], export_version: EpcExportVersion +) -> Optional[_WorkerResult]: """ Worker function for parallel relationship processing (runs in separate process). 
- This function is executed in a separate process to compute SOURCE relationships + This function is executed in a separate process to compute DESTINATION relationships for a single object. It bypasses Python's GIL for CPU-intensive XML parsing. Performance characteristics: @@ -142,74 +240,57 @@ def _process_object_for_rels_worker(args: Tuple[str, str, Dict[str, EpcObjectMet - identifier: Object UUID/identifier to process - epc_file_path: Absolute path to the EPC file - metadata_dict: Dictionary of all object metadata (for validation) + export_version: Version of EPC export format to use Returns: - Dictionary conforming to _WorkerResult TypedDict, or None if processing fails. + Dictionary conforming to _WorkerResult TypedDict with the following keys: + - 'identifier': The identifier of the processed object + - 'file_path': The file path of the object within the EPC archive + - 'object_type': The type of the object (e.g., 'BoundaryFeature', 'TriangulatedSetRepresentation') + - 'referenced_objects': List of tuples (target_identifier, target_type) for all + Data Object References (DORs) found in this object that exist in the EPC + Returns None if processing fails (e.g., object not found, parsing error). 
""" identifier, epc_file_path, metadata_dict = args try: # Open ZIP file in this worker process - import zipfile - from energyml.utils.serialization import read_energyml_xml_bytes - from energyml.utils.introspection import ( - get_direct_dor_list, - get_obj_identifier, - get_obj_type, - get_obj_usable_class, - ) - from energyml.utils.constants import EPCRelsRelationshipType - from energyml.utils.introspection import get_class_from_content_type - metadata = metadata_dict.get(identifier) if not metadata: return None # Load object from ZIP with zipfile.ZipFile(epc_file_path, "r") as zf: - obj_data = zf.read(metadata.file_path) + obj_data = zf.read(metadata.file_path(export_version=export_version)) obj_class = get_class_from_content_type(metadata.content_type) obj = read_energyml_xml_bytes(obj_data, obj_class) - # Extract object type (cached to avoid reloading in Phase 3) - obj_type = get_obj_type(get_obj_usable_class(obj)) + # Extract this object's type from metadata (no need to parse object) + obj_type = metadata.object_type - # Get all Data Object References (DORs) from this object - data_object_references = get_direct_dor_list(obj) + # Get all DOR URIs - URIs contain all necessary info (type, uuid, version) + dor_uris = get_dor_uris_from_obj(obj) - # Build SOURCE relationships and track referenced objects - source_rels = [] - dor_targets = [] # Track (target_id, target_type) for reverse references - - for dor in data_object_references: + # Build list of (target_identifier, target_type) tuples from URIs + referenced_objects = [] + for uri in dor_uris: try: - target_identifier = get_obj_identifier(dor) - if target_identifier not in metadata_dict: - continue - - target_metadata = metadata_dict[target_identifier] - - # Extract target type (needed for relationship ID) - target_type = get_obj_type(get_obj_usable_class(dor)) - dor_targets.append((target_identifier, target_type)) - - # Serialize relationship as dict (Relationship objects aren't picklable) - rel_dict = { - 
"target": target_metadata.file_path, - "type_value": EPCRelsRelationshipType.SOURCE_OBJECT.get_type(), - "id": f"_{identifier}_{target_type}_{target_identifier}", - } - source_rels.append(rel_dict) - + target_identifier = uri.as_identifier() + # Only include if target exists in metadata + if target_identifier and target_identifier in metadata_dict: + # Extract type directly from URI (no need to load target object) + target_type = uri.object_type + if target_type: + referenced_objects.append((target_identifier, target_type)) except Exception as e: - # Don't fail entire object processing for one bad DOR - logging.debug(f"Skipping invalid DOR in {identifier}: {e}") + # Don't fail entire object for one bad DOR + logging.debug(f"Skipping invalid DOR URI in {identifier}: {e}") return { "identifier": identifier, + "file_path": metadata.file_path(export_version=export_version), "object_type": obj_type, - "source_rels": source_rels, - "dor_targets": dor_targets, + "referenced_objects": referenced_objects, } except Exception as e: @@ -289,6 +370,20 @@ def close(self) -> None: finally: self._persistent_zip = None + def __del__(self): + """Ensure the persistent ZIP file is closed when the accessor is garbage collected.""" + try: + self.close() + except Exception: + pass + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + return False + class _MetadataManager: """ @@ -315,11 +410,14 @@ def __init__(self, zip_accessor: _ZipFileAccessor, stats: EpcStreamingStats): # Object metadata storage self._metadata: Dict[str, EpcObjectMetadata] = {} # identifier -> metadata self._uuid_index: Dict[str, List[str]] = {} # uuid -> list of identifiers - self._type_index: Dict[str, List[str]] = {} # object_type -> list of identifiers self._core_props: Optional[CoreProperties] = None - self._core_props_path: Optional[str] = None + self._export_version = EpcExportVersion.CLASSIC # Store export version, default set to CLASSIC + + def 
set_export_version(self, version: EpcExportVersion) -> None: + """Set the export version.""" + self._export_version = version - def load_metadata(self) -> None: + def load_metadata(self, detect_export_version: bool = True) -> None: """Load object metadata from [Content_Types].xml without loading actual objects.""" try: with self.zip_accessor.get_zip_file() as zf: @@ -333,6 +431,21 @@ def load_metadata(self) -> None: self._process_energyml_object_metadata(zf, override) elif self._is_core_properties(override.content_type): self._process_core_properties_metadata(override) + else: + logging.debug( + f"Epc_StreamReader @load_metadata Skipping non-EnergyML content type: {override.content_type}" + ) + + # checking export version + if ( + detect_export_version + and self._export_version == EpcExportVersion.CLASSIC + and override.part_name.startswith( + (EXPANDED_EXPORT_FOLDER_PREFIX, f"/{EXPANDED_EXPORT_FOLDER_PREFIX}") + ) + ): + logging.debug(f"Detected EXPANDED EPC version based on path: {override.part_name}") + self._export_version = EpcExportVersion.EXPANDED self.stats.total_objects = len(self._metadata) @@ -340,137 +453,32 @@ def load_metadata(self) -> None: logging.error(f"Failed to load metadata from EPC file: {e}") raise - def _read_content_types(self, zf: zipfile.ZipFile) -> Types: - """Read and parse [Content_Types].xml file.""" - content_types_path = get_epc_content_type_path() - - try: - content_data = zf.read(content_types_path) - self.stats.bytes_read += len(content_data) - return read_energyml_xml_bytes(content_data, Types) - except KeyError: - # Try case-insensitive search - for name in zf.namelist(): - if name.lower() == content_types_path.lower(): - content_data = zf.read(name) - self.stats.bytes_read += len(content_data) - return read_energyml_xml_bytes(content_data, Types) - raise FileNotFoundError("No [Content_Types].xml found in EPC file") - - def _process_energyml_object_metadata(self, zf: zipfile.ZipFile, override: Override) -> None: - """Process 
metadata for an EnergyML object without loading it.""" - if not override.part_name or not override.content_type: - return - - file_path = override.part_name.lstrip("/") - content_type = override.content_type - - try: - # Quick peek to extract UUID and version without full parsing - uuid, version, obj_type = self._extract_object_info_fast(zf, file_path, content_type) - - if uuid: # Only process if we successfully extracted UUID - metadata = EpcObjectMetadata( - uuid=uuid, object_type=obj_type, content_type=content_type, file_path=file_path, version=version - ) - - # Store in indexes - identifier = metadata.identifier - if identifier: - self._metadata[identifier] = metadata - - # Update UUID index - if uuid not in self._uuid_index: - self._uuid_index[uuid] = [] - self._uuid_index[uuid].append(identifier) - - # Update type index - if obj_type not in self._type_index: - self._type_index[obj_type] = [] - self._type_index[obj_type].append(identifier) - - except Exception as e: - logging.warning(f"Failed to process metadata for {file_path}: {e}") - - def _extract_object_info_fast( - self, zf: zipfile.ZipFile, file_path: str, content_type: str - ) -> Tuple[Optional[str], Optional[str], str]: - """Fast extraction of UUID and version from XML without full parsing.""" - try: - # Read only the beginning of the file for UUID extraction - with zf.open(file_path) as f: - # Read first chunk (usually sufficient for root element) - chunk = f.read(2048) # 2KB should be enough for root element - self.stats.bytes_read += len(chunk) - - chunk_str = chunk.decode("utf-8", errors="ignore") - - # Extract UUID using optimized regex - uuid_match = OptimizedRegex.UUID_NO_GRP.search(chunk_str) - uuid = uuid_match.group(0) if uuid_match else None - - # Extract version if present - version = None - version_patterns = [ - r'object[Vv]ersion["\']?\s*[:=]\s*["\']([^"\']+)', - ] - - for pattern in version_patterns: - import re - - version_match = re.search(pattern, chunk_str) - if version_match: - 
version = version_match.group(1) - # Ensure version is a string - if not isinstance(version, str): - version = str(version) - break - - # Extract object type from content type - obj_type = self._extract_object_type_from_content_type(content_type) - - return uuid, version, obj_type - - except Exception as e: - logging.debug(f"Fast extraction failed for {file_path}: {e}") - return None, None, "Unknown" - - def _extract_object_type_from_content_type(self, content_type: str) -> str: - """Extract object type from content type string.""" - try: - match = OptimizedRegex.CONTENT_TYPE.search(content_type) - if match: - return match.group("type") - except (AttributeError, KeyError): - pass - return "Unknown" - - def _is_core_properties(self, content_type: str) -> bool: - """Check if content type is CoreProperties.""" - return content_type == "application/vnd.openxmlformats-package.core-properties+xml" - - def _process_core_properties_metadata(self, override: Override) -> None: - """Process core properties metadata.""" - if override.part_name: - self._core_props_path = override.part_name.lstrip("/") - def get_metadata(self, identifier: str) -> Optional[EpcObjectMetadata]: """Get metadata for an object by identifier.""" return self._metadata.get(identifier) - def get_by_uuid(self, uuid: str) -> List[str]: - """Get all identifiers for objects with the given UUID.""" + def get_uuid_identifiers(self, uuid: str) -> List[str]: + """Get all identifiers for objects with the given UUID. + Note: Multiple objects can share the same UUID if there are multiple versions of the same object in the EPC file. 
+ """ return self._uuid_index.get(uuid, []) - def get_by_type(self, object_type: str) -> List[str]: - """Get all identifiers for objects of the given type.""" - return self._type_index.get(object_type, []) + def get_by_qualified_type(self, qualified_type: str) -> List[str]: + """Get all identifiers for objects of the given qualified type.""" + return [m.identifier for m in self._metadata.values() if m.qualified_type == qualified_type] - def list_metadata(self, object_type: Optional[str] = None) -> List[EpcObjectMetadata]: + def list_metadata(self, qualified_type_filter: Optional[str] = None) -> List[EpcObjectMetadata]: """List metadata for all objects, optionally filtered by type.""" - if object_type is None: + if qualified_type_filter is None: return list(self._metadata.values()) - return [self._metadata[identifier] for identifier in self._type_index.get(object_type, [])] + # print(f"Filtering metadata for qualified type: {qualified_type_filter} -- {len(self._metadata.values())}") + # for identifier in self._metadata.keys(): + # print(f"{identifier} with qualified type {self._metadata[identifier].uri.get_qualified_type()}") + return [ + self._metadata[identifier] + for identifier in self._metadata + if self._metadata[identifier].uri.get_qualified_type() == qualified_type_filter + ] def add_metadata(self, metadata: EpcObjectMetadata) -> None: """Add metadata for a new object.""" @@ -481,17 +489,15 @@ def add_metadata(self, metadata: EpcObjectMetadata) -> None: # Update UUID index if metadata.uuid not in self._uuid_index: self._uuid_index[metadata.uuid] = [] - self._uuid_index[metadata.uuid].append(identifier) - - # Update type index - if metadata.object_type not in self._type_index: - self._type_index[metadata.object_type] = [] - self._type_index[metadata.object_type].append(identifier) + if identifier not in self._uuid_index[metadata.uuid]: + self._uuid_index[metadata.uuid].append(identifier) self.stats.total_objects += 1 - def remove_metadata(self, identifier: 
str) -> Optional[EpcObjectMetadata]: + def remove_metadata(self, identifier: Union[str, EpcObjectMetadata]) -> Optional[EpcObjectMetadata]: """Remove metadata for an object. Returns the removed metadata.""" + if isinstance(identifier, EpcObjectMetadata): + identifier = identifier.identifier metadata = self._metadata.pop(identifier, None) if metadata: # Update UUID index @@ -500,12 +506,6 @@ def remove_metadata(self, identifier: str) -> Optional[EpcObjectMetadata]: if not self._uuid_index[metadata.uuid]: del self._uuid_index[metadata.uuid] - # Update type index - if metadata.object_type in self._type_index: - self._type_index[metadata.object_type].remove(identifier) - if not self._type_index[metadata.object_type]: - del self._type_index[metadata.object_type] - self.stats.total_objects -= 1 return metadata @@ -514,35 +514,23 @@ def contains(self, identifier: str) -> bool: """Check if an object with the given identifier exists.""" return identifier in self._metadata - def __len__(self) -> int: - """Return total number of objects.""" - return len(self._metadata) - - def __iter__(self) -> Iterator[str]: - """Iterate over object identifiers.""" - return iter(self._metadata.keys()) - def gen_rels_path_from_metadata(self, metadata: EpcObjectMetadata) -> str: """Generate rels path from object metadata without loading the object.""" - obj_path = metadata.file_path - # Extract folder and filename from the object path - if "/" in obj_path: - obj_folder = obj_path[: obj_path.rindex("/") + 1] - obj_file_name = obj_path[obj_path.rindex("/") + 1 :] - else: - obj_folder = "" - obj_file_name = obj_path - - return f"{obj_folder}_rels/{obj_file_name}.rels" + if not isinstance(metadata, EpcObjectMetadata): + raise ValueError("Metadata must be an instance of EpcObjectMetadata") + return metadata.rels_path(export_version=self._export_version) def gen_rels_path_from_identifier(self, identifier: str) -> Optional[str]: """Generate rels path from object identifier without loading the 
object.""" + if not isinstance(identifier, str): + raise ValueError("Identifier must be a string") metadata = self._metadata.get(identifier) if metadata is None: return None return self.gen_rels_path_from_metadata(metadata) - def get_core_properties(self) -> Optional[CoreProperties]: + @property + def core_properties(self) -> Optional[CoreProperties]: """Get core properties (loaded lazily).""" if self._core_props is None and self._core_props_path: try: @@ -551,10 +539,61 @@ def get_core_properties(self) -> Optional[CoreProperties]: self.stats.bytes_read += len(core_data) self._core_props = read_energyml_xml_bytes(core_data, CoreProperties) except Exception as e: - logging.error(f"Failed to load core properties: {e}") + logging.error(f"Failed to load core properties, creating a default one: {e}") + self._core_props = create_default_core_properties() return self._core_props + @core_properties.setter + def core_properties(self, core_props: CoreProperties) -> None: + """Set core properties (updates immediately in the EPC zip file).""" + self._core_props = core_props + self._write_core_properties_to_zip(core_props) + + def _write_core_properties_to_zip(self, core_props: CoreProperties) -> None: + """Write core properties to the EPC zip file, replacing existing ones.""" + core_props_path = gen_core_props_path() + temp_path = None + + try: + # Create a temporary file for the new zip + with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file: + temp_path = temp_file.name + + # Create new zip and copy all content except old core properties + with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as target_zf: + with self.zip_accessor.get_zip_file() as source_zf: + # Copy all files except the core properties + for item in source_zf.infolist(): + if item.filename != core_props_path: + data = source_zf.read(item.filename) + target_zf.writestr(item, data) + + # Write new core properties + core_props_xml = serialize_xml(core_props) + zip_info = 
zipfile.ZipInfo( + filename=core_props_path, + date_time=datetime.now().timetuple()[:6], + ) + target_zf.writestr(zip_info, core_props_xml) + + # Replace the original file + shutil.move(temp_path, self.zip_accessor.epc_file_path) + + # Reopen the zip file to reflect changes + self.zip_accessor.reopen_persistent_zip() + + logging.info(f"Successfully updated core properties in {self.zip_accessor.epc_file_path}") + + except Exception as e: + # Clean up temp file if it exists + if temp_path and os.path.exists(temp_path): + try: + os.remove(temp_path) + except Exception: + pass + raise IOError(f"Failed to write core properties to EPC: {e}") + def detect_epc_version(self) -> EpcExportVersion: """Detect EPC packaging version based on file structure.""" try: @@ -586,36 +625,145 @@ def detect_epc_version(self) -> EpcExportVersion: logging.warning(f"Failed to detect EPC version, defaulting to CLASSIC: {e}") return EpcExportVersion.CLASSIC - def update_content_types_xml( - self, source_zip: zipfile.ZipFile, metadata: EpcObjectMetadata, add: bool = True - ) -> str: - """Update [Content_Types].xml to add or remove object entry. 
+ def get_content_type(self, zf: zipfile.ZipFile) -> Types: - Args: - source_zip: Open ZIP file to read from - metadata: Object metadata - add: If True, add entry; if False, remove entry + meta_dict_key_path = { + m.file_path(export_version=self._export_version): m.content_type for m in self._metadata.values() + } + other_files_in_epc = set() + for name in zf.namelist(): + if name not in meta_dict_key_path and not name.endswith("rels") and not name == get_epc_content_type_path(): + other_files_in_epc.add(name) + + content_types = create_default_types() + + # creating overrides + for file_path, content_type in meta_dict_key_path.items(): + override = Override(content_type=content_type, part_name=f"/{file_path}") + content_types.override.append(override) + + # Add overrides for other files in EPC that are not in metadata (to preserve them) + for file_path in other_files_in_epc: + # file_extension = os.path.splitext(file_path)[1].lstrip(".").lower() + mime_type = in_epc_file_path_to_mime_type(file_path) + if mime_type: + override = Override(content_type=mime_type, part_name=f"/{file_path}") + content_types.override.append(override) + + return content_types + + # ____ ____ _____ _____ ____________ __ _________________ ______ ____ _____ + # / __ \/ __ \/ _/ | / / |/_ __/ ____/ / |/ / ____/_ __/ / / / __ \/ __ \/ ___/ + # / /_/ / /_/ // / | | / / /| | / / / __/ / /|_/ / __/ / / / /_/ / / / / / / /\__ \ + # / ____/ _, _// / | |/ / ___ |/ / / /___ / / / / /___ / / / __ / /_/ / /_/ /___/ / + # /_/ /_/ |_/___/ |___/_/ |_/_/ /_____/ /_/ /_/_____/ /_/ /_/ /_/\____/_____//____/ - Returns: - Updated [Content_Types].xml as string - """ - # Read existing content types - content_types = self._read_content_types(source_zip) - - if add: - # Add new override entry - new_override = Override() - new_override.part_name = f"/{metadata.file_path}" - new_override.content_type = metadata.content_type - content_types.override.append(new_override) - else: - # Remove override entry - 
content_types.override = [ - override for override in content_types.override if override.part_name != f"/{metadata.file_path}" - ] + def _read_content_types(self, zf: zipfile.ZipFile) -> Types: + """Read and parse [Content_Types].xml file.""" + content_types_path = get_epc_content_type_path() + + try: + content_data = zf.read(content_types_path) + self.stats.bytes_read += len(content_data) + return read_energyml_xml_bytes(content_data, Types) + except KeyError: + # Try case-insensitive search + for name in zf.namelist(): + if name.lower() == content_types_path.lower(): + content_data = zf.read(name) + self.stats.bytes_read += len(content_data) + return read_energyml_xml_bytes(content_data, Types) + raise FileNotFoundError(f"No {content_types_path} found in EPC file") + + def _process_energyml_object_metadata(self, zf: zipfile.ZipFile, override: Override) -> None: + """Process metadata for an EnergyML object without loading it.""" + if not override.part_name or not override.content_type: + return + + file_path = override.part_name.lstrip("/") + content_type = override.content_type + + uuid, version, title, last_changed = None, None, None, None + + try: + # First try to extract UUID and version from file path (works for EXPANDED mode) + uuid, version = extract_uuid_and_version_from_obj_path(file_path) + + # For CLASSIC mode, version is not in the path, so we need to extract it from XML content + # => Finally I do it anyway to get the title. 
+ try: + # Read first chunk of XML to extract version, title, and last_changed in one regex search + with zf.open(file_path) as f: + chunk = f.read() # 4KB to increase chance of catching citation block + # chunk = f.read(4096) # 4KB to increase chance of catching citation block + self.stats.bytes_read += len(chunk) + chunk_str = chunk.decode("utf-8", errors="ignore") + + # Single regex with named groups for version, title, and last_changed + pattern = re.compile( + r'object[Vv]ersion["\']?\s*[:=]\s*["\'](?P[^"\']+)' # version attribute + r"|(?P.*?)</eml:Title>" # eml:Title tag + r"|<eml:LastUpdate>(?P<last_changed>.*?)</eml:LastUpdate>", + re.DOTALL, + ) + + # Iterate all matches and assign the first found for each group + found = {"version": None, "title": None, "last_changed": None} + for match in pattern.finditer(chunk_str): + for key in found: + if found[key] is None and match.group(key) is not None: + found[key] = match.group(key).strip() + if version is None and found["version"] is not None: + version = found["version"] + if found["title"] is not None: + title = found["title"] + if found["last_changed"] is not None: + last_changed = found["last_changed"] + # Try to parse as datetime if possible + try: + last_changed = date_to_datetime(last_changed) + except Exception: + pass + + except Exception as e: + logging.debug(f"Failed to extract version/title/last_update from XML content for {file_path}: {e}") + + if uuid: # Only process if we successfully extracted UUID + uri = create_uri_from_content_type_or_qualified_type(ct_or_qt=content_type, uuid=uuid, version=version) + + # print(f"Loaded metadata for {uri} ({type(uri)}) with title '{title}' and last changed '{last_changed}'") + metadata = EpcObjectMetadata(uri=uri, title=title, last_changed=last_changed) + + # Store in indexes + identifier = metadata.identifier + if identifier: + self._metadata[identifier] = metadata + + # Update UUID index + if uuid not in self._uuid_index: + self._uuid_index[uuid] = [] + 
self._uuid_index[uuid].append(identifier) + + except Exception as e: + traceback.print_exc() + logging.warning(f"Failed to process metadata for {file_path}: {e}") + + def _is_core_properties(self, content_type: str) -> bool: + """Check if content type is CoreProperties.""" + return content_type == MimeType.CORE_PROPERTIES.value + + def _process_core_properties_metadata(self, override: Override) -> None: + """Process core properties metadata.""" + if override.part_name: + self._core_props_path = override.part_name.lstrip("/") + + def __len__(self) -> int: + """Return total number of objects.""" + return len(self._metadata) - # Serialize back to XML - return serialize_xml(content_types) + def __iter__(self) -> Iterator[str]: + """Iterate over object identifiers.""" + return iter(self._metadata.keys()) class _RelationshipManager: @@ -623,7 +771,7 @@ class _RelationshipManager: Internal helper class for managing relationships between objects. This class handles: - - Reading relationships from .rels files + - Reading relationships from energyml.utils.rels files - Writing relationship updates - Supporting 3 update modes (UPDATE_AT_MODIFICATION, UPDATE_ON_CLOSE, MANUAL) - Preserving EXTERNAL_RESOURCE relationships @@ -635,7 +783,6 @@ def __init__( zip_accessor: _ZipFileAccessor, metadata_manager: _MetadataManager, stats: EpcStreamingStats, - export_version: EpcExportVersion, rels_update_mode: RelsUpdateMode, ): """ @@ -645,19 +792,17 @@ def __init__( zip_accessor: ZIP file accessor for reading/writing metadata_manager: Metadata manager for object lookups stats: Statistics tracker - export_version: EPC export version rels_update_mode: Relationship update mode """ self.zip_accessor = zip_accessor self.metadata_manager = metadata_manager self.stats = stats - self.export_version = export_version self.rels_update_mode = rels_update_mode # Additional rels management (for user-added relationships) self.additional_rels: Dict[str, List[Relationship]] = {} - def get_obj_rels(self, 
obj_identifier: str, rels_path: Optional[str] = None) -> List[Relationship]: + def get_obj_rels(self, obj_identifier: Optional[str] = None, rels_path: Optional[str] = None) -> List[Relationship]: """ Get all relationships for a given object. Merges relationships from the EPC file with in-memory additional relationships. @@ -665,7 +810,7 @@ def get_obj_rels(self, obj_identifier: str, rels_path: Optional[str] = None) -> rels = [] # Read rels from EPC file - if rels_path is None: + if rels_path is None and obj_identifier is not None: rels_path = self.metadata_manager.gen_rels_path_from_identifier(obj_identifier) if rels_path is not None: @@ -680,8 +825,8 @@ def get_obj_rels(self, obj_identifier: str, rels_path: Optional[str] = None) -> pass # Merge with in-memory additional relationships - if obj_identifier in self.additional_rels: - rels.extend(self.additional_rels[obj_identifier]) + if obj_identifier is not None and obj_identifier in self.additional_rels: + rels = self.merge_rels(rels, self.additional_rels[obj_identifier]) return rels @@ -693,305 +838,109 @@ def update_rels_for_new_object(self, obj: Any, obj_identifier: str) -> None: return # Get all objects this new object references - direct_dors = get_direct_dor_list(obj) - - # Build SOURCE relationships for this object - source_relationships = [] - dest_updates: Dict[str, Relationship] = {} - - for dor in direct_dors: - try: - target_identifier = get_obj_identifier(dor) - if not self.metadata_manager.contains(target_identifier): - continue + dest_target_uris = get_dor_uris_from_obj(obj) + # logging.debug(f"Updating relationships for new object {obj_identifier}, found DOR targets: {dest_target_uris}") - target_metadata = self.metadata_manager.get_metadata(target_identifier) - if not target_metadata: - continue + obj_file_path = metadata.file_path(export_version=self.metadata_manager._export_version) - # Create SOURCE relationship - source_rel = Relationship( - target=target_metadata.file_path, - 
type_value=EPCRelsRelationshipType.SOURCE_OBJECT.get_type(), - id=f"_{obj_identifier}_{get_obj_type(get_obj_usable_class(dor))}_{target_identifier}", - ) - source_relationships.append(source_rel) + dest_rels = [] + source_relationships = {} + for target_uri in dest_target_uris: + target_path = gen_energyml_object_path(target_uri, export_version=self.metadata_manager._export_version) - # Create DESTINATION relationship - dest_rel = Relationship( - target=metadata.file_path, - type_value=EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(), - id=f"_{target_identifier}_{get_obj_type(get_obj_usable_class(obj))}_{obj_identifier}", - ) - dest_updates[target_identifier] = dest_rel + dest_rel = Relationship( + target=target_path, + type_value=get_rels_dor_type(dor_target=target_path, in_dor_owner_rels_file=True), + id=f"_{gen_uuid()}", + ) + dest_rels.append(dest_rel) - except Exception as e: - logging.warning(f"Failed to create relationship for DOR: {e}") + source_relationships[target_path] = Relationship( + target=obj_file_path, + type_value=get_rels_dor_type(dor_target=target_path, in_dor_owner_rels_file=False), + id=f"_{gen_uuid()}", + ) # Write updates - self.write_rels_updates(obj_identifier, source_relationships, dest_updates) + self._write_rels_updates( + current_object_id=obj_identifier, + current_rels_additions=dest_rels, + target_path_rels_additions=source_relationships, + ) - def update_rels_for_modified_object(self, obj: Any, obj_identifier: str, old_dors: List[Any]) -> None: + def update_rels_for_modified_object(self, obj: Any, obj_identifier: str) -> None: """Update relationships when an object is modified (UPDATE_AT_MODIFICATION mode).""" metadata = self.metadata_manager.get_metadata(obj_identifier) if not metadata: logging.warning(f"Metadata not found for {obj_identifier}") return - # Get new DORs - new_dors = get_direct_dor_list(obj) + obj_path = metadata.file_path(export_version=self.metadata_manager._export_version) - # Convert to sets of identifiers 
for comparison - old_dor_ids = { - get_obj_identifier(dor) for dor in old_dors if self.metadata_manager.contains(get_obj_identifier(dor)) - } - new_dor_ids = { - get_obj_identifier(dor) for dor in new_dors if self.metadata_manager.contains(get_obj_identifier(dor)) + previous_dest_rels_target_path = { + r.target + for r in self.get_obj_rels(obj_identifier) + if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT) and r.target is not None } - - # Find added and removed references - added_dor_ids = new_dor_ids - old_dor_ids - removed_dor_ids = old_dor_ids - new_dor_ids + # Latest DORs from the modified object + dest_target_uris = get_dor_uris_from_obj(obj) + # logging.debug(f"Updating relationships for new object {obj_identifier}, found DOR targets: {dest_target_uris}") # Build new SOURCE relationships - source_relationships = [] - dest_updates: Dict[str, Relationship] = {} + current_rels_additions: List[Relationship] = [] + reversed_source_relationships: Dict[str, Relationship] = {} # Create relationships for all new DORs - for dor in new_dors: - target_identifier = get_obj_identifier(dor) - if not self.metadata_manager.contains(target_identifier): - continue - - target_metadata = self.metadata_manager.get_metadata(target_identifier) - if not target_metadata: - continue - - # SOURCE relationship - source_rel = Relationship( - target=target_metadata.file_path, - type_value=EPCRelsRelationshipType.SOURCE_OBJECT.get_type(), - id=f"_{obj_identifier}_{get_obj_type(get_obj_usable_class(dor))}_{target_identifier}", + for target_uri in dest_target_uris: + target_path = gen_energyml_object_path(target_uri, export_version=self.metadata_manager._export_version) + + # DESTINATION relationship : current is referenced by + dest_rel = Relationship( + target=target_path, + type_value=get_rels_dor_type(dor_target=target_path, in_dor_owner_rels_file=True), + id=f"_{gen_uuid()}", ) - source_relationships.append(source_rel) - - # DESTINATION relationship (for added DORs only) 
- if target_identifier in added_dor_ids: - dest_rel = Relationship( - target=metadata.file_path, - type_value=EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(), - id=f"_{target_identifier}_{get_obj_type(get_obj_usable_class(obj))}_{obj_identifier}", + current_rels_additions.append(dest_rel) + + if target_path not in previous_dest_rels_target_path: + # REVERSED SOURCE relationship : target references current, if not already existing (to avoid duplicates if DORs are not changed for this target) + source_rel = Relationship( + target=obj_path, + type_value=get_rels_dor_type(dor_target=target_path, in_dor_owner_rels_file=False), + id=f"_{gen_uuid()}", ) - dest_updates[target_identifier] = dest_rel + reversed_source_relationships[target_path] = source_rel - # For removed DORs, remove DESTINATION relationships - removals: Dict[str, str] = {} - for removed_id in removed_dor_ids: - removals[removed_id] = f"_{removed_id}_.*_{obj_identifier}" + # list previous dest that does not exist anymore in the modified object, to remove the corresponding reversed source relationship on target side + outdated_dors_targets_paths = previous_dest_rels_target_path - reversed_source_relationships.keys() # Write updates - self.write_rels_updates(obj_identifier, source_relationships, dest_updates, removals) + self._write_rels_updates( + current_object_id=obj_identifier, + current_rels_additions=list(current_rels_additions), + target_path_rels_additions=reversed_source_relationships, + target_path_rels_removals=outdated_dors_targets_paths, + ) - def update_rels_for_removed_object(self, obj_identifier: str, obj: Optional[Any] = None) -> None: + def update_rels_for_removed_object(self, obj_identifier: str) -> None: """Update relationships when an object is removed (UPDATE_AT_MODIFICATION mode).""" - if obj is None: - # Object must be provided for removal - logging.warning(f"Cannot update rels for removed object {obj_identifier}: object not provided") - return - - # Get all objects this object 
references - direct_dors = get_direct_dor_list(obj) + current_rels = self.get_obj_rels(obj_identifier) # Ensure we have the latest relationships loaded - # Build removal patterns for DESTINATION relationships - removals: Dict[str, str] = {} - for dor in direct_dors: - try: - target_identifier = get_obj_identifier(dor) - if not self.metadata_manager.contains(target_identifier): - continue + dest_rels = [r for r in current_rels if r.type_value == EPCRelsRelationshipType.DESTINATION_OBJECT.get_type()] - removals[target_identifier] = f"_{target_identifier}_.*_{obj_identifier}" - - except Exception as e: - logging.warning(f"Failed to process DOR for removal: {e}") - - # Write updates - self.write_rels_updates(obj_identifier, [], {}, removals, delete_source_rels=True) - - def write_rels_updates( - self, - source_identifier: str, - source_relationships: List[Relationship], - dest_updates: Dict[str, Relationship], - removals: Optional[Dict[str, str]] = None, - delete_source_rels: bool = False, - ) -> None: - """Write relationship updates to the EPC file efficiently.""" - import re - - removals = removals or {} - rels_updates: Dict[str, str] = {} - files_to_delete: List[str] = [] - - with self.zip_accessor.get_zip_file() as zf: - # 1. 
Handle source object's rels file - if not delete_source_rels: - source_rels_path = self.metadata_manager.gen_rels_path_from_identifier(source_identifier) - if source_rels_path: - # Read existing rels (excluding SOURCE_OBJECT type) - existing_rels = [] - try: - if source_rels_path in zf.namelist(): - rels_data = zf.read(source_rels_path) - existing_rels_obj = read_energyml_xml_bytes(rels_data, Relationships) - if existing_rels_obj and existing_rels_obj.relationship: - # Keep only non-SOURCE relationships - existing_rels = [ - r - for r in existing_rels_obj.relationship - if r.type_value != EPCRelsRelationshipType.SOURCE_OBJECT.get_type() - ] - except Exception: - pass - - # Combine with new SOURCE relationships - all_rels = existing_rels + source_relationships - if all_rels: - rels_updates[source_rels_path] = serialize_xml(Relationships(relationship=all_rels)) - elif source_rels_path in zf.namelist() and not all_rels: - files_to_delete.append(source_rels_path) - else: - # Mark source rels file for deletion - source_rels_path = self.metadata_manager.gen_rels_path_from_identifier(source_identifier) - if source_rels_path: - files_to_delete.append(source_rels_path) - - # 2. 
Handle destination updates - for target_identifier, dest_rel in dest_updates.items(): - target_rels_path = self.metadata_manager.gen_rels_path_from_identifier(target_identifier) - if not target_rels_path: - continue - - # Read existing rels - existing_rels = [] - try: - if target_rels_path in zf.namelist(): - rels_data = zf.read(target_rels_path) - existing_rels_obj = read_energyml_xml_bytes(rels_data, Relationships) - if existing_rels_obj and existing_rels_obj.relationship: - existing_rels = list(existing_rels_obj.relationship) - except Exception: - pass - - # Add new DESTINATION relationship if not already present - rel_exists = any( - r.target == dest_rel.target and r.type_value == dest_rel.type_value for r in existing_rels - ) - - if not rel_exists: - existing_rels.append(dest_rel) - rels_updates[target_rels_path] = serialize_xml(Relationships(relationship=existing_rels)) - - # 3. Handle removals - for target_identifier, pattern in removals.items(): - target_rels_path = self.metadata_manager.gen_rels_path_from_identifier(target_identifier) - if not target_rels_path: - continue - - # Read existing rels - existing_rels = [] - try: - if target_rels_path in zf.namelist(): - rels_data = zf.read(target_rels_path) - existing_rels_obj = read_energyml_xml_bytes(rels_data, Relationships) - if existing_rels_obj and existing_rels_obj.relationship: - existing_rels = list(existing_rels_obj.relationship) - except Exception: - pass - - # Filter out relationships matching the pattern - regex = re.compile(pattern) - filtered_rels = [r for r in existing_rels if not (r.id and regex.match(r.id))] - - if len(filtered_rels) != len(existing_rels): - if filtered_rels: - rels_updates[target_rels_path] = serialize_xml(Relationships(relationship=filtered_rels)) - else: - files_to_delete.append(target_rels_path) - - # Write updates to EPC file - if rels_updates or files_to_delete: - with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file: - temp_path = temp_file.name - - 
try: - with self.zip_accessor.get_zip_file() as source_zf: - with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as target_zf: - # Copy all files except those to delete or update - files_to_skip = set(files_to_delete) - for item in source_zf.infolist(): - if item.filename not in files_to_skip and item.filename not in rels_updates: - data = source_zf.read(item.filename) - target_zf.writestr(item, data) - - # Write updated rels files - for rels_path, rels_xml in rels_updates.items(): - target_zf.writestr(rels_path, rels_xml) - - # Replace original - shutil.move(temp_path, self.zip_accessor.epc_file_path) - self.zip_accessor.reopen_persistent_zip() - - except Exception as e: - if os.path.exists(temp_path): - os.unlink(temp_path) - logging.error(f"Failed to write rels updates: {e}") - raise - - def compute_object_rels(self, obj: Any, obj_identifier: str) -> List[Relationship]: - """ - Compute relationships for a given object (SOURCE relationships). - This object references other objects through DORs. 
- - Args: - obj: The EnergyML object - obj_identifier: The identifier of the object - - Returns: - List of Relationship objects for this object's .rels file - """ - rels = [] - - # Get all DORs (Data Object References) in this object - direct_dors = get_direct_dor_list(obj) - - for dor in direct_dors: - try: - target_identifier = get_obj_identifier(dor) - - # Get target file path from metadata without processing DOR - # The relationship target should be the object's file path, not its rels path - if self.metadata_manager.contains(target_identifier): - target_metadata = self.metadata_manager.get_metadata(target_identifier) - if target_metadata: - target_path = target_metadata.file_path - else: - target_path = gen_energyml_object_path(dor, self.export_version) - else: - # Fall back to generating path from DOR if metadata not found - target_path = gen_energyml_object_path(dor, self.export_version) - - # Create SOURCE relationship (this object -> target object) - rel = Relationship( - target=target_path, - type_value=EPCRelsRelationshipType.SOURCE_OBJECT.get_type(), - id=f"_{obj_identifier}_{get_obj_type(get_obj_usable_class(dor))}_{target_identifier}", - ) - rels.append(rel) - except Exception as e: - logging.warning(f"Failed to create relationship for DOR in {obj_identifier}: {e}") - - return rels + # Write updates + self._write_rels_updates( + current_object_id=obj_identifier, + target_path_rels_removals=[ + r.target + for r in current_rels + if r.target is not None and r.type_value == EPCRelsRelationshipType.DESTINATION_OBJECT.get_type() + ], + delete_current_obj_rels_file_and_file=len(dest_rels) + == len( + current_rels + ), # If all relationships are DESTINATION_OBJECT, we can delete the .rels file entirely. 
If some source rels exists, we keep it to ease potential add of this element later, to avoid parsing all reals to find its sources rels from other object DEST rels + ) def merge_rels(self, new_rels: List[Relationship], existing_rels: List[Relationship]) -> List[Relationship]: """Merge new relationships with existing ones, avoiding duplicates and ensuring unique IDs. @@ -1023,6 +972,176 @@ def merge_rels(self, new_rels: List[Relationship], existing_rels: List[Relations return merged + def _write_rels_updates( + self, + current_object_id: str, + current_rels_additions: Optional[List[Relationship]] = None, + current_rels_removals: Optional[Union[List[str], Set[str]]] = None, + target_path_rels_additions: Optional[Dict[str, Relationship]] = None, + target_path_rels_removals: Optional[Union[List[str], Set[str]]] = None, + delete_current_obj_rels_file_and_file: bool = False, + ) -> None: + """Write relationship updates to the EPC file efficiently. + + Args: + current_object_id: Identifier of the object being modified/added/removed + current_rels_additions: List of Relationship objects to add to the current object's .rels file + current_rels_removals: List or set of relationship ID patterns to remove from the current object's .rels file + target_path_rels_additions: Dict mapping target object file paths (not the .rels path) to Relationship objects to add to their .rels files (for SOURCE relationships) + target_path_rels_removals: List or set of relationship ID patterns to remove from target objects' .rels files (for SOURCE relationships) + delete_current_obj_rels_file_and_file: If True, deletes the current object's .rels file entirely (if contains only DEST relations) and the object file iteself + + + """ + # Implementation of this method would involve: + # - Reading existing .rels files for current and target objects + # - Merging additions and removals while preserving EXTERNAL_RESOURCE relationships + # - Writing back updated .rels files to the ZIP (either by modifying 
in place or rebuilding) + # - Handling different update modes (immediate vs on close) + + # 1st : debug log the inputs + # logging.debug( + # f"Writing rels updates for current_object_id={current_object_id}, current_rels_additions={current_rels_additions}, current_rels_removals={current_rels_removals}, target_path_rels_additions={target_path_rels_additions}, target_path_rels_removals={target_path_rels_removals}, delete_current_obj_rels_file_and_file={delete_current_obj_rels_file_and_file}\n\n" + # ) + + current_obj_meta = self.metadata_manager.get_metadata(current_object_id) + if not current_obj_meta: + logging.warning(f"Metadata not found for {current_object_id}, cannot write rels updates") + return + current_object_path = current_obj_meta.file_path(export_version=self.metadata_manager._export_version) + current_rels_path = self.metadata_manager.gen_rels_path_from_metadata(current_obj_meta) + + current_obj_actual_rels = self.get_obj_rels(current_object_id, rels_path=current_rels_path) + + current_updated_rels = ( + self.merge_rels(current_rels_additions, current_obj_actual_rels) + if current_rels_additions + else current_obj_actual_rels + ) + if current_rels_removals: + for removal_obj_id in current_rels_removals: + target_metadata = self.metadata_manager.get_metadata(removal_obj_id) + target_path = ( + target_metadata.file_path(export_version=self.metadata_manager._export_version) + if target_metadata + else None + ) + if target_path: + current_updated_rels = [ + r for r in current_updated_rels if r.target is not None and (target_path not in r.target) + ] + + # Now handle target objects' .rels updates + targets_new_rels_to_path: Dict[str, List[Relationship]] = {} + # First, get existing rels for all target objects + if target_path_rels_additions or target_path_rels_removals: + target_ids = set() + if target_path_rels_additions: + target_ids.update(target_path_rels_additions.keys()) + if target_path_rels_removals: + target_ids.update(target_path_rels_removals) + + 
for target_id in target_ids: + # we authorize to pass a rels path directly as target_id in target_rels_additions for more flexibility, but if it's not the case we try to find target metadata and generate rels path from it + target_rels_path = None + if target_id.endswith(".xml"): + target_rels_path = gen_rels_path_from_obj_path(target_id) + elif target_id.endswith(".rels"): + target_rels_path = target_id + else: + target_meta = self.metadata_manager.get_metadata(target_id) + if not target_meta: + logging.warning( + f"Metadata not found for target {target_id}, skipping rels updates for this target" + ) + continue + target_rels_path = self.metadata_manager.gen_rels_path_from_metadata(target_meta) + existing_target_rels = self.get_obj_rels(rels_path=target_rels_path) + + # Merge additions and removals for this target + updated_target_rels = existing_target_rels + if target_path_rels_additions and target_id in target_path_rels_additions: + updated_target_rels = self.merge_rels([target_path_rels_additions[target_id]], updated_target_rels) + if target_path_rels_removals and target_id in target_path_rels_removals: + # TODO: maybe we should be able to support non energyml objects and take target path to remove in a tuple in target_rels_removals instead of target_id only ? + updated_target_rels = [r for r in updated_target_rels if r.target != current_object_path] + + targets_new_rels_to_path[target_rels_path] = updated_target_rels + + files_to_delete = [] + if delete_current_obj_rels_file_and_file: + files_to_delete.append(current_object_path) + if ( + len( + [r for r in current_updated_rels if r.type_value != str(EPCRelsRelationshipType.DESTINATION_OBJECT)] + ) + == 0 + ): + # if current object must be removed and its rels file had only dest relationship. We can delete the rels file as well. 
+ files_to_delete.append(current_rels_path) + + rels_updates = {} + if current_rels_additions is not None or current_rels_removals is not None: + rels_updates = {current_rels_path: serialize_xml(Relationships(relationship=current_updated_rels))} + for target_rels_path, updated_rels in targets_new_rels_to_path.items(): + rels_updates[target_rels_path] = serialize_xml(Relationships(relationship=updated_rels)) + + files_to_skip = set(files_to_delete).union(set(rels_updates.keys())) + + # logging.debug( + # f"====\nFiles to delete: {files_to_delete}, rels updates to write: {list(rels_updates.keys())}, files to skip in copy: {files_to_skip}\n\n" + # ) + + # Write in tmp file and then replace original to minimize I/O and handle multiple updates in one operation + with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file: + temp_path = temp_file.name + try: + with self.zip_accessor.get_zip_file() as source_zf: + with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as target_zf: + # Copy all files except those to delete or update + ct_xml = None + + for item in source_zf.infolist(): + if get_epc_content_type_path() in item.filename: + ct_xml = source_zf.read(item.filename) + elif item.filename not in files_to_skip and item.filename not in rels_updates: + data = source_zf.read(item.filename) + target_zf.writestr(item, data) + + # Write updated rels files + for rels_path, rels_xml in rels_updates.items(): + target_zf.writestr(rels_path, rels_xml) + # logging.debug(f"Wrote updated rels file: {rels_path} -> {rels_xml}") + + if delete_current_obj_rels_file_and_file: + ct_object: Optional[Types] = None + if ct_xml is not None: + # remove the object entry from [Content_Types].xml if the object file is deleted + ct_object = read_energyml_xml_bytes(ct_xml, Types) + if ct_object is not None: + ct_object.override = [ + o for o in ct_object.override if current_object_path not in (o.part_name or "") + ] + + if ct_object is None: + ct_object = 
self.metadata_manager.get_content_type(target_zf) + ct_xml = serialize_xml(ct_object) + + if ct_xml is None: + ct_xml = serialize_xml(self.metadata_manager.get_content_type(target_zf)) + + target_zf.writestr(get_epc_content_type_path(), ct_xml) + + # Replace original + shutil.move(temp_path, self.zip_accessor.epc_file_path) + self.zip_accessor.reopen_persistent_zip() + + except Exception as e: + if os.path.exists(temp_path): + os.unlink(temp_path) + logging.error(f"Failed to write rels updates: {e}") + raise + # =========================================================================================== # MAIN CLASS (REFACTORED TO USE HELPER CLASSES) @@ -1030,79 +1149,30 @@ def merge_rels(self, new_rels: List[Relationship], existing_rels: List[Relations class EpcStreamReader(EnergymlStorageInterface): - """ - Memory-efficient EPC file reader with lazy loading and smart caching. - - This class provides the same interface as the standard Epc class but loads - objects on-demand rather than keeping everything in memory. Perfect for - handling very large EPC files with thousands of objects. - - Features: - - Lazy loading: Objects loaded only when accessed - - Smart caching: LRU cache with configurable size - - Memory monitoring: Track memory usage and cache efficiency - - Streaming validation: Validate objects without full loading - - Batch operations: Efficient bulk operations - - Context management: Automatic resource cleanup - - Flexible relationship management: Three modes for updating object relationships - - Relationship Update Modes: - - UPDATE_AT_MODIFICATION: Maintains relationships in real-time as objects are added/removed/modified. - Best for maintaining consistency but may be slower for bulk operations. - - UPDATE_ON_CLOSE: Rebuilds all relationships when closing the EPC file (default). - More efficient for bulk operations but relationships only consistent after closing. - - MANUAL: No automatic relationship updates. 
User must manually call rebuild_all_rels(). - Maximum control and performance for advanced use cases. - - Performance optimizations: - - Pre-compiled regex patterns for 15-75% faster parsing - - Weak references to prevent memory leaks - - Compressed metadata storage - - Efficient ZIP file handling - """ - def __init__( self, epc_file_path: Union[str, Path], - cache_size: int = 100, - validate_on_load: bool = True, - preload_metadata: bool = True, - export_version: EpcExportVersion = EpcExportVersion.CLASSIC, - force_h5_path: Optional[str] = None, - keep_open: bool = False, - force_title_load: bool = False, rels_update_mode: RelsUpdateMode = RelsUpdateMode.UPDATE_ON_CLOSE, + force_h5_path: Optional[str] = None, + export_version: EpcExportVersion = EpcExportVersion.CLASSIC, enable_parallel_rels: bool = False, parallel_worker_ratio: int = 10, + cache_size: int = 100, + # preload_metadata: bool = True, + keep_open: bool = True, + force_title_load: bool = False, ): - """ - Initialize the EPC stream reader. - - Args: - epc_file_path: Path to the EPC file - cache_size: Maximum number of objects to keep in memory cache - validate_on_load: Whether to validate objects when loading - preload_metadata: Whether to preload all object metadata - export_version: EPC packaging version (CLASSIC or EXPANDED) - force_h5_path: Optional forced HDF5 file path for external resources. If set, all arrays will be read/written from/to this path. - keep_open: If True, keeps the ZIP file open for better performance with multiple operations. File is closed only when instance is deleted or close() is called. 
- force_title_load: If True, forces loading object titles when listing objects (may impact performance) - rels_update_mode: Mode for updating relationships (UPDATE_AT_MODIFICATION, UPDATE_ON_CLOSE, or MANUAL) - enable_parallel_rels: If True, uses parallel processing for rebuild_all_rels() operations (faster for large EPCs) - parallel_worker_ratio: Number of objects per worker process (default: 10). Lower values = more workers. Only used when enable_parallel_rels=True. - """ # Public attributes self.epc_file_path = Path(epc_file_path) self.enable_parallel_rels = enable_parallel_rels self.parallel_worker_ratio = parallel_worker_ratio self.cache_size = cache_size - self.validate_on_load = validate_on_load self.force_h5_path = force_h5_path self.cache_opened_h5 = None self.keep_open = keep_open self.force_title_load = force_title_load - self.rels_update_mode = rels_update_mode - self.export_version: EpcExportVersion = export_version or EpcExportVersion.CLASSIC + # Note: rels_update_mode will be set on _rels_mgr when it's created below + # self.export_version: EpcExportVersion = export_version or EpcExportVersion.CLASSIC self.stats = EpcStreamingStats() # Caching system using weak references @@ -1110,1704 +1180,888 @@ def __init__( self._access_order: List[str] = [] # LRU tracking is_new_file = False - + # ===================================== # Validate file exists and is readable + # ===================================== if not self.epc_file_path.exists(): - logging.info(f"EPC file not found: {epc_file_path}. Creating a new empty EPC file.") - self._create_empty_epc() + logging.info(f"EPC file not found: {self.epc_file_path}. 
Creating a new empty EPC file.") + create_mandatory_structure_epc(self.epc_file_path) is_new_file = True if not zipfile.is_zipfile(self.epc_file_path): - raise ValueError(f"File is not a valid ZIP/EPC file: {epc_file_path}") + raise ValueError(f"File is not a valid ZIP/EPC file: {self.epc_file_path}") - # Check if the ZIP file has the required EPC structure - if not is_new_file: - try: - with zipfile.ZipFile(self.epc_file_path, "r") as zf: - content_types_path = get_epc_content_type_path() - if content_types_path not in zf.namelist(): - logging.info("EPC file is missing required structure. Initializing empty EPC file.") - self._create_empty_epc() - is_new_file = True - except Exception as e: - logging.warning(f"Failed to check EPC structure: {e}. Reinitializing.") - - # Initialize helper classes (internal architecture) - self._zip_accessor = _ZipFileAccessor(self.epc_file_path, keep_open=keep_open) - self._metadata_mgr = _MetadataManager(self._zip_accessor, self.stats) - self._rels_mgr = _RelationshipManager( - self._zip_accessor, self._metadata_mgr, self.stats, self.export_version, rels_update_mode - ) + # validate mandatory files and structure, and auto-repair if enabled - # Initialize by loading metadata - if not is_new_file and preload_metadata: - self._metadata_mgr.load_metadata() - # Detect EPC version after loading metadata - self.export_version = self._metadata_mgr.detect_epc_version() - # Update relationship manager's export version - self._rels_mgr.export_version = self.export_version + repair_epc_structure_if_not_valid(self.epc_file_path) - # Open persistent ZIP connection if keep_open is enabled + self._zip_accessor = _ZipFileAccessor(self.epc_file_path, keep_open=keep_open) if keep_open and not is_new_file: self._zip_accessor.open_persistent_connection() - # Backward compatibility: expose internal structures as properties - # This allows existing code to access _metadata, _uuid_index, etc. 
- self._metadata = self._metadata_mgr._metadata - self._uuid_index = self._metadata_mgr._uuid_index - self._type_index = self._metadata_mgr._type_index - self.additional_rels = self._rels_mgr.additional_rels - - def _create_empty_epc(self) -> None: - """Create an empty EPC file structure.""" - # Ensure directory exists - self.epc_file_path.parent.mkdir(parents=True, exist_ok=True) - - with zipfile.ZipFile(self.epc_file_path, "w") as zf: - # Create [Content_Types].xml - content_types = Types() - content_types_xml = serialize_xml(content_types) - zf.writestr(get_epc_content_type_path(), content_types_xml) - - # Create _rels/.rels - rels = Relationships() - rels_xml = serialize_xml(rels) - zf.writestr("_rels/.rels", rels_xml) - - def _load_metadata(self) -> None: - """Load object metadata from [Content_Types].xml without loading actual objects.""" - # Delegate to metadata manager - self._metadata_mgr.load_metadata() + # ===================================== - def _read_content_types(self, zf: zipfile.ZipFile) -> Types: - """Read and parse [Content_Types].xml file.""" - # Delegate to metadata manager - return self._metadata_mgr._read_content_types(zf) + self._metadata_mgr = _MetadataManager(self._zip_accessor, self.stats) + self._metadata_mgr.load_metadata() # Load metadata at initialization (can be optimized to lazy load if needed) => export version may be auto-detected + if is_new_file: + self._metadata_mgr.set_export_version(export_version) + self._rels_mgr = _RelationshipManager(self._zip_accessor, self._metadata_mgr, self.stats, rels_update_mode) - def _process_energyml_object_metadata(self, zf: zipfile.ZipFile, override: Override) -> None: - """Process metadata for an EnergyML object without loading it.""" - # Delegate to metadata manager - self._metadata_mgr._process_energyml_object_metadata(zf, override) + # Initialize file cache manager for external array files (HDF5, Parquet, CSV, etc.) 
+ # self._file_cache = FileCacheManager(max_open_files=3) + self._handler_registry = get_handler_registry() - def _extract_object_info_fast( - self, zf: zipfile.ZipFile, file_path: str, content_type: str - ) -> Tuple[Optional[str], Optional[str], str]: - """Fast extraction of UUID and version from XML without full parsing.""" - # Delegate to metadata manager - return self._metadata_mgr._extract_object_info_fast(zf, file_path, content_type) + # Register atexit handler to ensure cleanup on program shutdown + self._atexit_registered = True + atexit.register(self._atexit_close) - def _extract_object_type_from_content_type(self, content_type: str) -> str: - """Extract object type from content type string.""" - # Delegate to metadata manager - return self._metadata_mgr._extract_object_type_from_content_type(content_type) + # ================================ + # Properties + # ================================ - def _is_core_properties(self, content_type: str) -> bool: - """Check if content type is CoreProperties.""" - # Delegate to metadata manager - return self._metadata_mgr._is_core_properties(content_type) + @property + def _metadata(self) -> Dict[str, EpcObjectMetadata]: + """Backward compatibility property for accessing metadata.""" + return self._metadata_mgr._metadata - def _process_core_properties_metadata(self, override: Override) -> None: - """Process core properties metadata.""" - # Delegate to metadata manager - self._metadata_mgr._process_core_properties_metadata(override) + @property + def export_version(self) -> EpcExportVersion: + """Get the detected or set export version.""" + return self._metadata_mgr._export_version - def _detect_epc_version(self) -> EpcExportVersion: - """Detect EPC packaging version based on file structure.""" - # Delegate to metadata manager - return self._metadata_mgr.detect_epc_version() + @property + def rels_update_mode(self) -> RelsUpdateMode: + """Get the relationship update mode.""" + return self._rels_mgr.rels_update_mode - 
def _gen_rels_path_from_metadata(self, metadata: EpcObjectMetadata) -> str: - """Generate rels path from object metadata without loading the object.""" - # Delegate to metadata manager - return self._metadata_mgr.gen_rels_path_from_metadata(metadata) + @rels_update_mode.setter + def rels_update_mode(self, mode: RelsUpdateMode) -> None: + """Set the relationship update mode.""" + if not isinstance(mode, RelsUpdateMode): + raise ValueError(f"Invalid rels_update_mode: {mode}. Must be an instance of RelsUpdateMode Enum.") + self._rels_mgr.rels_update_mode = mode - def _gen_rels_path_from_identifier(self, identifier: str) -> Optional[str]: - """Generate rels path from object identifier without loading the object.""" - # Delegate to metadata manager - return self._metadata_mgr.gen_rels_path_from_identifier(identifier) + # ================================ + # Public API Methods + # ================================ + + def add_object(self, obj: Any, replace_if_exists: bool = True) -> Optional[str]: + """Add an object to the EPC file. Returns the identifier of the added object.""" + # 1. Test if object already exists (by UUID) and handle according to replace_if_exists + # 2. Call put_object to write the object data and metadata to the EPC file + # 3. Update relationships if needed (depending on rels_update_mode) + # 4. 
Return the identifier of the added object + if not replace_if_exists: + obj_uri: Uri = get_obj_uri(obj=obj, dataspace=None) + if obj_uri is None: + logging.error("Failed to get URI for the object, cannot add to EPC") + return None + obj_identifier = obj_uri.as_identifier() + if self._metadata_mgr.get_metadata(obj_identifier) is not None: + logging.warning( + f"Object with identifier {obj_identifier} already exists and replace_if_exists is False, skipping add" + ) + raise ValueError( + f"Object with identifier {obj_identifier} already exists and replace_if_exists is False" + ) - @contextmanager - def _get_zip_file(self) -> Iterator[zipfile.ZipFile]: - """Context manager for ZIP file access with proper resource management. + return self.put_object(obj=obj) - If keep_open is True, uses the persistent connection. Otherwise opens a new one. - """ - # Delegate to the ZIP accessor helper class - with self._zip_accessor.get_zip_file() as zf: - yield zf + def clear_cache(self) -> None: + """Clear the object cache to free memory.""" + self._object_cache.clear() + self._access_order.clear() + self.stats.loaded_objects = 0 - def get_object_by_identifier(self, identifier: Union[str, Uri]) -> Optional[Any]: + def rebuild_all_rels(self, clean_first: bool = True) -> Dict[str, int]: """ - Get object by its identifier with smart caching. + Rebuild all .rels files from scratch by analyzing all objects and their references. + + This method: + 1. Optionally cleans existing .rels files first + 2. Loads each object temporarily + 3. Analyzes its Data Object References (DORs) + 4. 
Creates/updates .rels files with proper SOURCE and DESTINATION relationships Args: - identifier: Object identifier (uuid.version) + clean_first: If True, remove all existing .rels files before rebuilding Returns: - The requested object or None if not found + Dictionary with statistics: + - 'objects_processed': Number of objects analyzed + - 'rels_files_created': Number of .rels files created + - 'source_relationships': Number of SOURCE relationships created + - 'destination_relationships': Number of DESTINATION relationships created + - 'parallel_mode': True if parallel processing was used (optional key) + - 'execution_time': Execution time in seconds (optional key) """ - is_uri = isinstance(identifier, Uri) or parse_uri(identifier) is not None - if is_uri: - uri = parse_uri(identifier) if isinstance(identifier, str) else identifier - assert uri is not None and uri.uuid is not None - identifier = uri.uuid + "." + (uri.version or "") + if self.enable_parallel_rels: + return self._rebuild_all_rels_parallel(clean_first) + else: + return self._rebuild_all_rels_sequential(clean_first) - # Check cache first - if identifier in self._object_cache: - self._update_access_order(identifier) # type: ignore - self.stats.cache_hits += 1 - return self._object_cache[identifier] + def add_rels_for_object( + self, identifier: Union[str, Uri, Any], relationships: Union[Relationship, List[Relationship]] + ) -> None: + """ + Add additional relationships for a specific object. 
- self.stats.cache_misses += 1 + Args: + identifier: The identifier of the object, can be str, Uri, or the object itself + relationships: List of Relationship objects to add + """ + _id = self._id_from_uri_or_identifier(identifier=identifier, get_first_if_simple_uuid=True) - # Check if metadata exists - if identifier not in self._metadata: - return None + if _id is None: + logging.warning(f"Invalid identifier provided for adding relationships: {identifier}") + return - # Load object from file - obj = self._load_object(identifier) + if not isinstance(relationships, list): + relationships = [relationships] - if obj is not None: - # Add to cache with LRU management - self._add_to_cache(identifier, obj) - self.stats.loaded_objects += 1 + if _id not in self._rels_mgr.additional_rels: + self._rels_mgr.additional_rels[_id] = [] + self._rels_mgr.additional_rels[_id].extend(relationships) + self._rels_mgr._write_rels_updates( + current_object_id=_id, + current_rels_additions=relationships, + ) - return obj + def get_statistics(self) -> EpcStreamingStats: + """Get current statistics about the EPC streaming operations.""" + return self.stats - def _load_object(self, identifier: Union[str, Uri]) -> Optional[Any]: - """Load object from EPC file.""" - is_uri = isinstance(identifier, Uri) or parse_uri(identifier) is not None - if is_uri: - uri = parse_uri(identifier) if isinstance(identifier, str) else identifier - assert uri is not None and uri.uuid is not None - identifier = uri.uuid + "." + (uri.version or "") - assert isinstance(identifier, str) - metadata = self._metadata.get(identifier) - if not metadata: - return None + def get_h5_file_paths( + self, obj_or_id: Union[str, Uri, Any] = None, make_path_absolute_from_epc_path: bool = True + ) -> List[str]: + """ + Get all HDF5 file paths referenced in the EPC file (from rels to external resources). + Optimized to avoid loading the object when identifier/URI is provided. 
- try: - with self._get_zip_file() as zf: - obj_data = zf.read(metadata.file_path) - self.stats.bytes_read += len(obj_data) + :param obj: the object or its identifier/URI + :param make_path_absolute_from_epc_path: If True, return paths absolute from the EPC file path, otherwise return relative paths + :return: list of HDF5 file paths + """ + if self.force_h5_path is not None: + return [self.force_h5_path] + h5_paths = set() - obj_class = get_class_from_content_type(metadata.content_type) - obj = read_energyml_xml_bytes(obj_data, obj_class) + rels_path = None - if self.validate_on_load: - self._validate_object(obj, metadata) + _id = self._id_from_uri_or_identifier(identifier=obj_or_id, get_first_if_simple_uuid=True) + if _id is not None: + rels_path = self._metadata_mgr.gen_rels_path_from_identifier(_id) - return obj + # Check in-memory additional rels first + for rels in self._rels_mgr.additional_rels.get(_id, []): + if rels.type_value == str(EPCRelsRelationshipType.EXTERNAL_RESOURCE): + h5_paths.add(rels.target) - except Exception as e: - logging.error(f"Failed to load object {identifier}: {e}") - return None + # Also check rels from the EPC file + if rels_path is not None: + with self._zip_accessor.get_zip_file() as zf: + try: + rels_data = zf.read(rels_path) + self.stats.bytes_read += len(rels_data) + relationships = read_energyml_xml_bytes(rels_data, Relationships) + for rel in relationships.relationship: + if rel.type_value == str(EPCRelsRelationshipType.EXTERNAL_RESOURCE): + h5_paths.add(rel.target) + except KeyError: + pass - def _validate_object(self, obj: Any, metadata: EpcObjectMetadata) -> None: - """Validate loaded object against metadata.""" - try: - obj_uuid = get_obj_uuid(obj) - if obj_uuid != metadata.uuid: - logging.warning(f"UUID mismatch for {metadata.identifier}: expected {metadata.uuid}, got {obj_uuid}") - except Exception as e: - logging.debug(f"Validation failed for {metadata.identifier}: {e}") + if make_path_absolute_from_epc_path: + 
h5_paths = set(make_path_relative_to_filepath_list(list(h5_paths), self.epc_file_path)) - def _add_to_cache(self, identifier: Union[str, Uri], obj: Any) -> None: - """Add object to cache with LRU eviction.""" - is_uri = isinstance(identifier, Uri) or parse_uri(identifier) is not None - if is_uri: - uri = parse_uri(identifier) if isinstance(identifier, str) else identifier - assert uri is not None and uri.uuid is not None - identifier = uri.uuid + "." + (uri.version or "") + # if len(h5_paths) == 0: + # Collect all .h5 files in the EPC file's folder + epc_folder = get_file_folder(self.epc_file_path) + if epc_folder is not None and os.path.isdir(epc_folder): + for fname in os.listdir(epc_folder): + if fname.lower().endswith(".h5"): + h5_paths.add(os.path.join(epc_folder, fname)) - assert isinstance(identifier, str) + return list(h5_paths) - # Remove from access order if already present - if identifier in self._access_order: - self._access_order.remove(identifier) + # ________ ___ __________ __ _________________ ______ ____ _____ + # / ____/ / / | / ___/ ___/ / |/ / ____/_ __/ / / / __ \/ __ \/ ___/ + # / / / / / /| | \__ \\__ \ / /|_/ / __/ / / / /_/ / / / / / / /\__ \ + # / /___/ /___/ ___ |___/ /__/ / / / / / /___ / / / __ / /_/ / /_/ /___/ / + # \____/_____/_/ |_/____/____/ /_/ /_/_____/ /_/ /_/ /_/\____/_____//____/ - # Add to front (most recently used) - self._access_order.insert(0, identifier) + # ______ _______ __ ____ __ ____ + # / ____/___ ___ _________ ___ ______ ___ / / ___// /_____ _________ _____ ____ / _/___ / /____ _____/ __/___ _________ + # / __/ / __ \/ _ \/ ___/ __ `/ / / / __ `__ \/ /\__ \/ __/ __ \/ ___/ __ `/ __ `/ _ \ / // __ \/ __/ _ \/ ___/ /_/ __ `/ ___/ _ \ + # / /___/ / / / __/ / / /_/ / /_/ / / / / / / /___/ / /_/ /_/ / / / /_/ / /_/ / __// // / / / /_/ __/ / / __/ /_/ / /__/ __/ + # /_____/_/ /_/\___/_/ \__, /\__, /_/ /_/ /_/_//____/\__/\____/_/ \__,_/\__, /\___/___/_/ /_/\__/\___/_/ /_/ \__,_/\___/\___/ + # /____//____/ /____/ - # Add 
to cache - self._object_cache[identifier] = obj + def get_object(self, identifier: Union[str, Uri]) -> Optional[Any]: + """ + Retrieve an EnergyML object from the EPC file by its identifier or UUID. - # Evict if cache is full - while len(self._access_order) > self.cache_size: - oldest = self._access_order.pop() - self._object_cache.pop(oldest, None) + This method implements lazy loading with caching for memory efficiency. + If a simple UUID is provided and multiple versions exist, returns the first one. - def _update_access_order(self, identifier: str) -> None: - """Update access order for LRU cache.""" - if identifier in self._access_order: - self._access_order.remove(identifier) - self._access_order.insert(0, identifier) + Args: + identifier: Object identifier (full identifier string or URI) or simple UUID. + Can be: + - A full identifier string (e.g., "eml:///resqml20.obj_TriangulatedSetRepresentation(uuid=abc-123, version='1.0')") + - A Uri object + - A simple UUID string (e.g., "abc-123") - def get_object_by_uuid(self, uuid: str) -> List[Any]: - """Get all objects with the specified UUID.""" - if uuid not in self._uuid_index: - return [] + Returns: + The deserialized EnergyML object, or None if not found or an error occurred. 
- objects = [] - for identifier in self._uuid_index[uuid]: - obj = self.get_object_by_identifier(identifier) - if obj is not None: - objects.append(obj) + Behavior: + - Checks the in-memory cache first (fast path) + - If not cached, loads from ZIP file and deserializes XML + - Updates cache and LRU access order + - Updates statistics (cache hits/misses, bytes read) - return objects + Notes: + - For simple UUID lookups with multiple versions, returns the first match + - Use get_object_by_uuid() to retrieve all versions of an object + """ + _id = self._id_from_uri_or_identifier(identifier=identifier, get_first_if_simple_uuid=True) + if _id is None: + logging.warning(f"Invalid identifier provided: {identifier}") + return None + metadata = self._metadata_mgr.get_metadata(_id) - def get_object(self, identifier: Union[str, Uri]) -> Optional[Any]: - return self.get_object_by_identifier(identifier) + if metadata is None: + logging.warning(f"Object with identifier {_id} not found in metadata") + return None - def get_objects_by_type(self, object_type: str) -> List[Any]: - """Get all objects of the specified type.""" - if object_type not in self._type_index: - return [] + # Check cache first + if _id in self._object_cache: + self._update_access_order(_id) # type: ignore + self.stats.cache_hits += 1 + return self._object_cache[_id] - objects = [] - for identifier in self._type_index[object_type]: - obj = self.get_object_by_identifier(identifier) - if obj is not None: - objects.append(obj) + self.stats.cache_misses += 1 - return objects + file_path = metadata.file_path(export_version=self._metadata_mgr._export_version) - def list_object_metadata(self, object_type: Optional[str] = None) -> List[EpcObjectMetadata]: - """ - List metadata for objects without loading them. 
+ try: + with self._zip_accessor.get_zip_file() as zf: + obj_data = zf.read(file_path) + self.stats.bytes_read += len(obj_data) - Args: - object_type: Optional filter by object type - - Returns: - List of object metadata - """ - if object_type is None: - return list(self._metadata.values()) - - return [self._metadata[identifier] for identifier in self._type_index.get(object_type, [])] - - def get_statistics(self) -> EpcStreamingStats: - """Get current streaming statistics.""" - return self.stats - - def list_objects( - self, dataspace: Optional[str] = None, object_type: Optional[str] = None - ) -> List[ResourceMetadata]: - """ - List all objects with metadata (EnergymlStorageInterface method). - - Args: - dataspace: Optional dataspace filter (ignored for EPC files) - object_type: Optional type filter (qualified type) - - Returns: - List of ResourceMetadata for all matching objects - """ - - results = [] - metadata_list = self.list_object_metadata(object_type) - - for meta in metadata_list: - try: - # Load object to get title - title = "" - if self.force_title_load and meta.identifier: - obj = self.get_object_by_identifier(meta.identifier) - if obj and hasattr(obj, "citation") and obj.citation: - if hasattr(obj.citation, "title"): - title = obj.citation.title - - # Build URI - qualified_type = content_type_to_qualified_type(meta.content_type) - if meta.version: - uri = f"eml:///{qualified_type}(uuid={meta.uuid},version='{meta.version}')" - else: - uri = f"eml:///{qualified_type}({meta.uuid})" - - resource = ResourceMetadata( - uri=uri, - uuid=meta.uuid, - version=meta.version, - title=title, - object_type=meta.object_type, - content_type=meta.content_type, - ) - - results.append(resource) - except Exception: - continue - - return results - - def get_array_metadata( - self, proxy: Union[str, Uri, Any], path_in_external: Optional[str] = None - ) -> Union[DataArrayMetadata, List[DataArrayMetadata], None]: - """ - Get metadata for data array(s) (EnergymlStorageInterface 
method). - - Args: - proxy: The object identifier/URI or the object itself - path_in_external: Optional specific path - - Returns: - DataArrayMetadata if path specified, List[DataArrayMetadata] if no path, - or None if not found - """ - from energyml.utils.storage_interface import DataArrayMetadata - - try: - if path_in_external: - array = self.read_array(proxy, path_in_external) - if array is not None: - return DataArrayMetadata( - path_in_resource=path_in_external, - array_type=str(array.dtype), - dimensions=list(array.shape), - ) - else: - # Would need to scan all possible paths - not practical - return [] - except Exception: - pass + obj_class = get_class_from_content_type(metadata.content_type) + obj = read_energyml_xml_bytes(obj_data, obj_class) + # add to cache + self._object_cache[_id] = obj + self._update_access_order(_id) # type: ignore + return obj + except Exception as e: + logging.error(f"Failed to load object {identifier}: {e}") return None - def preload_objects(self, identifiers: List[str]) -> int: - """ - Preload specific objects into cache. - - Args: - identifiers: List of object identifiers to preload - - Returns: - Number of objects successfully loaded - """ - loaded_count = 0 - for identifier in identifiers: - if self.get_object_by_identifier(identifier) is not None: - loaded_count += 1 - return loaded_count - - def clear_cache(self) -> None: - """Clear the object cache to free memory.""" - self._object_cache.clear() - self._access_order.clear() - self.stats.loaded_objects = 0 - - def get_core_properties(self) -> Optional[CoreProperties]: - """Get core properties (loaded lazily).""" - # Delegate to metadata manager - return self._metadata_mgr.get_core_properties() - - def _gen_rels_path_from_metadata(self, metadata: EpcObjectMetadata) -> str: - """ - Generate rels path from object metadata without loading the object. 
- - Args: - metadata: Object metadata containing file path information - - Returns: - Path to the rels file for this object - """ - obj_path = metadata.file_path - # Extract folder and filename from the object path - if "/" in obj_path: - obj_folder = obj_path[: obj_path.rindex("/") + 1] - obj_file_name = obj_path[obj_path.rindex("/") + 1 :] - else: - obj_folder = "" - obj_file_name = obj_path - - return f"{obj_folder}_rels/{obj_file_name}.rels" - - def _gen_rels_path_from_identifier(self, identifier: str) -> Optional[str]: - """ - Generate rels path from object identifier without loading the object. - - Args: - identifier: Object identifier (uuid.version) - - Returns: - Path to the rels file, or None if metadata not found + def get_object_by_uuid(self, uuid: str) -> List[Any]: """ - metadata = self._metadata.get(identifier) - if metadata is None: - return None - return self._gen_rels_path_from_metadata(metadata) - - def _update_rels_for_new_object(self, obj: Any, obj_identifier: str) -> None: - """Update relationships when a new object is added (UPDATE_AT_MODIFICATION mode).""" - # Delegate to relationship manager - self._rels_mgr.update_rels_for_new_object(obj, obj_identifier) + Retrieve all EnergyML objects with the given UUID from the EPC file. 
- def _update_rels_for_modified_object(self, obj: Any, obj_identifier: str, old_dors: List[Any]) -> None: - """Update relationships when an object is modified (UPDATE_AT_MODIFICATION mode).""" - # Delegate to relationship manager - self._rels_mgr.update_rels_for_modified_object(obj, obj_identifier, old_dors) - - def _update_rels_for_removed_object(self, obj_identifier: str, obj: Optional[Any] = None) -> None: - """Update relationships when an object is removed (UPDATE_AT_MODIFICATION mode).""" - # Delegate to relationship manager - self._rels_mgr.update_rels_for_removed_object(obj_identifier, obj) - - def _write_rels_updates( - self, - source_identifier: str, - source_relationships: List[Relationship], - dest_updates: Dict[str, Relationship], - removals: Optional[Dict[str, str]] = None, - delete_source_rels: bool = False, - ) -> None: - """Write relationship updates to the EPC file efficiently.""" - # Delegate to relationship manager - self._rels_mgr.write_rels_updates( - source_identifier, source_relationships, dest_updates, removals, delete_source_rels - ) - - def _reopen_persistent_zip(self) -> None: - """Reopen persistent ZIP file after modifications to reflect changes.""" - # Delegate to ZIP accessor - self._zip_accessor.reopen_persistent_zip() - - def to_epc(self, load_all: bool = False) -> Epc: - """ - Convert to standard Epc instance. + This method returns all versions/instances of objects sharing the same UUID. + In well-formed EPC files, typically only one object per UUID exists, but this + method handles cases where multiple versions are present. 
Args: - load_all: Whether to load all objects into memory + uuid: The UUID string to search for (e.g., "abc-123-def-456") Returns: - Standard Epc instance - """ - epc = Epc() - epc.epc_file_path = str(self.epc_file_path) - core_props = self.get_core_properties() - if core_props is not None: - epc.core_props = core_props - - if load_all: - # Load all objects - for identifier in self._metadata: - obj = self.get_object_by_identifier(identifier) - if obj is not None: - epc.energyml_objects.append(obj) - - return epc - - def set_rels_update_mode(self, mode: RelsUpdateMode) -> None: - """ - Change the relationship update mode. - - Args: - mode: The new RelsUpdateMode to use - - Note: - Changing from MANUAL or UPDATE_ON_CLOSE to UPDATE_AT_MODIFICATION - may require calling rebuild_all_rels() first to ensure consistency. - """ - - def set_rels_update_mode(self, mode: RelsUpdateMode) -> None: - """ - Change the relationship update mode. - - Args: - mode: The new RelsUpdateMode to use - - Note: - Changing from MANUAL or UPDATE_ON_CLOSE to UPDATE_AT_MODIFICATION - may require calling rebuild_all_rels() first to ensure consistency. - """ - if not isinstance(mode, RelsUpdateMode): - raise ValueError(f"mode must be a RelsUpdateMode enum value, got {type(mode)}") - - old_mode = self.rels_update_mode - self.rels_update_mode = mode - # Also update the relationship manager - self._rels_mgr.rels_update_mode = mode - - logging.info(f"Changed relationship update mode from {old_mode.value} to {mode.value}") - - def get_rels_update_mode(self) -> RelsUpdateMode: - """ - Get the current relationship update mode. - - Returns: - The current RelsUpdateMode - """ - return self.rels_update_mode - - def get_obj_rels(self, obj: Union[str, Uri, Any]) -> List[Relationship]: - """ - Get all relationships for a given object. - Merges relationships from the EPC file with in-memory additional relationships. - - Optimized to avoid loading the object when identifier/URI is provided. 
- - :param obj: the object or its identifier/URI - :return: list of Relationship objects - """ - # Get identifier without loading the object - obj_identifier = None - rels_path = None - - if isinstance(obj, (str, Uri)): - # Convert URI to identifier if needed - if isinstance(obj, Uri) or parse_uri(obj) is not None: - uri = parse_uri(obj) if isinstance(obj, str) else obj - assert uri is not None and uri.uuid is not None - obj_identifier = uri.uuid + "." + (uri.version or "") - else: - obj_identifier = obj - - # Generate rels path from metadata without loading the object - rels_path = self._gen_rels_path_from_identifier(obj_identifier) - else: - # We have the actual object - obj_identifier = get_obj_identifier(obj) - rels_path = gen_rels_path(obj, self.export_version) - - # Delegate to relationship manager - return self._rels_mgr.get_obj_rels(obj_identifier, rels_path) - - def get_h5_file_paths(self, obj: Union[str, Uri, Any]) -> List[str]: - """ - Get all HDF5 file paths referenced in the EPC file (from rels to external resources). - Optimized to avoid loading the object when identifier/URI is provided. - - :param obj: the object or its identifier/URI - :return: list of HDF5 file paths - """ - if self.force_h5_path is not None: - return [self.force_h5_path] - h5_paths = set() - - obj_identifier = None - rels_path = None - - # Get identifier and rels path without loading the object - if isinstance(obj, (str, Uri)): - # Convert URI to identifier if needed - if isinstance(obj, Uri) or parse_uri(obj) is not None: - uri = parse_uri(obj) if isinstance(obj, str) else obj - assert uri is not None and uri.uuid is not None - obj_identifier = uri.uuid + "." 
+ (uri.version or "") - else: - obj_identifier = obj - - # Generate rels path from metadata without loading the object - rels_path = self._gen_rels_path_from_identifier(obj_identifier) - else: - # We have the actual object - obj_identifier = get_obj_identifier(obj) - rels_path = gen_rels_path(obj, self.export_version) - - # Check in-memory additional rels first - for rels in self.additional_rels.get(obj_identifier, []): - if rels.type_value == EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(): - h5_paths.add(rels.target) + List of deserialized EnergyML objects with the given UUID. + Returns empty list if: + - UUID is invalid or None + - No objects with this UUID exist + - All objects failed to load + + Behavior: + - Validates UUID format and type + - Retrieves all identifiers for the UUID from metadata manager + - First collects all cached objects (fast path) + - Then opens ZIP file once to load all non-cached objects in batch (efficient) + - Maintains cache consistency across all loaded objects + - Updates statistics for each object loaded + + Notes: + - Objects are loaded lazily with caching for efficiency + - Cache is updated for each successfully loaded object + - Failed loads are logged but don't prevent other objects from loading + - ZIP file is opened only once for all non-cached objects (performance optimization) + """ + # Type guard: ensure uuid is a string + if not isinstance(uuid, str): + logging.warning(f"get_object_by_uuid called with non-string uuid: {type(uuid)}") + return [] - # Also check rels from the EPC file - if rels_path is not None: - with self._get_zip_file() as zf: - try: - rels_data = zf.read(rels_path) - self.stats.bytes_read += len(rels_data) - relationships = read_energyml_xml_bytes(rels_data, Relationships) - for rel in relationships.relationship: - if rel.type_value == EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(): - h5_paths.add(rel.target) - except KeyError: - pass + # Type guard: ensure uuid is not empty + if not uuid or 
not uuid.strip(): + logging.warning("get_object_by_uuid called with empty UUID") + return [] - if len(h5_paths) == 0: - # search if an h5 file has the same name than the epc file - epc_folder = os.path.dirname(self.epc_file_path) - if epc_folder is not None and self.epc_file_path is not None: - epc_file_name = os.path.basename(self.epc_file_path) - epc_file_base, _ = os.path.splitext(epc_file_name) - possible_h5_path = os.path.join(epc_folder, epc_file_base + ".h5") - if os.path.exists(possible_h5_path): - h5_paths.add(possible_h5_path) - return list(h5_paths) + # Type guard: validate UUID format + if OptimizedRegex.UUID.fullmatch(uuid) is None: + logging.warning(f"get_object_by_uuid called with invalid UUID format: {uuid}") + return [] - def read_array(self, proxy: Union[str, Uri, Any], path_in_external: str) -> Optional[np.ndarray]: - """ - Read a dataset from the HDF5 file linked to the proxy object. - :param proxy: the object or its identifier - :param path_in_external: the path in the external HDF5 file - :return: the dataset as a numpy array - """ - # Resolve proxy to object + # Get all identifiers for this UUID + identifiers = self._metadata_mgr.get_uuid_identifiers(uuid) - h5_path = [] - if self.force_h5_path is not None: - if self.cache_opened_h5 is None: - self.cache_opened_h5 = h5py.File(self.force_h5_path, "a") - h5_path = [self.cache_opened_h5] - else: - if isinstance(proxy, (str, Uri)): - obj = self.get_object_by_identifier(proxy) - else: - obj = proxy + # Guard: check if identifiers list is valid + if identifiers is None or not isinstance(identifiers, list): + logging.debug(f"No identifiers found for UUID: {uuid}") + return [] - h5_path = self.get_h5_file_paths(obj) + if len(identifiers) == 0: + # logging.debug(f"No objects found with UUID: {uuid}") + return [] - h5_reader = HDF5FileReader() + # Phase 1: Collect cached objects and prepare list of non-cached identifiers + objects = [] + non_cached_metadata = [] # List of (identifier, metadata) tuples 
to load from ZIP - if h5_path is None or len(h5_path) == 0: - raise ValueError("No HDF5 file paths found for the given proxy object.") - else: - for h5p in h5_path: - # TODO: handle different type of files - try: - return h5_reader.read_array(source=h5p, path_in_external_file=path_in_external) - except Exception: - pass - # logging.error(f"Failed to read HDF5 dataset from {h5p}: {e}") + for identifier in identifiers: + # Type guard: ensure identifier is valid + if not identifier or not isinstance(identifier, str): + logging.warning(f"Skipping invalid identifier in UUID lookup: {identifier}") + continue - def write_array(self, proxy: Union[str, Uri, Any], path_in_external: str, array: np.ndarray) -> bool: - """ - Write a dataset to the HDF5 file linked to the proxy object. - :param proxy: the object or its identifier - :param path_in_external: the path in the external HDF5 file - :param array: the numpy array to write + # Get metadata first to validate object exists + metadata = self._metadata_mgr.get_metadata(identifier) + if metadata is None: + logging.warning(f"Metadata not found for identifier {identifier}, skipping") + continue - return: True if successful - """ - h5_path = [] - if self.force_h5_path is not None: - if self.cache_opened_h5 is None: - self.cache_opened_h5 = h5py.File(self.force_h5_path, "a") - h5_path = [self.cache_opened_h5] - else: - if isinstance(proxy, (str, Uri)): - obj = self.get_object_by_identifier(proxy) + # Check cache first for consistency + if identifier in self._object_cache: + obj = self._object_cache[identifier] + if obj is not None: # Guard: ensure cached object is valid + self._update_access_order(identifier) + self.stats.cache_hits += 1 + objects.append(obj) + else: + # Remove invalid cached entry and mark for re-loading + logging.warning(f"Removing invalid cached object for {identifier}") + del self._object_cache[identifier] + non_cached_metadata.append((identifier, metadata)) + self.stats.cache_misses += 1 else: - obj = proxy 
- - h5_path = self.get_h5_file_paths(obj) - - h5_writer = HDF5FileWriter() - - if h5_path is None or len(h5_path) == 0: - raise ValueError("No HDF5 file paths found for the given proxy object.") - else: - for h5p in h5_path: - try: - h5_writer.write_array(target=h5p, path_in_external_file=path_in_external, array=array) - return True - except Exception as e: - logging.error(f"Failed to write HDF5 dataset to {h5p}: {e}") - return False - - def validate_all_objects(self, fast_mode: bool = True) -> Dict[str, List[str]]: - """ - Validate all objects in the EPC file. - - Args: - fast_mode: If True, only validate metadata without loading full objects - - Returns: - Dictionary with 'errors' and 'warnings' keys containing lists of issues - """ - results = {"errors": [], "warnings": []} + # Not in cache, need to load from ZIP + non_cached_metadata.append((identifier, metadata)) + self.stats.cache_misses += 1 - for identifier, metadata in self._metadata.items(): + # Phase 2: Load all non-cached objects in a single ZIP file access + if non_cached_metadata: try: - if fast_mode: - # Quick validation - just check file exists and is readable - with self._get_zip_file() as zf: - try: - zf.getinfo(metadata.file_path) - except KeyError: - results["errors"].append(f"Missing file for object {identifier}: {metadata.file_path}") - else: - # Full validation - load and validate object - obj = self.get_object_by_identifier(identifier) - if obj is None: - results["errors"].append(f"Failed to load object {identifier}") - else: - self._validate_object(obj, metadata) - - except Exception as e: - results["errors"].append(f"Validation error for {identifier}: {e}") - - return results + with self._zip_accessor.get_zip_file() as zf: + for identifier, metadata in non_cached_metadata: + file_path = metadata.file_path(export_version=self._metadata_mgr._export_version) - def get_object_dependencies(self, identifier: Union[str, Uri]) -> List[str]: - """ - Get list of object identifiers that this object 
depends on. - - This would need to be implemented based on DOR analysis. - """ - # Placeholder for dependency analysis - # Would need to parse DORs in the object - return [] - - def __len__(self) -> int: - """Return total number of objects in EPC.""" - return len(self._metadata) + try: + obj_data = zf.read(file_path) + self.stats.bytes_read += len(obj_data) - def __contains__(self, identifier: str) -> bool: - """Check if object with identifier exists.""" - return identifier in self._metadata + obj_class = get_class_from_content_type(metadata.content_type) + obj = read_energyml_xml_bytes(obj_data, obj_class) - def __iter__(self) -> Iterator[str]: - """Iterate over object identifiers.""" - return iter(self._metadata.keys()) + # Guard: validate deserialized object + if obj is None: + logging.warning(f"Deserialization returned None for {identifier}") + continue - def __enter__(self): - """Context manager entry.""" - return self + # Add to cache with consistency check + self._object_cache[identifier] = obj + self._update_access_order(identifier) + objects.append(obj) - def __exit__(self, exc_type, exc_val, exc_tb): - """Context manager exit with cleanup.""" - self.clear_cache() - self.close() - if self.cache_opened_h5 is not None: - try: - self.cache_opened_h5.close() - except Exception: - pass - self.cache_opened_h5 = None - - def __del__(self): - """Destructor to ensure persistent ZIP file is closed.""" - try: - self.close() - if self.cache_opened_h5 is not None: - try: - self.cache_opened_h5.close() - except Exception: - pass - self.cache_opened_h5 = None - except Exception: - pass # Ignore errors during cleanup + except KeyError: + logging.error(f"File not found in ZIP for identifier {identifier}: {file_path}") + except Exception as e: + logging.error(f"Failed to deserialize object {identifier}: {e}") - def close(self) -> None: - """Close the persistent ZIP file if it's open, recomputing rels first if mode is UPDATE_ON_CLOSE.""" - # Recompute all relationships 
before closing if in UPDATE_ON_CLOSE mode - if self.rels_update_mode == RelsUpdateMode.UPDATE_ON_CLOSE: - try: - self.rebuild_all_rels(clean_first=True) - logging.info("Rebuilt all relationships on close (UPDATE_ON_CLOSE mode)") except Exception as e: - logging.warning(f"Error rebuilding rels on close: {e}") + logging.error(f"Failed to open ZIP file for batch loading: {e}") - # Delegate to ZIP accessor - self._zip_accessor.close() + return objects def put_object(self, obj: Any, dataspace: Optional[str] = None) -> Optional[str]: - """ - Store an energyml object (EnergymlStorageInterface method). - - Args: - obj: The energyml object to store - dataspace: Optional dataspace name (ignored for EPC files) - - Returns: - The identifier of the stored object (UUID.version or UUID), or None on error - """ - try: - return self.add_object(obj, replace_if_exists=True) - except Exception: - return None - - def add_object(self, obj: Any, file_path: Optional[str] = None, replace_if_exists: bool = True) -> str: - """ - Add a new object to the EPC file and update caches. - - Args: - obj: The EnergyML object to add - file_path: Optional custom file path, auto-generated if not provided - replace_if_exists: If True, replace the object if it already exists. If False, raise ValueError. - - Returns: - The identifier of the added object - - Raises: - ValueError: If object is invalid or already exists (when replace_if_exists=False) - RuntimeError: If file operations fail - """ - identifier = None - metadata = None - - try: - # Extract object information - identifier = get_obj_identifier(obj) - uuid = identifier.split(".")[0] if identifier else None - - if not uuid: - raise ValueError("Object must have a valid UUID") - - version = identifier[len(uuid) + 1 :] if identifier and "." 
in identifier else None - # Ensure version is treated as a string, not an integer - if version is not None and not isinstance(version, str): - version = str(version) - - object_type = get_object_type_for_file_path_from_class(obj) - - if identifier in self._metadata: - if replace_if_exists: - # Remove the existing object first - logging.info(f"Replacing existing object {identifier}") - self.remove_object(identifier) - else: - raise ValueError( - f"Object with identifier {identifier} already exists. Use update_object() or set replace_if_exists=True." - ) - - # Generate file path if not provided - file_path = gen_energyml_object_path(obj, self.export_version) - - print(f"Generated file path: {file_path} for export version: {self.export_version}") - - # Determine content type based on object type - content_type = get_obj_content_type(obj) - - # Create metadata - metadata = EpcObjectMetadata( - uuid=uuid, - object_type=object_type, - content_type=content_type, - file_path=file_path, - version=version, - identifier=identifier, - ) - - # Update internal structures - self._metadata[identifier] = metadata - - # Update UUID index - if uuid not in self._uuid_index: - self._uuid_index[uuid] = [] - self._uuid_index[uuid].append(identifier) - - # Update type index - if object_type not in self._type_index: - self._type_index[object_type] = [] - self._type_index[object_type].append(identifier) - - # Add to cache - self._add_to_cache(identifier, obj) - - # Save changes to file - self._add_object_to_file(obj, metadata) - - # Update relationships if in UPDATE_AT_MODIFICATION mode - if self.rels_update_mode == RelsUpdateMode.UPDATE_AT_MODIFICATION: - self._update_rels_for_new_object(obj, identifier) - - # Update stats - self.stats.total_objects += 1 - - logging.info(f"Added object {identifier} to EPC file") - return identifier - - except Exception as e: - logging.error(f"Failed to add object: {e}") - # Rollback changes if we created metadata - if identifier and metadata: - 
self._rollback_add_object(identifier) - raise RuntimeError(f"Failed to add object to EPC: {e}") - - def delete_object(self, identifier: Union[str, Uri]) -> bool: - """ - Delete an object by its identifier (EnergymlStorageInterface method). - - Args: - identifier: Object identifier (UUID or UUID.version) or ETP URI - - Returns: - True if successfully deleted, False otherwise - """ - return self.remove_object(identifier) - - def remove_object(self, identifier: Union[str, Uri]) -> bool: - """ - Remove an object (or all versions of an object) from the EPC file and update caches. - - Args: - identifier: The identifier of the object to remove. Can be either: - - Full identifier (uuid.version) to remove a specific version - - UUID only to remove ALL versions of that object - - Returns: - True if object(s) were successfully removed, False if not found - - Raises: - RuntimeError: If file operations fail - """ - try: - is_uri = isinstance(identifier, Uri) or parse_uri(identifier) is not None - if is_uri: - uri = parse_uri(identifier) if isinstance(identifier, str) else identifier - assert uri is not None and uri.uuid is not None - identifier = uri.uuid + "." 
+ (uri.version or "") - assert isinstance(identifier, str) - - if identifier not in self._metadata: - # Check if identifier is a UUID only (should remove all versions) - if identifier in self._uuid_index: - # Remove all versions for this UUID - identifiers_to_remove = self._uuid_index[identifier].copy() - removed_count = 0 - - for id_to_remove in identifiers_to_remove: - if self._remove_single_object(id_to_remove): - removed_count += 1 - - return removed_count > 0 - else: - return False - - # Single identifier removal - return self._remove_single_object(identifier) - - except Exception as e: - logging.error(f"Failed to remove object {identifier}: {e}") - raise RuntimeError(f"Failed to remove object from EPC: {e}") - - def _remove_single_object(self, identifier: str) -> bool: - """ - Remove a single object by its full identifier. - - Args: - identifier: The full identifier (uuid.version) of the object to remove - Returns: - True if the object was successfully removed, False otherwise - """ - try: - if identifier not in self._metadata: - return False - - metadata = self._metadata[identifier] - - # If in UPDATE_AT_MODIFICATION mode, update rels before removing - obj = None - if self.rels_update_mode == RelsUpdateMode.UPDATE_AT_MODIFICATION: - obj = self.get_object_by_identifier(identifier) - if obj: - self._update_rels_for_removed_object(identifier, obj) - - # IMPORTANT: Remove from file FIRST (before clearing cache/metadata) - # because _remove_object_from_file needs to load the object to access its DORs - self._remove_object_from_file(metadata) - - # Now remove from cache - if identifier in self._object_cache: - del self._object_cache[identifier] - - if identifier in self._access_order: - self._access_order.remove(identifier) - - # Remove from indexes - uuid = metadata.uuid - object_type = metadata.object_type - - if uuid in self._uuid_index: - if identifier in self._uuid_index[uuid]: - self._uuid_index[uuid].remove(identifier) - if not self._uuid_index[uuid]: - del 
self._uuid_index[uuid] - - if object_type in self._type_index: - if identifier in self._type_index[object_type]: - self._type_index[object_type].remove(identifier) - if not self._type_index[object_type]: - del self._type_index[object_type] - - # Remove from metadata (do this last) - del self._metadata[identifier] - - # Update stats - self.stats.total_objects -= 1 - if self.stats.loaded_objects > 0: - self.stats.loaded_objects -= 1 - - logging.info(f"Removed object {identifier} from EPC file") - return True - - except Exception as e: - logging.error(f"Failed to remove single object {identifier}: {e}") - return False + # 1. Generate identifier and metadata for the object + # 2. Write object data and metadata to the EPC file in a temporary file and then replace original to minimize I/O + # 3. Update relationships if needed (depending on rels_update_mode) + # 4. Return the identifier of the added/updated object - def update_object(self, obj: Any) -> str: - """ - Update an existing object in the EPC file. 
+ uri = get_obj_uri(obj=obj, dataspace=None) + if uri is None: + raise ValueError("Failed to generate URI for the object, cannot put into EPC") - Args: - obj: The EnergyML object to update - Returns: - The identifier of the updated object - """ - identifier = get_obj_identifier(obj) - if not identifier or identifier not in self._metadata: - raise ValueError("Object must have a valid identifier and exist in the EPC file") + identifier = uri.as_identifier() + existing_metadata = self._metadata_mgr.get_metadata(identifier) + file_path = gen_energyml_object_path(obj, self._metadata_mgr._export_version) + # is_update = existing_metadata is not None + # Write object data and metadata to EPC try: - # If in UPDATE_AT_MODIFICATION mode, get old DORs and handle update differently - if self.rels_update_mode == RelsUpdateMode.UPDATE_AT_MODIFICATION: - old_obj = self.get_object_by_identifier(identifier) - old_dors = get_direct_dor_list(old_obj) if old_obj else [] + file_allready_exists = False - # Preserve non-SOURCE/DESTINATION relationships (like EXTERNAL_RESOURCE) before removal - preserved_rels = [] - try: - obj_rels = self.get_obj_rels(identifier) - preserved_rels = [ - r - for r in obj_rels - if r.type_value - not in ( - EPCRelsRelationshipType.SOURCE_OBJECT.get_type(), - EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(), + with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file: + temp_path = temp_file.name + with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as zf: + epc_content_type = None + # Copy all existing files except the one being updated (if update) and its .rels file + with self._zip_accessor.get_zip_file() as source_zf: + for item in source_zf.infolist(): + # logging.debug( + # f"Test {get_epc_content_type_path() in item.filename} with {item.filename} and {get_epc_content_type_path()} " + # ) + if get_epc_content_type_path() in item.filename: + epc_content_type = source_zf.read(item.filename) + elif item.filename != file_path: + 
data = source_zf.read(item.filename) + zf.writestr(item, data) + else: + file_allready_exists = True + + # Write new/updated object data + obj_xml_bytes = serialize_xml(obj) + zf.writestr(file_path, obj_xml_bytes) + + if not file_allready_exists: + ct_object = None + if epc_content_type is not None: + # logging.debug("Existing content type found, adding new object to it") + # add the new object to the existing content type and write it + ct_object = read_energyml_xml_bytes(epc_content_type, Types) + # logging.debug("Existing content type before adding object: " + str(ct_object)) + ct_object.override.append( + Override(part_name=file_path, content_type=get_content_type_from_class(obj)) ) - ] - except Exception: - pass - - # Remove existing object (without rels update since we're replacing it) - # Temporarily switch to MANUAL mode to avoid double updates - original_mode = self.rels_update_mode - self.rels_update_mode = RelsUpdateMode.MANUAL - self.remove_object(identifier) - self.rels_update_mode = original_mode - - # Add updated object (without rels update since we'll do custom update) - self.rels_update_mode = RelsUpdateMode.MANUAL - new_identifier = self.add_object(obj) - self.rels_update_mode = original_mode - - # Now do the specialized update that handles both adds and removes - self._update_rels_for_modified_object(obj, new_identifier, old_dors) - - # Restore preserved relationships (like EXTERNAL_RESOURCE) - if preserved_rels: - # These need to be written directly to the rels file - # since _update_rels_for_modified_object already wrote it - rels_path = self._gen_rels_path_from_identifier(new_identifier) - if rels_path: - with self._get_zip_file() as zf: - # Read current rels - current_rels = [] - try: - if rels_path in zf.namelist(): - rels_data = zf.read(rels_path) - rels_obj = read_energyml_xml_bytes(rels_data, Relationships) - if rels_obj and rels_obj.relationship: - current_rels = list(rels_obj.relationship) - except Exception: - pass - - # Add preserved 
rels - all_rels = current_rels + preserved_rels - - # Write back - with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file: - temp_path = temp_file.name - - try: - with self._get_zip_file() as source_zf: - with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as target_zf: - # Copy all files except the rels file we're updating - for item in source_zf.infolist(): - if item.filename != rels_path: - buffer = source_zf.read(item.filename) - target_zf.writestr(item, buffer) - - # Write updated rels file - target_zf.writestr( - rels_path, serialize_xml(Relationships(relationship=all_rels)) - ) - - # Replace original - shutil.move(temp_path, self.epc_file_path) - self._reopen_persistent_zip() - - except Exception: - if os.path.exists(temp_path): - os.unlink(temp_path) - raise - + if ct_object is None: + # logging.debug("No existing content type found, generating new one from metadata manager") + ct_object = self._metadata_mgr.get_content_type(zf) + # logging.debug("New content type after adding object: " + str(ct_object)) + zf.writestr(get_epc_content_type_path(), serialize_xml(ct_object)) + # logging.debug("Written content type to EPC with new object : " + serialize_xml(ct_object)) + elif epc_content_type is not None: + zf.writestr(get_epc_content_type_path(), epc_content_type) + # Replace original + shutil.move(temp_path, self.epc_file_path) + self._zip_accessor.reopen_persistent_zip() + except Exception as e: + raise IOError(f"Failed to write object to EPC: {e}") + + # adding the metadata to the metadata manager (after writing the file to ensure we have the correct export version for path generation) + last_update = get_object_attribute_advanced(obj, "citation.lastUpdate") + if last_update is None and isinstance(last_update, str): + last_update = date_to_datetime(last_update) + self._metadata_mgr.add_metadata(EpcObjectMetadata(uri=uri, title=get_obj_title(obj), last_changed=last_update)) + + # update relationships if needed + if 
self.rels_update_mode == RelsUpdateMode.UPDATE_AT_MODIFICATION: + if file_allready_exists: + self._rels_mgr.update_rels_for_modified_object(obj, identifier) else: - # For other modes (UPDATE_ON_CLOSE, MANUAL), preserve non-SOURCE/DESTINATION relationships - preserved_rels = [] - try: - obj_rels = self.get_obj_rels(identifier) - preserved_rels = [ - r - for r in obj_rels - if r.type_value - not in ( - EPCRelsRelationshipType.SOURCE_OBJECT.get_type(), - EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(), - ) - ] - except Exception: - pass - - # Simple remove + add - self.remove_object(identifier) - new_identifier = self.add_object(obj) - - # Restore preserved relationships if any - if preserved_rels: - self.add_rels_for_object(new_identifier, preserved_rels, write_immediately=True) - - logging.info(f"Updated object {identifier} to {new_identifier} in EPC file") - return new_identifier + self._rels_mgr.update_rels_for_new_object(obj, identifier) + + return identifier + + def delete_object(self, identifier: Union[str, Uri, Any]) -> bool: + # 1. Validate identifier and check if object exists + # 2. Update rels by removing from current object rels the "Destination" relationships to the deleted object and from other objects rels the "Source" relationships to the deleted object (depending on rels_update_mode) + # 3. Remove object data and metadata from the EPC file in a temporary file and then replace original to minimize I/O + # 4. 
Return True if deletion was successful, False otherwise + _id = self._id_from_uri_or_identifier(identifier=identifier) + if _id is None: + logging.warning(f"Invalid identifier provided for deletion: {identifier}") + return False + metadata = self._metadata_mgr.get_metadata(_id) + if metadata is None: + logging.warning(f"Object with identifier {_id} not found in metadata, cannot delete") + return False - except Exception as e: - logging.error(f"Failed to update object {identifier}: {e}") - raise RuntimeError(f"Failed to update object in EPC: {e}") + if self.rels_update_mode == RelsUpdateMode.UPDATE_AT_MODIFICATION: + self._rels_mgr.update_rels_for_removed_object( + _id + ) # will update content_type when removing the object if needed - def add_rels_for_object( - self, identifier: Union[str, Uri, Any], relationships: List[Relationship], write_immediately: bool = False - ) -> None: - """ - Add additional relationships for a specific object. + # update metadata manager to remove the metadata of the deleted object + self._metadata_mgr.remove_metadata(_id) + return True - Relationships are stored in memory and can be written immediately or deferred - until write_pending_rels() is called, or when the EPC is closed. + def read_array( + self, + proxy: Union[str, Uri, Any], + path_in_external: str, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + external_uri: Optional[str] = None, + ) -> Optional[np.ndarray]: + """ + Read a dataset from an external file (HDF5, Parquet, CSV, etc.) linked to the proxy object. + + Uses an intelligent caching mechanism that: + 1. Checks cached open files first (up to 3 files kept open) + 2. Tries all possible file paths + 3. Automatically selects the correct reader based on file extension + 4. Adds successfully opened files to cache + 5. 
Supports RESQML v2.2 sub-array selection via start_indices and counts Args: - identifier: The identifier of the object, can be str, Uri, or the object itself - relationships: List of Relationship objects to add - write_immediately: If True, writes pending rels to disk immediately after adding. - If False (default), rels are kept in memory for batching. - """ - is_uri = isinstance(identifier, Uri) or (isinstance(identifier, str) and parse_uri(identifier) is not None) - if is_uri: - uri = parse_uri(identifier) if isinstance(identifier, str) else identifier - assert uri is not None and uri.uuid is not None - identifier = uri.uuid + "." + (uri.version or "") - elif not isinstance(identifier, str): - identifier = get_obj_identifier(identifier) - - assert isinstance(identifier, str) - - if identifier not in self.additional_rels: - self.additional_rels[identifier] = [] - - self.additional_rels[identifier].extend(relationships) - logging.debug(f"Added {len(relationships)} relationships for object {identifier} (in-memory)") - - if write_immediately: - self.write_pending_rels() - - def write_pending_rels(self) -> int: - """ - Write all pending in-memory relationships to the EPC file efficiently. - - This method reads existing rels, merges them in memory with pending rels, - then rewrites only the affected rels files in a single ZIP update. 
+ proxy: The object, its identifier, or URI + path_in_external: Path/dataset name within the external file + start_indices: Optional start index for each dimension (auto-extracted from proxy if not provided) + counts: Optional count of elements for each dimension (auto-extracted from proxy if not provided) + external_uri: Optional URI to override file path resolution (auto-extracted from proxy if not provided) Returns: - Number of rels files updated - """ - if not self.additional_rels: - logging.debug("No pending relationships to write") - return 0 - - updated_count = 0 - - # Step 1: Read existing rels and merge with pending rels in memory - merged_rels: Dict[str, Relationships] = {} # rels_path -> merged Relationships - - with self._get_zip_file() as zf: - for obj_identifier, new_relationships in self.additional_rels.items(): - # Generate rels path from metadata without loading the object - rels_path = self._gen_rels_path_from_identifier(obj_identifier) - if rels_path is None: - logging.warning(f"Could not generate rels path for {obj_identifier}") - continue - - # Read existing rels from ZIP - existing_relationships = [] - try: - if rels_path in zf.namelist(): - rels_data = zf.read(rels_path) - existing_rels = read_energyml_xml_bytes(rels_data, Relationships) - if existing_rels and existing_rels.relationship: - existing_relationships = list(existing_rels.relationship) - except Exception as e: - logging.debug(f"Could not read existing rels for {rels_path}: {e}") - - # Merge new relationships, avoiding duplicates - for new_rel in new_relationships: - # Check if relationship already exists - rel_exists = any( - r.target == new_rel.target and r.type_value == new_rel.type_value - for r in existing_relationships - ) - - if not rel_exists: - # Ensure unique ID - cpt = 0 - new_rel_id = new_rel.id - while any(r.id == new_rel_id for r in existing_relationships): - new_rel_id = f"{new_rel.id}_{cpt}" - cpt += 1 - if new_rel_id != new_rel.id: - new_rel.id = new_rel_id - - 
existing_relationships.append(new_rel) - - # Store merged result - if existing_relationships: - merged_rels[rels_path] = Relationships(relationship=existing_relationships) - - # Step 2: Write updated rels back to ZIP (create temp, copy all, replace) - if not merged_rels: - return 0 - - with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file: - temp_path = temp_file.name - - try: - # Copy entire ZIP, replacing only the updated rels files - with self._get_zip_file() as source_zf: - with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as target_zf: - # Copy all files except the rels we're updating - for item in source_zf.infolist(): - if item.filename not in merged_rels: - buffer = source_zf.read(item.filename) - target_zf.writestr(item, buffer) - - # Write updated rels files - for rels_path, relationships in merged_rels.items(): - rels_xml = serialize_xml(relationships) - target_zf.writestr(rels_path, rels_xml) - updated_count += 1 - - # Replace original with updated ZIP - shutil.move(temp_path, self.epc_file_path) - self._reopen_persistent_zip() - - # Clear pending rels after successful write - self.additional_rels.clear() - - logging.info(f"Wrote {updated_count} rels files to EPC") - return updated_count - - except Exception as e: - if os.path.exists(temp_path): - os.unlink(temp_path) - logging.error(f"Failed to write pending rels: {e}") - raise - - def _compute_object_rels(self, obj: Any, obj_identifier: str) -> List[Relationship]: - """Compute relationships for a given object (SOURCE relationships). - - Delegates to _rels_mgr.compute_object_rels() - """ - return self._rels_mgr.compute_object_rels(obj, obj_identifier) - - def _merge_rels(self, new_rels: List[Relationship], existing_rels: List[Relationship]) -> List[Relationship]: - """Merge new relationships with existing ones, avoiding duplicates and ensuring unique IDs. - - Delegates to _rels_mgr.merge_rels() + Numpy array if successful, None otherwise. 
Returns sub-selected portion if start_indices/counts provided. """ - return self._rels_mgr.merge_rels(new_rels, existing_rels) - - def _add_object_to_file(self, obj: Any, metadata: EpcObjectMetadata) -> None: - """Add object to the EPC file efficiently. - - Reads existing rels, computes updates in memory, then writes everything - in a single ZIP operation. - """ - xml_content = serialize_xml(obj) - obj_identifier = metadata.identifier - assert obj_identifier is not None, "Object identifier must not be None" - - # Step 1: Compute which rels files need to be updated and prepare their content - rels_updates: Dict[str, str] = {} # rels_path -> XML content - - with self._get_zip_file() as zf: - # 1a. Object's own .rels file - obj_rels_path = gen_rels_path(obj, self.export_version) - obj_relationships = self._compute_object_rels(obj, obj_identifier) + # Get possible file paths for this object + file_paths = self.get_h5_file_paths(proxy) - if obj_relationships: - # Read existing rels - existing_rels = [] - try: - if obj_rels_path in zf.namelist(): - rels_data = zf.read(obj_rels_path) - existing_rels_obj = read_energyml_xml_bytes(rels_data, Relationships) - if existing_rels_obj and existing_rels_obj.relationship: - existing_rels = list(existing_rels_obj.relationship) - except Exception: - pass - - # Merge and serialize - merged_rels = self._merge_rels(obj_relationships, existing_rels) - if merged_rels: - rels_updates[obj_rels_path] = serialize_xml(Relationships(relationship=merged_rels)) - - # 1b. 
Update rels of referenced objects (DESTINATION relationships) - direct_dors = get_direct_dor_list(obj) - for dor in direct_dors: - try: - target_identifier = get_obj_identifier(dor) - - # Generate rels path from metadata without processing DOR - target_rels_path = self._gen_rels_path_from_identifier(target_identifier) - if target_rels_path is None: - # Fall back to generating from DOR if metadata not found - target_rels_path = gen_rels_path(dor, self.export_version) - - # Create DESTINATION relationship - dest_rel = Relationship( - target=metadata.file_path, - type_value=EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(), - id=f"_{target_identifier}_{get_obj_type(get_obj_usable_class(obj))}_{obj_identifier}", - ) - - # Read existing rels - existing_rels = [] - try: - if target_rels_path in zf.namelist(): - rels_data = zf.read(target_rels_path) - existing_rels_obj = read_energyml_xml_bytes(rels_data, Relationships) - if existing_rels_obj and existing_rels_obj.relationship: - existing_rels = list(existing_rels_obj.relationship) - except Exception: - pass - - # Merge and serialize - merged_rels = self._merge_rels([dest_rel], existing_rels) - if merged_rels: - rels_updates[target_rels_path] = serialize_xml(Relationships(relationship=merged_rels)) - - except Exception as e: - logging.warning(f"Failed to prepare rels update for referenced object: {e}") - - # 1c. 
Update [Content_Types].xml - content_types_xml = self._update_content_types_xml(zf, metadata, add=True) + if external_uri: + file_paths.insert(0, make_path_relative_to_other_file(external_uri, self.epc_file_path)) - # Step 2: Write everything to new ZIP - with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file: - temp_path = temp_file.name - - try: - with self._get_zip_file() as source_zf: - with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as target_zf: - # Write new object - target_zf.writestr(metadata.file_path, xml_content) - - # Write updated [Content_Types].xml - target_zf.writestr(get_epc_content_type_path(), content_types_xml) + if not file_paths: + logging.warning(f"No external file paths found for proxy: {proxy}") + return None - # Write updated rels files - for rels_path, rels_xml in rels_updates.items(): - target_zf.writestr(rels_path, rels_xml) + # Get the file handler registry + handler_registry = get_handler_registry() - # Copy all other files - files_to_skip = {get_epc_content_type_path(), metadata.file_path} - files_to_skip.update(rels_updates.keys()) + for file_path in file_paths: + # Get the appropriate handler for this file type + handler = handler_registry.get_handler_for_file(file_path) + if handler is None: + logging.debug(f"No handler found for file: {file_path}") + continue - for item in source_zf.infolist(): - if item.filename not in files_to_skip: - buffer = source_zf.read(item.filename) - target_zf.writestr(item, buffer) + try: + # Use handler to read array with sub-selection support + array = handler.read_array(file_path, path_in_external, start_indices, counts) + if array is not None: + return array + except Exception as e: + logging.debug(f"Failed to read dataset from {file_path}: {e}") + pass - # Replace original - shutil.move(temp_path, self.epc_file_path) - self._reopen_persistent_zip() + logging.error(f"Failed to read array from any available file paths: {file_paths}") + return None - except Exception 
as e: - if os.path.exists(temp_path): - os.unlink(temp_path) - logging.error(f"Failed to add object to EPC file: {e}") - raise + def read_array_view( + self, + proxy: Union[str, Uri, Any], + path_in_external: str, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + external_uri: Optional[str] = None, + ) -> Optional[np.ndarray]: + """Best-effort zero-copy variant of :meth:`read_array`. + + Delegates to ``handler.read_array_view`` when available (HDF5), which + returns a numpy array backed by the file buffer for contiguous, + uncompressed datasets. Falls back transparently to a copy for chunked + or compressed data. + """ + file_paths = self.get_h5_file_paths(proxy) + if external_uri: + file_paths.insert(0, make_path_relative_to_other_file(external_uri, self.epc_file_path)) + if not file_paths: + return None - def _remove_object_from_file(self, metadata: EpcObjectMetadata) -> None: - """Remove object from the EPC file efficiently. + handler_registry = get_handler_registry() + for file_path in file_paths: + handler = handler_registry.get_handler_for_file(file_path) + if handler is None: + continue + try: + read_view_fn = getattr(handler, "read_array_view", None) + if read_view_fn is not None: + array = read_view_fn(file_path, path_in_external, start_indices, counts) + else: + array = handler.read_array(file_path, path_in_external, start_indices, counts) + if array is not None: + return array + except Exception as e: + logging.debug(f"Failed to read_array_view from {file_path}: {e}") + return None - Reads existing rels, computes updates in memory, then writes everything - in a single ZIP operation. Note: This does NOT remove .rels files. - Use clean_rels() to remove orphaned relationships. 
+ def write_array( + self, + proxy: Union[str, Uri, Any], + path_in_external: str, + array: np.ndarray, + start_indices: Optional[List[int]] = None, + external_uri: Optional[str] = None, + **kwargs, + ) -> bool: """ - # Load object first (needed to process its DORs) - if metadata.identifier is None: - logging.error("Cannot remove object with None identifier") - raise ValueError("Object identifier must not be None") + Write a dataset to an external file (HDF5, Parquet, CSV, etc.) linked to the proxy object. - obj = self.get_object_by_identifier(metadata.identifier) - if obj is None: - logging.warning(f"Object {metadata.identifier} not found, cannot remove rels") - # Still proceed with removal even if object can't be loaded + Uses the same caching mechanism as read_array for efficiency. + Supports RESQML v2.2 partial writes via start_indices. - # Step 1: Compute rels updates (remove DESTINATION relationships from referenced objects) - rels_updates: Dict[str, str] = {} # rels_path -> XML content + Args: + proxy: The object, its identifier, or URI + path_in_external: Path/dataset name within the external file + array: Numpy array to write + start_indices: Optional start index for each dimension for partial writes + external_uri: Optional URI to override file path resolution + **kwargs: Additional format-specific parameters (e.g., dtype for HDF5, column_titles for Parquet) - if obj is not None: - with self._get_zip_file() as zf: - direct_dors = get_direct_dor_list(obj) + Returns: + True if successful, False otherwise + """ + # Get possible file paths for this object + file_paths = [] - for dor in direct_dors: - try: - target_identifier = get_obj_identifier(dor) - if target_identifier not in self._metadata: - continue + if external_uri is not None: + # Use external_uri if provided (RESQML v2.2) + epc_folder = os.path.dirname(self.epc_file_path) if self.epc_file_path else "." 
+ if os.path.isabs(external_uri): + file_paths = [external_uri] + else: + file_paths = [os.path.join(epc_folder, external_uri), external_uri] + elif self.force_h5_path is not None: + # Use forced path if specified + file_paths = [self.force_h5_path] + else: + # Get file paths from relationships + file_paths = self.get_h5_file_paths(proxy) - # Use metadata to generate rels path without loading the object - target_rels_path = self._gen_rels_path_from_identifier(target_identifier) - if target_rels_path is None: - continue + if not file_paths: + logging.warning(f"No external file paths found for proxy: {proxy}") + return False - # Read existing rels - existing_relationships = [] - try: - if target_rels_path in zf.namelist(): - rels_data = zf.read(target_rels_path) - existing_rels = read_energyml_xml_bytes(rels_data, Relationships) - if existing_rels and existing_rels.relationship: - existing_relationships = list(existing_rels.relationship) - except Exception as e: - logging.debug(f"Could not read existing rels for {target_identifier}: {e}") + # Get the file handler registry + handler_registry = get_handler_registry() - # Remove DESTINATION relationship that pointed to our object - updated_relationships = [ - r - for r in existing_relationships - if not ( - r.target == metadata.file_path - and r.type_value == EPCRelsRelationshipType.DESTINATION_OBJECT.get_type() - ) - ] + # Try to write to the first available file + for file_path in file_paths: + # Get the appropriate handler for this file type + handler = handler_registry.get_handler_for_file(file_path) + if handler is None: + logging.debug(f"No handler found for file: {file_path}") + continue - # Only update if relationships remain - if updated_relationships: - rels_updates[target_rels_path] = serialize_xml( - Relationships(relationship=updated_relationships) - ) + try: + # Use handler to write array with optional partial write support + success = handler.write_array(file_path, array, path_in_external, start_indices, 
**kwargs) + if success: + return True + except Exception as e: + logging.error(f"Failed to write dataset to {file_path}: {e}") - except Exception as e: - logging.warning(f"Failed to update rels for referenced object during removal: {e}") + logging.error(f"Failed to write array to any available file paths: {file_paths}") + return False - # Update [Content_Types].xml - content_types_xml = self._update_content_types_xml(zf, metadata, add=False) - else: - # If we couldn't load the object, still update content types - with self._get_zip_file() as zf: - content_types_xml = self._update_content_types_xml(zf, metadata, add=False) + def get_array_metadata( + self, + proxy: Union[str, Uri, Any], + path_in_external: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + ) -> Union[DataArrayMetadata, List[DataArrayMetadata], None]: + """ + Get metadata for data array(s) without loading the full array data. + Supports RESQML v2.2 sub-array selection metadata. - # Step 2: Write everything to new ZIP - with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file: - temp_path = temp_file.name + Args: + proxy: The object, its identifier, or URI + path_in_external: Optional specific array path. If None, returns metadata for all arrays. + start_indices: Optional start index for each dimension (auto-extracted from proxy if not provided) + counts: Optional count of elements for each dimension (auto-extracted from proxy if not provided). + When provided, the returned dimensions will reflect the sub-selected size. - try: - with self._get_zip_file() as source_zf: - with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as target_zf: - # Write updated [Content_Types].xml - target_zf.writestr(get_epc_content_type_path(), content_types_xml) + Returns: + DataArrayMetadata if path specified, List[DataArrayMetadata] if no path, + or None if not found. The dimensions field reflects the sub-selection when counts provided. 
+ """ + # Get possible file paths for this object + file_paths = [] - # Write updated rels files - for rels_path, rels_xml in rels_updates.items(): - target_zf.writestr(rels_path, rels_xml) + if self.force_h5_path is not None: + file_paths = [self.force_h5_path] + else: + file_paths = self.get_h5_file_paths(proxy) - # Copy all files except removed object, its rels, and files we're updating - obj_rels_path = self._gen_rels_path_from_metadata(metadata) - files_to_skip = {get_epc_content_type_path(), metadata.file_path} - if obj_rels_path: - files_to_skip.add(obj_rels_path) - files_to_skip.update(rels_updates.keys()) + if not file_paths: + logging.warning(f"No external file paths found for proxy: {proxy}") + return None + # Get the file handler registry + handler_registry = get_handler_registry() + + for file_path in file_paths: + # Get the appropriate handler for this file type + handler = handler_registry.get_handler_for_file(file_path) + if handler is None: + logging.debug(f"No handler found for file: {file_path}") + continue - for item in source_zf.infolist(): - if item.filename not in files_to_skip: - buffer = source_zf.read(item.filename) - target_zf.writestr(item, buffer) + try: + # Use handler to get metadata without loading full array + metadata_dict = handler.get_array_metadata(file_path, path_in_external, start_indices, counts) - # Replace original - shutil.move(temp_path, self.epc_file_path) - self._reopen_persistent_zip() + if metadata_dict is None: + continue - except Exception as e: - if os.path.exists(temp_path): - os.unlink(temp_path) - logging.error(f"Failed to remove object from EPC file: {e}") - raise + # Convert dict(s) to DataArrayMetadata + if isinstance(metadata_dict, list): + return [ + DataArrayMetadata( + path_in_resource=m.get("path"), + array_type=m.get("dtype", "unknown"), + dimensions=m.get("shape", []), + start_indices=start_indices, + custom_data={"size": m.get("size", 0)}, + ) + for m in metadata_dict + ] + else: + return 
DataArrayMetadata( + path_in_resource=metadata_dict.get("path"), + array_type=metadata_dict.get("dtype", "unknown"), + dimensions=metadata_dict.get("shape", []), + start_indices=start_indices, + custom_data={"size": metadata_dict.get("size", 0)}, + ) + except Exception as e: + logging.debug(f"Failed to get metadata from file {file_path}: {e}") - def _update_content_types_xml( - self, source_zip: zipfile.ZipFile, metadata: EpcObjectMetadata, add: bool = True - ) -> str: - """Update [Content_Types].xml to add or remove object entry. + return None - Delegates to _metadata_mgr.update_content_types_xml() - """ - return self._metadata_mgr.update_content_types_xml(source_zip, metadata, add) + def list_objects( + self, dataspace: Optional[str] = None, object_type: Optional[str] = None + ) -> List[ResourceMetadata]: + return [m.to_resource_metadata() for m in self._metadata_mgr.list_metadata(qualified_type_filter=object_type)] - def _rollback_add_object(self, identifier: Optional[str]) -> None: - """Rollback changes made during failed add_object operation.""" - if identifier and identifier in self._metadata: - metadata = self._metadata[identifier] + def get_obj_rels(self, obj: Union[str, Uri, Any]) -> List[Relationship]: + _id = self._id_from_uri_or_identifier(obj) - # Remove from metadata - del self._metadata[identifier] + if _id is None: + logging.warning(f"Could not resolve identifier for object {obj}, cannot get relationships") + return [] - # Remove from indexes - uuid = metadata.uuid - object_type = metadata.object_type + metadata = self._metadata_mgr.get_metadata(_id) + if metadata is None: + logging.warning(f"Object with identifier {_id} not found in metadata, cannot get relationships") + return [] - if uuid in self._uuid_index and identifier in self._uuid_index[uuid]: - self._uuid_index[uuid].remove(identifier) - if not self._uuid_index[uuid]: - del self._uuid_index[uuid] + return self._rels_mgr.get_obj_rels(_id) - if object_type in self._type_index and identifier 
in self._type_index[object_type]: - self._type_index[object_type].remove(identifier) - if not self._type_index[object_type]: - del self._type_index[object_type] + def close(self) -> None: + """Close the persistent ZIP file if it's open, recomputing rels first if mode is UPDATE_ON_CLOSE.""" + # Unregister atexit handler to avoid double-close + if getattr(self, "_atexit_registered", False): + atexit.unregister(self._atexit_close) + self._atexit_registered = False - # Remove from cache - if identifier in self._object_cache: - del self._object_cache[identifier] - if identifier in self._access_order: - self._access_order.remove(identifier) + # Recompute all relationships before closing if in UPDATE_ON_CLOSE mode + if self.rels_update_mode == RelsUpdateMode.UPDATE_ON_CLOSE: + try: + self.rebuild_all_rels(clean_first=True) + logging.info("Rebuilt all relationships on close (UPDATE_ON_CLOSE mode)") + except Exception as e: + logging.warning(f"Error rebuilding rels on close: {e}") - def clean_rels(self) -> Dict[str, int]: - """ - Clean all .rels files by removing relationships to objects that no longer exist. + # Close file cache + if hasattr(self, "_file_cache"): + self._file_cache.close_all() - This method: - 1. Scans all .rels files in the EPC - 2. For each relationship, checks if the target object exists - 3. Removes relationships pointing to non-existent objects - 4. 
Removes empty .rels files + # Close cached h5 if using force_h5_path + if self.cache_opened_h5 is not None: + try: + self.cache_opened_h5.close() + except Exception as e: + logging.debug(f"Error closing cache_opened_h5: {e}") + self.cache_opened_h5 = None - Returns: - Dictionary with statistics: - - 'rels_files_scanned': Number of .rels files examined - - 'relationships_removed': Number of orphaned relationships removed - - 'rels_files_removed': Number of empty .rels files removed - """ - import tempfile - import shutil + # Delegate to ZIP accessor + self._zip_accessor.close() - stats = { - "rels_files_scanned": 0, - "relationships_removed": 0, - "rels_files_removed": 0, - } + def get_object_dependencies(self, identifier: Union[str, Uri]) -> List[str]: + return list(get_dor_identifiers_from_obj(self.get_object(identifier))) - # Create temporary file for updated EPC - with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file: - temp_path = temp_file.name + def start_transaction(self) -> bool: + raise NotImplementedError("Transactions are not implemented in this version of EpcStreamReader") - try: - with self._get_zip_file() as source_zip: - with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as target_zip: - # Get all existing object file paths for validation - existing_object_files = {metadata.file_path for metadata in self._metadata.values()} + def commit_transaction(self) -> Tuple[bool, Optional[str]]: + raise NotImplementedError("Transactions are not implemented in this version of EpcStreamReader") - # Process each file - for item in source_zip.infolist(): - if item.filename.endswith(".rels"): - # Process .rels file - stats["rels_files_scanned"] += 1 - - try: - rels_data = source_zip.read(item.filename) - rels_obj = read_energyml_xml_bytes(rels_data, Relationships) - - if rels_obj and rels_obj.relationship: - # Filter out relationships to non-existent objects - original_count = len(rels_obj.relationship) - - # Keep only relationships 
where the target exists - # or where the target is external (starts with ../ or http) - valid_relationships = [] - for rel in rels_obj.relationship: - target = rel.target - # Keep external references (HDF5, etc.) and existing objects - if ( - target.startswith("../") - or target.startswith("http") - or target in existing_object_files - or target.lstrip("/") - in existing_object_files # Also check without leading slash - ): - valid_relationships.append(rel) - - removed_count = original_count - len(valid_relationships) - stats["relationships_removed"] += removed_count - - if removed_count > 0: - logging.info( - f"Removed {removed_count} orphaned relationships from {item.filename}" - ) - - # Only write the .rels file if it has remaining relationships - if valid_relationships: - rels_obj.relationship = valid_relationships - updated_rels = serialize_xml(rels_obj) - target_zip.writestr(item.filename, updated_rels) - else: - # Empty .rels file, don't write it - stats["rels_files_removed"] += 1 - logging.info(f"Removed empty .rels file: {item.filename}") - else: - # Empty or invalid .rels, don't copy it - stats["rels_files_removed"] += 1 - - except Exception as e: - logging.warning(f"Failed to process .rels file {item.filename}: {e}") - # Copy as-is on error - data = source_zip.read(item.filename) - target_zip.writestr(item, data) + def rollback_transaction(self) -> bool: + raise NotImplementedError("Transactions are not implemented in this version of EpcStreamReader") - else: - # Copy non-.rels files as-is - data = source_zip.read(item.filename) - target_zip.writestr(item, data) + def __enter__(self): + """Context manager entry.""" + return self - # Replace original file - shutil.move(temp_path, self.epc_file_path) + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit with cleanup.""" + self.clear_cache() + self.close() + # Note: close() now handles cache_opened_h5 - logging.info( - f"Cleaned .rels files: scanned {stats['rels_files_scanned']}, " - 
f"removed {stats['relationships_removed']} orphaned relationships, " - f"removed {stats['rels_files_removed']} empty .rels files" - ) + def __len__(self) -> int: + """Return total number of objects.""" + return len(self._metadata) - return stats + def __iter__(self) -> Iterator[str]: + """Iterate over object identifiers.""" + return iter(self._metadata.keys()) - except Exception as e: - # Clean up temp file on error - if os.path.exists(temp_path): - os.unlink(temp_path) - raise RuntimeError(f"Failed to clean .rels files: {e}") + # ____ ____ _____ _____ ____________ + # / __ \/ __ \/ _/ | / / |/_ __/ ____/ + # / /_/ / /_/ // / | | / / /| | / / / __/ + # / ____/ _, _// / | |/ / ___ |/ / / /___ + # /_/ /_/ |_/___/ |___/_/ |_/_/ /_____/ - def rebuild_all_rels(self, clean_first: bool = True) -> Dict[str, int]: - """ - Rebuild all .rels files from scratch by analyzing all objects and their references. + def __del__(self): + """Destructor to ensure persistent ZIP file is closed.""" + try: + self.close() + if self.cache_opened_h5 is not None: + try: + self.cache_opened_h5.close() + except Exception: + pass + self.cache_opened_h5 = None + except Exception: + pass # Ignore errors during cleanup - This method: - 1. Optionally cleans existing .rels files first - 2. Loads each object temporarily - 3. Analyzes its Data Object References (DORs) - 4. 
Creates/updates .rels files with proper SOURCE and DESTINATION relationships + def _atexit_close(self) -> None: + """Atexit callback — performs minimal cleanup without rebuilding rels.""" + try: + self._zip_accessor.close() + except Exception: + pass - Args: - clean_first: If True, remove all existing .rels files before rebuilding + def _update_access_order(self, identifier: str) -> None: + """Update access order for LRU cache.""" + if identifier in self._access_order: + self._access_order.remove(identifier) + self._access_order.insert(0, identifier) - Returns: - Dictionary with statistics: - - 'objects_processed': Number of objects analyzed - - 'rels_files_created': Number of .rels files created - - 'source_relationships': Number of SOURCE relationships created - - 'destination_relationships': Number of DESTINATION relationships created - - 'parallel_mode': True if parallel processing was used (optional key) - - 'execution_time': Execution time in seconds (optional key) - """ - if self.enable_parallel_rels: - return self._rebuild_all_rels_parallel(clean_first) - else: - return self._rebuild_all_rels_sequential(clean_first) + def _id_from_uri_or_identifier( + self, identifier: Union[str, Uri, Any], get_first_if_simple_uuid: bool = True + ) -> Optional[str]: + try: + return as_identifier(identifier) + except Exception: + if not get_first_if_simple_uuid: + logging.warning( + f"Identifier {identifier} is a simple UUID, but get_first_if_simple_uuid is False, cannot resolve to full identifier" + ) + return None + # If it's a simple UUID, we need to find the corresponding identifier from metadata + t_metadata_identifiers = self._metadata_mgr.get_uuid_identifiers(identifier) + if t_metadata_identifiers is not None and len(t_metadata_identifiers) > 0: + return t_metadata_identifiers[ + 0 + ] # If multiple metadata entries for the same UUID, we take the first one (this should not happen in a well-formed EPC file) + else: + logging.warning(f"No metadata found for UUID 
{identifier}, cannot get relationships") + return None def _rebuild_all_rels_sequential(self, clean_first: bool = True) -> Dict[str, int]: """ @@ -2829,9 +2083,6 @@ def _rebuild_all_rels_sequential(self, clean_first: bool = True) -> Dict[str, in - 'source_relationships': Number of SOURCE relationships created - 'destination_relationships': Number of DESTINATION relationships created """ - import tempfile - import shutil - stats = { "objects_processed": 0, "rels_files_created": 0, @@ -2848,12 +2099,15 @@ def _rebuild_all_rels_sequential(self, clean_first: bool = True) -> Dict[str, in # First pass: analyze all objects and build the reference map for identifier in self._metadata: try: - obj = self.get_object_by_identifier(identifier) + obj = self.get_object(identifier) if obj is None: continue stats["objects_processed"] += 1 + # Extract this object's type + obj_type = get_obj_type(get_obj_usable_class(obj)) + # Get all DORs in this object dors = get_direct_dor_list(obj) @@ -2861,10 +2115,10 @@ def _rebuild_all_rels_sequential(self, clean_first: bool = True) -> Dict[str, in try: target_identifier = get_obj_identifier(dor) if target_identifier in self._metadata: - # Record this reference + # Record this reference (for building SOURCE rels in target's file) if target_identifier not in reverse_references: reverse_references[target_identifier] = [] - reverse_references[target_identifier].append((identifier, obj)) + reverse_references[target_identifier].append((identifier, obj_type)) except Exception: pass @@ -2875,21 +2129,20 @@ def _rebuild_all_rels_sequential(self, clean_first: bool = True) -> Dict[str, in # Map of rels_file_path -> Relationships object rels_files: Dict[str, Relationships] = {} - # Process each object to create SOURCE relationships + # Process each object to create DESTINATION relationships for identifier in self._metadata: try: - obj = self.get_object_by_identifier(identifier) + obj = self.get_object(identifier) if obj is None: continue - # metadata = 
self._metadata[identifier] - obj_rels_path = self._gen_rels_path_from_identifier(identifier) + obj_rels_path = self._metadata_mgr.gen_rels_path_from_identifier(identifier) # Get all DORs (objects this object references) dors = get_direct_dor_list(obj) if dors: - # Create SOURCE relationships + # Create DESTINATION relationships (this object -> targets it references) relationships = [] for dor in dors: @@ -2897,17 +2150,20 @@ def _rebuild_all_rels_sequential(self, clean_first: bool = True) -> Dict[str, in target_identifier = get_obj_identifier(dor) if target_identifier in self._metadata: target_metadata = self._metadata[target_identifier] - + target_type = get_obj_type(get_obj_usable_class(dor)) + target_path = target_metadata.file_path( + export_version=self._metadata_mgr._export_version + ) rel = Relationship( - target=target_metadata.file_path, - type_value=EPCRelsRelationshipType.SOURCE_OBJECT.get_type(), - id=f"_{identifier}_{get_obj_type(get_obj_usable_class(dor))}_{target_identifier}", + target=target_path, + type_value=get_rels_dor_type(dor_target=target_path, in_dor_owner_rels_file=True), + id=f"_{identifier}_{target_type}_{target_identifier}", ) relationships.append(rel) - stats["source_relationships"] += 1 + stats["destination_relationships"] += 1 except Exception as e: - logging.debug(f"Failed to create SOURCE relationship: {e}") + logging.debug(f"Failed to create DESTINATION relationship: {e}") if relationships and obj_rels_path: if obj_rels_path not in rels_files: @@ -2915,50 +2171,60 @@ def _rebuild_all_rels_sequential(self, clean_first: bool = True) -> Dict[str, in rels_files[obj_rels_path].relationship.extend(relationships) except Exception as e: - logging.warning(f"Failed to create SOURCE rels for {identifier}: {e}") + logging.warning(f"Failed to create DESTINATION rels for {identifier}: {e}") - # Add DESTINATION relationships + # Add SOURCE relationships (in target's .rels file, pointing back to sources) for target_identifier, source_list in 
reverse_references.items(): try: if target_identifier not in self._metadata: continue target_metadata = self._metadata[target_identifier] - target_rels_path = self._gen_rels_path_from_identifier(target_identifier) + target_rels_path = self._metadata_mgr.gen_rels_path_from_identifier(target_identifier) if not target_rels_path: continue - # Create DESTINATION relationships for each object that references this one - for source_identifier, source_obj in source_list: + # Create SOURCE relationships for each object that references this one + for source_identifier, source_type in source_list: try: source_metadata = self._metadata[source_identifier] rel = Relationship( - target=source_metadata.file_path, - type_value=EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(), - id=f"_{target_identifier}_{get_obj_type(get_obj_usable_class(source_obj))}_{source_identifier}", + target=source_metadata.file_path(export_version=self._metadata_mgr._export_version), + type_value=get_rels_dor_type(dor_target=target_rels_path, in_dor_owner_rels_file=False), + id=f"_{target_identifier}_{source_type}_{source_identifier}", ) if target_rels_path not in rels_files: rels_files[target_rels_path] = Relationships(relationship=[]) rels_files[target_rels_path].relationship.append(rel) - stats["destination_relationships"] += 1 + stats["source_relationships"] += 1 except Exception as e: - logging.debug(f"Failed to create DESTINATION relationship: {e}") + logging.debug(f"Failed to create SOURCE relationship: {e}") except Exception as e: - logging.warning(f"Failed to create DESTINATION rels for {target_identifier}: {e}") + logging.warning(f"Failed to create SOURCE rels for {target_identifier}: {e}") stats["rels_files_created"] = len(rels_files) # Before writing, preserve EXTERNAL_RESOURCE and other non-SOURCE/DESTINATION relationships # This includes rels files that may not be in rels_files yet - with self._get_zip_file() as zf: + # Also collect content types and core property extended files + 
core_prop_extended_files = set() + core_prop_rels = None + c_types: Optional[Types] = None + + with self._zip_accessor.get_zip_file() as zf: + c_types = self._metadata_mgr.get_content_type(zf) # Check all existing .rels files for filename in zf.namelist(): if not filename.endswith(".rels"): + if is_core_prop_or_extension_path(filename) and not filename.endswith(gen_core_props_path()): + core_prop_extended_files.add(filename) + if filename == gen_core_props_rels_path(): + core_prop_rels = read_energyml_xml_bytes(zf.read(filename), Relationships) continue try: @@ -2971,8 +2237,8 @@ def _rebuild_all_rels_sequential(self, clean_first: bool = True) -> Dict[str, in for r in existing_rels_obj.relationship if r.type_value not in ( - EPCRelsRelationshipType.SOURCE_OBJECT.get_type(), - EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(), + str(EPCRelsRelationshipType.SOURCE_OBJECT), + str(EPCRelsRelationshipType.DESTINATION_OBJECT), ) ] if preserved_rels: @@ -2985,16 +2251,43 @@ def _rebuild_all_rels_sequential(self, clean_first: bool = True) -> Dict[str, in except Exception as e: logging.debug(f"Could not preserve existing rels from {filename}: {e}") + # Update core_prop_rels with extended props if needed + new_core_prop_rels = Relationships( + relationship=[ + Relationship( + target=e_path, + type_value=str(EPCRelsRelationshipType.EXTENDED_CORE_PROPERTIES), + ) + for e_path in core_prop_extended_files + ] + ) + if core_prop_rels is None: + core_prop_rels = new_core_prop_rels + else: + for new_rel in new_core_prop_rels.relationship: + found = False + for existing_rel in core_prop_rels.relationship: + if existing_rel.target == new_rel.target and existing_rel.type_value == new_rel.type_value: + found = True + break + + if not found: + core_prop_rels.relationship.append(new_rel) + + rels_files[gen_core_props_rels_path()] = core_prop_rels + # Third pass: write the new EPC with updated .rels files with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file: 
temp_path = temp_file.name try: - with self._get_zip_file() as source_zip: + with self._zip_accessor.get_zip_file() as source_zip: with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as target_zip: - # Copy all non-.rels files + # Copy all non-.rels files (excluding [Content_Types].xml which will be regenerated) for item in source_zip.infolist(): - if not (item.filename.endswith(".rels") and clean_first): + if not (item.filename.endswith(".rels") and clean_first) and ( + c_types is None or item.filename != get_epc_content_type_path() + ): data = source_zip.read(item.filename) target_zip.writestr(item, data) @@ -3003,9 +2296,14 @@ def _rebuild_all_rels_sequential(self, clean_first: bool = True) -> Dict[str, in rels_xml = serialize_xml(rels_obj) target_zip.writestr(rels_path, rels_xml) + if c_types is not None: + # Write the new generated [Content_Types].xml + c_types_xml = serialize_xml(c_types) + target_zip.writestr(get_epc_content_type_path(), c_types_xml) + # Replace original file shutil.move(temp_path, self.epc_file_path) - self._reopen_persistent_zip() + self._zip_accessor.reopen_persistent_zip() logging.info( f"Rebuilt .rels files: processed {stats['objects_processed']} objects, " @@ -3035,8 +2333,6 @@ def _rebuild_all_rels_parallel(self, clean_first: bool = True) -> Dict[str, int] This bypasses Python's GIL for CPU-intensive XML parsing and provides significant speedup for large EPCs (tested with 80+ objects). 
""" - import tempfile - import shutil import time from multiprocessing import Pool, cpu_count @@ -3056,7 +2352,10 @@ def _rebuild_all_rels_parallel(self, clean_first: bool = True) -> Dict[str, int] # Prepare work items for parallel processing # Pass metadata as dict (serializable) instead of keeping references metadata_dict = {k: v for k, v in self._metadata.items()} - work_items = [(identifier, str(self.epc_file_path), metadata_dict) for identifier in self._metadata] + export_version = self._metadata_mgr._export_version + work_items = [ + ((identifier, str(self.epc_file_path), metadata_dict), export_version) for identifier in self._metadata + ] # Determine optimal number of workers based on available CPUs and workload # Don't spawn more workers than CPUs; use user-configurable ratio for workload per worker @@ -3069,95 +2368,89 @@ def _rebuild_all_rels_parallel(self, clean_first: bool = True) -> Dict[str, int] # ============================================================================ results = [] with Pool(processes=num_workers) as pool: - results = pool.map(_process_object_for_rels_worker, work_items) + results = pool.starmap(process_object_for_rels_worker, work_items) # ============================================================================ - # PHASE 2: SEQUENTIAL - Aggregate worker results + # PHASE 2: SEQUENTIAL - Aggregate worker results and build DESTINATION relationships # ============================================================================ # Build data structures for subsequent phases: - # - reverse_references: Map target objects to their sources (for DESTINATION rels) + # - reverse_references: Map target objects to their sources (for SOURCE rels in target) # - rels_files: Accumulate all relationships by file path - # - object_types: Cache object types to eliminate redundant loads in Phase 3 - reverse_references: Dict[str, List[Tuple[str, str]]] = {} + reverse_references: Dict[str, List[Tuple[str, str]]] = {} # target_id -> [(source_id, 
source_type)] rels_files: Dict[str, Relationships] = {} - object_types: Dict[str, str] = {} for result in results: if result is None: continue identifier = result["identifier"] - obj_type = result["object_type"] - source_rels = result["source_rels"] - dor_targets = result["dor_targets"] - - # Cache object type - object_types[identifier] = obj_type + object_type = result["object_type"] + referenced_objects = result["referenced_objects"] stats["objects_processed"] += 1 - # Convert dicts back to Relationship objects - if source_rels: - obj_rels_path = self._gen_rels_path_from_identifier(identifier) - if obj_rels_path: - relationships = [] - for rel_dict in source_rels: - rel = Relationship( - target=rel_dict["target"], - type_value=rel_dict["type_value"], - id=rel_dict["id"], - ) - relationships.append(rel) - stats["source_relationships"] += 1 + # Create DESTINATION relationships for this object (objects this one references) + obj_rels_path = self._metadata_mgr.gen_rels_path_from_identifier(identifier) + if obj_rels_path and referenced_objects: + if obj_rels_path not in rels_files: + rels_files[obj_rels_path] = Relationships(relationship=[]) + + for target_identifier, target_type in referenced_objects: + # Verify target exists in metadata + if target_identifier not in self._metadata: + continue + + target_metadata = self._metadata[target_identifier] + target_path = target_metadata.file_path(export_version=export_version) - if obj_rels_path not in rels_files: - rels_files[obj_rels_path] = Relationships(relationship=[]) - rels_files[obj_rels_path].relationship.extend(relationships) + # Create DESTINATION relationship (this object -> target) + rel = Relationship( + target=target_path, + type_value=get_rels_dor_type(dor_target=target_path, in_dor_owner_rels_file=True), + id=f"_{identifier}_{target_type}_{target_identifier}", + ) + rels_files[obj_rels_path].relationship.append(rel) + stats["destination_relationships"] += 1 - # Build reverse reference map for DESTINATION 
relationships - # dor_targets now contains (target_id, target_type) tuples - for target_identifier, target_type in dor_targets: - if target_identifier not in reverse_references: - reverse_references[target_identifier] = [] - reverse_references[target_identifier].append((identifier, obj_type)) + # Build reverse reference map for SOURCE relationships + if target_identifier not in reverse_references: + reverse_references[target_identifier] = [] + reverse_references[target_identifier].append((identifier, object_type)) # ============================================================================ - # PHASE 3: SEQUENTIAL - Create DESTINATION relationships (zero object loading!) + # PHASE 3: SEQUENTIAL - Create SOURCE relationships # ============================================================================ - # Use cached object types from Phase 2 to build DESTINATION relationships - # without reloading any objects. This optimization is critical for performance. for target_identifier, source_list in reverse_references.items(): try: if target_identifier not in self._metadata: continue - target_rels_path = self._gen_rels_path_from_identifier(target_identifier) + target_rels_path = self._metadata_mgr.gen_rels_path_from_identifier(target_identifier) if not target_rels_path: continue - # Use cached object types instead of loading objects! for source_identifier, source_type in source_list: try: source_metadata = self._metadata[source_identifier] - # No object loading needed - we have all the type info from Phase 2! 
+ # Create SOURCE relationship (source object -> this target object) rel = Relationship( - target=source_metadata.file_path, - type_value=EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(), + target=source_metadata.file_path(export_version=export_version), + type_value=get_rels_dor_type(dor_target=target_rels_path, in_dor_owner_rels_file=False), id=f"_{target_identifier}_{source_type}_{source_identifier}", ) if target_rels_path not in rels_files: rels_files[target_rels_path] = Relationships(relationship=[]) rels_files[target_rels_path].relationship.append(rel) - stats["destination_relationships"] += 1 + stats["source_relationships"] += 1 except Exception as e: - logging.debug(f"Failed to create DESTINATION relationship: {e}") + logging.debug(f"Failed to create SOURCE relationship: {e}") except Exception as e: - logging.warning(f"Failed to create DESTINATION rels for {target_identifier}: {e}") + logging.warning(f"Failed to create SOURCE rels for {target_identifier}: {e}") stats["rels_files_created"] = len(rels_files) @@ -3165,9 +2458,23 @@ def _rebuild_all_rels_parallel(self, clean_first: bool = True) -> Dict[str, int] # PHASE 4: SEQUENTIAL - Preserve non-object relationships # ============================================================================ # Preserve EXTERNAL_RESOURCE and other non-standard relationship types - with self._get_zip_file() as zf: + + # media_files = set() + core_prop_extended_files = set() + core_prop_rels = None + c_types: Optional[Types] = None + + with self._zip_accessor.get_zip_file() as zf: + c_types = self._metadata_mgr.get_content_type(zf) for filename in zf.namelist(): + # if not filename.endswith(".rels") and not filename.endswith(".xml"): + # media_files.add(filename) + if not filename.endswith(".rels"): + if is_core_prop_or_extension_path(filename) and not filename.endswith(gen_core_props_path()): + core_prop_extended_files.add(filename) + if filename == gen_core_props_rels_path(): + core_prop_rels = 
read_energyml_xml_bytes(zf.read(filename), Relationships) continue try: @@ -3179,8 +2486,8 @@ def _rebuild_all_rels_parallel(self, clean_first: bool = True) -> Dict[str, int] for r in existing_rels_obj.relationship if r.type_value not in ( - EPCRelsRelationshipType.SOURCE_OBJECT.get_type(), - EPCRelsRelationshipType.DESTINATION_OBJECT.get_type(), + str(EPCRelsRelationshipType.SOURCE_OBJECT), + str(EPCRelsRelationshipType.DESTINATION_OBJECT), ) ] if preserved_rels: @@ -3191,19 +2498,48 @@ def _rebuild_all_rels_parallel(self, clean_first: bool = True) -> Dict[str, int] except Exception as e: logging.debug(f"Could not preserve existing rels from {filename}: {e}") + # update core_prop_rels with extended props if needed + new_core_prop_rels = Relationships( + relationship=[ + Relationship( + target=e_path, + type_value=str(EPCRelsRelationshipType.EXTENDED_CORE_PROPERTIES), + ) + for e_path in core_prop_extended_files + ] + ) + if core_prop_rels is None: + core_prop_rels = new_core_prop_rels + else: + for new_rel in new_core_prop_rels.relationship: + found = False + for existing_rel in core_prop_rels.relationship: + if existing_rel.target == new_rel.target and existing_rel.type_value == new_rel.type_value: + found = True + break + + if not found: + core_prop_rels.relationship.append(new_rel) + + rels_files[gen_core_props_rels_path()] = core_prop_rels + print(f"Coreprops : {core_prop_rels}") + # ============================================================================ # PHASE 5: SEQUENTIAL - Write all relationships to ZIP file # ============================================================================ + # ZIP file writing must be sequential (file format limitation) with tempfile.NamedTemporaryFile(delete=False, suffix=".epc") as temp_file: temp_path = temp_file.name try: - with self._get_zip_file() as source_zip: + with self._zip_accessor.get_zip_file() as source_zip: with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as target_zip: # Copy all non-.rels 
files for item in source_zip.infolist(): - if not (item.filename.endswith(".rels") and clean_first): + if not (item.filename.endswith(".rels") and clean_first) and ( + c_types is None or item.filename != get_epc_content_type_path() + ): data = source_zip.read(item.filename) target_zip.writestr(item, data) @@ -3212,9 +2548,14 @@ def _rebuild_all_rels_parallel(self, clean_first: bool = True) -> Dict[str, int] rels_xml = serialize_xml(rels_obj) target_zip.writestr(rels_path, rels_xml) + if c_types is not None: + # writing the new new generated [Content_Types].xml with the new media files if any + c_types_xml = serialize_xml(c_types) + target_zip.writestr(get_epc_content_type_path(), c_types_xml) + # Replace original file shutil.move(temp_path, self.epc_file_path) - self._reopen_persistent_zip() + self._zip_accessor.reopen_persistent_zip() execution_time = time.time() - start_time stats["execution_time"] = execution_time @@ -3234,70 +2575,17 @@ def _rebuild_all_rels_parallel(self, clean_first: bool = True) -> Dict[str, int] os.unlink(temp_path) raise RuntimeError(f"Failed to rebuild .rels files (parallel): {e}") - def __repr__(self) -> str: - """String representation.""" - return ( - f"EpcStreamReader(path='{self.epc_file_path}', " - f"objects={len(self._metadata)}, " - f"cached={len(self._object_cache)}, " - f"cache_hit_rate={self.stats.cache_hit_rate:.1f}%)" - ) - - def dumps_epc_content_and_files_lists(self): - """Dump EPC content and files lists for debugging.""" - content_list = [] - file_list = [] - - with self._get_zip_file() as zf: - file_list = zf.namelist() - - for item in zf.infolist(): - content_list.append(f"{item.filename} - {item.file_size} bytes") - - return { - "content_list": sorted(content_list), - "file_list": sorted(file_list), - } - - -# Utility functions for backward compatibility - - -def read_epc_stream(epc_file_path: Union[str, Path], **kwargs) -> EpcStreamReader: - """ - Factory function to create EpcStreamReader instance. 
- - Args: - epc_file_path: Path to EPC file - **kwargs: Additional arguments for EpcStreamReader - - Returns: - EpcStreamReader instance - """ - return EpcStreamReader(epc_file_path, **kwargs) - - -def convert_to_streaming_epc(epc: Epc, output_path: Optional[Union[str, Path]] = None) -> EpcStreamReader: - """ - Convert standard Epc to streaming version. - - Args: - epc: Standard Epc instance - output_path: Optional path to save EPC file - - Returns: - EpcStreamReader instance - """ - if output_path is None and epc.epc_file_path: - output_path = epc.epc_file_path - elif output_path is None: - raise ValueError("Output path must be provided if EPC doesn't have a file path") - - # Export EPC to file if needed - if not Path(output_path).exists(): - epc.export_file(str(output_path)) + # ================================================================================= + # Retro compatibility aliases (to avoid breaking changes in tests and example code) + # ================================================================================= + def remove_object(self, identifier: Union[str, Uri, Any]) -> bool: + """Alias for delete_object for backward compatibility.""" + return self.delete_object(identifier) - return EpcStreamReader(output_path) + def update_object(self, obj: Any) -> Optional[str]: + """Alias for put_object for backward compatibility.""" + return self.put_object(obj) - -__all__ = ["EpcStreamReader", "EpcObjectMetadata", "EpcStreamingStats", "read_epc_stream", "convert_to_streaming_epc"] + def get_object_by_identifier(self, identifier: Union[str, Uri]) -> Optional[Any]: + """Alias for get_object for backward compatibility.""" + return self.get_object(identifier) diff --git a/energyml-utils/src/energyml/utils/epc_utils.py b/energyml-utils/src/energyml/utils/epc_utils.py new file mode 100644 index 0000000..1a0a90b --- /dev/null +++ b/energyml-utils/src/energyml/utils/epc_utils.py @@ -0,0 +1,909 @@ +# Copyright (c) 2023-2024 Geosiris. 
+# SPDX-License-Identifier: Apache-2.0 + + +from io import BytesIO +import json +import logging +import os +import os +from typing import Optional, Set, Tuple, Union, Any, List, Dict, Callable +from pathlib import Path +import zipfile + +from energyml.opc.opc import ( + CoreProperties, + Relationship, + Relationships, + TargetMode, + Created, + Creator, + Identifier, + Types, + Default, + Override, +) + +from energyml.utils.exception import NotEnoughInformationError + +from energyml.utils.constants import ( + CORE_PROPERTIES_FOLDER_NAME, + EPCRelsRelationshipType, + EpcExportVersion, + RELS_FOLDER_NAME, + epoch, + epoch_to_date, + extract_uuid_from_string, + file_extension_to_mime_type, + gen_uuid, + MimeType, + OptimizedRegex, + split_identifier, + content_type_to_qualified_type, + qualified_type_to_content_type, + get_property_kind_dict_path_as_dict, +) +from energyml.utils.introspection import ( + get_direct_dor_list, + get_obj_uri, + get_dor_obj_info, + get_object_type_for_file_path_from_class, + is_dor, + get_class_pkg_version, + get_obj_version, + get_obj_uuid, + get_obj_identifier, + get_object_attribute, + search_attribute_matching_type, + get_class_from_qualified_type, + set_attribute_from_path, + set_attribute_value, + get_obj_attribute_class, + copy_attributes, + get_content_type_from_class, + get_qualified_type_from_class, +) +from energyml.utils.manager import get_class_pkg +from energyml.utils.serialization import read_energyml_xml_str, serialize_xml, read_energyml_json_str +from energyml.utils.uri import Uri, parse_uri +from energyml.utils.storage_interface import ResourceMetadata + +# ____ ___ ________ __ +# / __ \/ |/_ __/ / / / +# / /_/ / /| | / / / /_/ / +# / ____/ ___ |/ / / __ / +# /_/ /_/ |_/_/ /_/ /_/ + +EXPANDED_EXPORT_FOLDER_PREFIX = "namespace_" +PATH_VERSION_PREFIX = "version_" + + +def gen_core_props_rels_path() -> str: + """ + Generate a path to store the core properties rels file into an epc file + :return: + """ + core_path = 
def is_core_prop_or_extension_path(path: Union[str, Path]) -> bool:
    """
    Check if the given path belongs to the core properties area of an epc file.

    The path matches when it is exactly the core properties file, exactly its rels file,
    or when it is located under the core properties folder (with or without a leading "/").
    :param path: the in-EPC path to test
    :return: True if the path is the core properties file, its rels file or a file of that folder
    """
    posix_path = (path if isinstance(path, Path) else Path(path)).as_posix()
    return (
        posix_path == gen_core_props_path()
        or posix_path == gen_core_props_rels_path()
        or posix_path.startswith((f"/{CORE_PROPERTIES_FOLDER_NAME}/", f"{CORE_PROPERTIES_FOLDER_NAME}/"))
    )


def gen_core_props_path(
    export_version: EpcExportVersion = EpcExportVersion.CLASSIC,
) -> str:
    """
    Generate a path to store the core properties file into an epc file.
    :param export_version: accepted for signature symmetry with the other path generators;
        the returned path is currently the same for every export version
    :return: the in-EPC path of the core properties file
    """
    return f"{CORE_PROPERTIES_FOLDER_NAME}/core.xml"


def gen_energyml_object_path(
    energyml_object: Union[str, Uri, Any],
    export_version: EpcExportVersion = EpcExportVersion.CLASSIC,
) -> str:
    """
    Generate a path to store the :param:`energyml_object` into an epc file (depending on the :param:`export_version`)
    :param energyml_object: can be either an EnergyML object, a string containing the XML representation of an
        EnergyML object, a string containing the URI of an EnergyML object ("eml:///..."), a Uri object, a DOR,
        a CoreProperties object or a Types object.
    :param export_version: the version of the EPC export to use (classic or expanded)
    :return: the in-EPC path of the object file
    :raises ValueError: if the uuid, the type, the package or the package version cannot be determined
    """
    if isinstance(energyml_object, str):
        # Strip BEFORE testing the scheme: a URI surrounded by whitespace must not be parsed as XML.
        stripped = energyml_object.strip()
        if stripped.startswith("eml:///"):
            energyml_object = parse_uri(stripped)
        else:
            energyml_object = read_energyml_xml_str(energyml_object)

    if isinstance(energyml_object, Uri):
        obj_type = energyml_object.object_type
        uuid = energyml_object.uuid
        pkg = energyml_object.domain
        pkg_version = energyml_object.domain_version
        object_version = energyml_object.version
    elif is_dor(energyml_object):
        uuid, pkg, pkg_version, obj_cls, object_version = get_dor_obj_info(energyml_object)
        obj_type = get_object_type_for_file_path_from_class(obj_cls)
    elif isinstance(energyml_object, CoreProperties):
        return gen_core_props_path(export_version)
    elif isinstance(energyml_object, Types):
        return get_epc_content_type_path()
    else:
        obj_type = get_object_type_for_file_path_from_class(energyml_object.__class__)
        pkg = get_class_pkg(energyml_object)
        pkg_version = get_class_pkg_version(energyml_object)
        object_version = get_obj_version(energyml_object)
        uuid = get_obj_uuid(energyml_object)

    if not uuid:
        raise ValueError(f"The object must have a valid uuid to be stored in an epc file - {energyml_object}")
    if not obj_type:
        raise ValueError(f"The object must have a valid type to be stored in an epc file - {energyml_object}")
    if not pkg:
        raise ValueError(f"The object must have a valid package to be stored in an epc file - {energyml_object}")
    if not pkg_version:
        raise ValueError(
            f"The object must have a valid package version to be stored in an epc file - {energyml_object}"
        )

    if export_version == EpcExportVersion.EXPANDED:
        # TODO: verify if we need to add a "/" at the begining of the path or not
        # A versioned object is stored in an extra "version_<id>/" folder.
        version_folder = f"{PATH_VERSION_PREFIX}{object_version}/" if object_version else ""
        namespace_folder = f"{EXPANDED_EXPORT_FOLDER_PREFIX}{pkg}{pkg_version.replace('.', '')}"
        return f"{namespace_folder}/{version_folder}{obj_type}_{uuid}.xml"
    else:
        return obj_type + "_" + uuid + ".xml"


def gen_rels_path(
    energyml_object: Any,
    export_version: EpcExportVersion = EpcExportVersion.CLASSIC,
) -> str:
    """
    Generate a path to store the :param:`energyml_object` rels file into an epc file
    (depending on the :param:`export_version`)
    :param energyml_object: a CoreProperties, a Types, or anything accepted by gen_energyml_object_path
    :param export_version: the version of the EPC export to use (classic or expanded)
    :return: the in-EPC path of the rels file
    """
    if isinstance(energyml_object, CoreProperties):
        return gen_core_props_rels_path()
    elif isinstance(energyml_object, Types):
        return get_epc_content_type_rels_path()
    else:
        obj_path = Path(gen_energyml_object_path(energyml_object, export_version))
        return gen_rels_path_from_obj_path(obj_path=obj_path)


def gen_rels_path_from_obj_path(obj_path: Union[str, Path]) -> str:
    """
    Generate a path to store the rels file into an epc file from the object path
    :param obj_path: the path of the object file (e.g. "namespace_pkg1.0/version_1.0/ObjType_uuid.xml" or "ObjType_uuid.xml")
    :return: the path to store the rels file (e.g. "namespace_pkg1.0/version_1.0/_rels/ObjType_uuid.xml.rels" or "_rels/ObjType_uuid.xml.rels")
    :raises ValueError: if the object path is already located in a rels folder
    """
    _obj_path = Path(obj_path) if not isinstance(obj_path, Path) else obj_path
    if _obj_path.parent.name == RELS_FOLDER_NAME:
        raise ValueError(f"The object path cannot be in the '{RELS_FOLDER_NAME}' folder")
    return (_obj_path.parent / RELS_FOLDER_NAME / f"{_obj_path.name}.rels").as_posix()


def get_epc_content_type_path() -> str:
    """
    Generate a path to store the "[Content_Types].xml" file into an epc file
    :return: the in-EPC path of the content types file
    """
    return "[Content_Types].xml"


def get_epc_content_type_rels_path() -> str:
    """
    Generate a path to store the rels file for "[Content_Types].xml" into an epc file
    :return: the in-EPC path of the root rels file
    """
    return f"{RELS_FOLDER_NAME}/.rels"
def in_epc_file_path_to_mime_type(path: str) -> Optional[str]:
    """
    Infer the MIME type of a file from its path inside an EPC.
    :param path: the in-EPC path of the file
    :return: the MIME type, or None if the path is empty
    """
    if not path:
        return None

    # Check for specific EPC file types first.
    # The dot is part of the suffix so that names merely ending in "rels" are not misclassified.
    if path.endswith(".rels"):
        return MimeType.RELS.value
    elif path in (gen_core_props_path(), f"/{gen_core_props_path()}"):
        return MimeType.CORE_PROPERTIES.value
    elif path.startswith((f"/{CORE_PROPERTIES_FOLDER_NAME}/", f"{CORE_PROPERTIES_FOLDER_NAME}/")):
        return MimeType.EXTENDED_CORE_PROPERTIES.value

    # Fallback to inferring from file extension
    ext = path.split(".")[-1]
    return file_extension_to_mime_type(ext)


def get_file_folder(path: Optional[Union[str, Path]]) -> Optional[str]:
    """
    Get the folder path from a given file path.
    :param path: a file path, or None
    :return: the parent folder in posix form, "" when the file has no parent folder, None if path is None
    """
    if path is None:
        return None
    _path = Path(path) if not isinstance(path, Path) else path
    return _path.parent.as_posix() if _path.parent != Path(".") else ""


def make_path_relative_to_other_file(path: str, ref_path: Optional[Union[str, Path]]) -> str:
    """
    Resolve a relative path against the folder that contains another file.
    :param path: the path to resolve; an absolute path is returned unchanged
    :param ref_path: the reference file (e.g. the epc file); when it is None or not a str/Path,
        :param:`path` is returned unchanged
    :return: the normalized path of :param:`path` inside the folder of :param:`ref_path`
    """
    # Every "nothing to resolve" case returns the input, so the function never falls through to None.
    if not isinstance(ref_path, (str, Path)) or os.path.isabs(path):
        return path
    ref_folder = get_file_folder(ref_path) or ""
    return os.path.normpath(os.path.join(ref_folder, path))


def make_path_relative_to_filepath_list(paths: List[str], ref_path: Optional[Union[str, Path]] = None) -> List[str]:
    """
    Apply make_path_relative_to_other_file to every path of a list.
    :param paths: the paths to resolve
    :param ref_path: the reference file
    :return: the resolved paths, in the same order
    """
    return [make_path_relative_to_other_file(path, ref_path) for path in paths]
def create_external_relationship(path: str, _id: Optional[str] = None) -> Relationship:
    """
    Create a Relationship object to link an external resource.
    :param path: the target of the relationship (path of the external resource)
    :param _id: the relationship id; a random "_ext_<uuid>" id is generated when omitted or empty
    :return: an EXTERNAL_RESOURCE relationship with an external target mode
    """
    return Relationship(
        target=path,
        type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE),
        target_mode=TargetMode.EXTERNAL,
        id=_id or f"_ext_{gen_uuid()}",
    )


def create_h5_external_relationship(h5_path: str, current_idx: int = 0) -> Relationship:
    """
    Create a Relationship object to link an external HDF5 file.
    :param h5_path: the target of the relationship (path of the HDF5 file)
    :param current_idx: index of the HDF5 file among the external files of the object; the id is
        "Hdf5File" for an index <= 0 and "Hdf5File<current_idx + 1>" otherwise
    :return: an EXTERNAL_RESOURCE relationship with an external target mode
    """
    # Delegates to the generic builder so that both helpers spell the relationship type the same way.
    suffix = current_idx + 1 if current_idx > 0 else ""
    return create_external_relationship(path=h5_path, _id=f"Hdf5File{suffix}")


def relationships_equal(rel1: Relationship, rel2: Relationship) -> bool:
    """
    Compare two Relationship objects for equality based on their target and type_value.

    Two relationships are considered equal if they have:
    - The same target (destination path)
    - The same type_value (relationship type)

    Note: The id field is NOT compared as it's typically auto-generated and
    doesn't affect the semantic meaning of the relationship.

    :param rel1: First Relationship object
    :param rel2: Second Relationship object
    :return: True if relationships are semantically equal, False otherwise

    Example:
        >>> rel1 = Relationship(target="obj.xml", type_value="destinationObject", id="_1")
        >>> rel2 = Relationship(target="obj.xml", type_value="destinationObject", id="_2")
        >>> relationships_equal(rel1, rel2)  # True (different IDs don't matter)
    """
    return rel1.target == rel2.target and rel1.type_value == rel2.type_value


def create_default_core_properties(creator: Optional[str] = None) -> CoreProperties:
    """
    Create default Core Properties object.
    :param creator: the creator name; a default naming this module is used when omitted or empty
    :return: a CoreProperties with the current date, a random "urn:uuid:" identifier and version "1.0"
    """
    return CoreProperties(
        created=Created(any_element=epoch_to_date(epoch())),
        creator=Creator(any_element=creator or "energyml-utils python module (Geosiris)"),
        identifier=Identifier(any_element=f"urn:uuid:{gen_uuid()}"),
        version="1.0",
    )


def create_default_types() -> Types:
    """
    Create default Types object.
    :return: a Types declaring the "rels" default extension and the core properties override
    """
    return Types(
        default=[Default(extension="rels", content_type=str(MimeType.RELS))],
        override=[
            Override(content_type=str(MimeType.CORE_PROPERTIES), part_name=gen_core_props_path()),
        ],
    )


def match_external_proxy_type(obj_or_path_or_type: Union[str, Uri, Any]) -> bool:
    """
    Check if the given object, path or type string matches the pattern of an external proxy reference.
    :param obj_or_path_or_type: a string (class name, file path, content-type...), a Uri or any object
    :return: True if the (lower-cased) text contains both "external" and "reference"
    """
    if isinstance(obj_or_path_or_type, str):
        # for a classname, a filepath or a content-type, we check if it contains "external" and "reference"
        lowered = obj_or_path_or_type.lower()
        return "external" in lowered and "reference" in lowered
    elif isinstance(obj_or_path_or_type, Uri):
        return match_external_proxy_type(obj_or_path_or_type.get_qualified_type())
    else:
        # Any other object is matched on the textual form of its class.
        return match_external_proxy_type(str(type(obj_or_path_or_type)))
def valdiate_basic_epc_structure(epc: Union[str, Path, zipfile.ZipFile, BytesIO]) -> bool:
    """
    Check that an EPC contains the mandatory files: [Content_Types].xml, _rels/.rels,
    the core properties file and the core properties rels file.
    :param epc: a path to the EPC file, a BytesIO holding it, or an already opened ZipFile
        (a ZipFile given by the caller is left open)
    :return: True if every mandatory file is present, False otherwise (the missing files are logged)
    :raises ValueError: if :param:`epc` is not a str, a Path, a ZipFile or a BytesIO
    """
    if isinstance(epc, (str, Path, BytesIO)):
        epc_io = zipfile.ZipFile(epc, "r")
        should_close = True
    elif isinstance(epc, zipfile.ZipFile):
        epc_io = epc
        should_close = False
    else:
        raise ValueError("The epc parameter must be a string, a Path, a ZipFile or a BytesIO object")

    try:
        # Check if the EPC file contains the required files: [Content_Types].xml, _rels/.rels and docProps/core.xml
        required_files = {
            get_epc_content_type_path(),
            gen_core_props_rels_path(),
            get_epc_content_type_rels_path(),
            gen_core_props_path(),
        }
        missing_files = required_files - set(epc_io.namelist())
    finally:
        # Only close what this function opened.
        if should_close:
            epc_io.close()

    if missing_files:
        logging.warning(f"The EPC file is missing the following required files: {missing_files}")
        return False
    return True


# Correctly spelled alias; the historical (misspelled) name is kept for backward compatibility.
validate_basic_epc_structure = valdiate_basic_epc_structure


def create_mandatory_structure_epc(epc: Union[str, Path, zipfile.ZipFile, BytesIO]) -> None:
    """
    Add to an EPC the mandatory files that it does not contain yet: [Content_Types].xml, _rels/.rels,
    the core properties file and the core properties rels file. Existing files are never overwritten.
    :param epc: a path to the EPC file, a BytesIO holding it, or a writable ZipFile
        (a ZipFile given by the caller is left open)
    :raises ValueError: if :param:`epc` is a read-only ZipFile or has an unsupported type
    """
    if isinstance(epc, (str, Path, BytesIO)):
        epc_io = zipfile.ZipFile(epc, "a", zipfile.ZIP_DEFLATED)
        should_close = True
    elif isinstance(epc, zipfile.ZipFile):
        if epc.mode == "r":
            raise ValueError("Cannot write to a read-only ZipFile. Open it in 'a' or 'w' mode.")
        epc_io = epc
        should_close = False
    else:
        raise ValueError("The epc parameter must be a string, a Path, a ZipFile or a BytesIO object")

    try:
        core_props = create_default_core_properties()
        empty_epc_structure = {
            # Default content types (rels extension + core properties override) instead of an empty <Types/>.
            get_epc_content_type_path(): serialize_xml(create_default_types()),
            gen_core_props_rels_path(): serialize_xml(Relationships()),
            gen_core_props_path(): serialize_xml(core_props),
            get_epc_content_type_rels_path(): serialize_xml(
                Relationships(
                    relationship=[
                        Relationship(
                            id="CoreProperties",
                            type_value=str(EPCRelsRelationshipType.CORE_PROPERTIES),
                            target=gen_core_props_path(),
                        )
                    ]
                )
            ),
        }

        # The generated paths are distinct, so one snapshot of the archive listing is enough.
        existing_files = set(epc_io.namelist())
        for path, content in empty_epc_structure.items():
            if path not in existing_files:
                epc_io.writestr(path, content)
    finally:
        # Only close what this function opened.
        if should_close:
            epc_io.close()
# PropertyKind list: a cache of the pre-defined property kinds, indexed by uuid.
# It is filled lazily by update_prop_kind_dict_cache().
__CACHE_PROP_KIND_DICT__: Dict[str, Any] = {}


def update_prop_kind_dict_cache():
    """Update the property kind dictionary cache from the standard property kinds file."""
    prop_kind = get_property_kind_dict_path_as_dict()

    for prop in prop_kind["PropertyKind"]:
        __CACHE_PROP_KIND_DICT__[prop["Uuid"]] = read_energyml_json_str(json.dumps(prop))[0]


def get_property_kind_by_uuid(uuid: str) -> Optional[Any]:
    """
    Get a property kind by its uuid.
    :param uuid: the uuid of the property kind
    :return: the property kind or None if not found
    """
    if not __CACHE_PROP_KIND_DICT__:
        # The cache is empty: load the standard property kinds before the lookup.
        try:
            update_prop_kind_dict_cache()
        except FileNotFoundError as e:
            logging.error(f"Failed to parse propertykind dict {e}")
    return __CACHE_PROP_KIND_DICT__.get(uuid)


def get_property_kind_and_parents(uuids: list) -> Dict[str, Any]:
    """Get PropertyKind objects and their parents from a list of UUIDs.

    Args:
        uuids (list): List of PropertyKind UUIDs.

    Returns:
        Dict[str, Any]: A dictionary mapping UUIDs to PropertyKind objects and their parents.
        UUIDs that are not found are logged and skipped.
    """
    return _collect_property_kinds(uuids, frozenset())


def _collect_property_kinds(uuids: list, descendants: frozenset) -> Dict[str, Any]:
    """Resolve *uuids* and their ancestors; *descendants* holds the uuids of the current parent chain."""
    dict_props: Dict[str, Any] = {}

    for prop_uuid in uuids:
        prop = get_property_kind_by_uuid(prop_uuid)
        if prop is None:
            logging.warning(f"PropertyKind with UUID {prop_uuid} not found.")
            continue

        dict_props[prop_uuid] = prop
        parent_uuid = get_object_attribute(prop, "parent.uuid")
        if parent_uuid is None or parent_uuid in dict_props:
            continue
        if parent_uuid in descendants:
            # The parent chain loops back on itself: stop instead of recursing forever.
            continue
        # Same merge as before: ancestors first, already collected entries win on key conflict.
        dict_props = _collect_property_kinds([parent_uuid], descendants | {prop_uuid}) | dict_props
    return dict_props
"eml23.DataObjectReference" is the default value) + :return: a DOR object + """ + if obj_or_identifier is None: + return None + + cls = get_class_from_qualified_type(dor_qualified_type) + dor = cls() + + # Variables to collect data from different sources + dor_uuid = None + dor_title = None + dor_version = None + dor_qualified_type_str = None + dor_content_type_str = None + dor_energistics_uri = None + + if isinstance(obj_or_identifier, str) or isinstance(obj_or_identifier, Uri): # is an identifier or uri + parsed_uri = obj_or_identifier if isinstance(obj_or_identifier, Uri) else parse_uri(obj_or_identifier) + if parsed_uri is not None: + # From URI + logging.debug(f"====> parsed uri {parsed_uri} : uuid is {parsed_uri.uuid}") + dor_uuid = parsed_uri.uuid + dor_version = parsed_uri.version + dor_qualified_type_str = parsed_uri.get_qualified_type() + dor_content_type_str = qualified_type_to_content_type(parsed_uri.get_qualified_type()) + dor_energistics_uri = str(obj_or_identifier) + elif isinstance(obj_or_identifier, str): # identifier + if len(__CACHE_PROP_KIND_DICT__) == 0: + try: + update_prop_kind_dict_cache() + except FileNotFoundError as e: + logging.error(f"Failed to parse propertykind dict {e}") + try: + uuid, version = split_identifier(obj_or_identifier) + if uuid in __CACHE_PROP_KIND_DICT__: + return as_dor(__CACHE_PROP_KIND_DICT__[uuid], dor_qualified_type) + else: + dor_uuid = uuid + dor_version = version + except AttributeError: + logging.error(f"Failed to parse identifier {obj_or_identifier}. 
DOR will be empty") + else: + if is_dor(obj_or_identifier): + # DOR conversion + if hasattr(obj_or_identifier, "qualified_type"): + dor_qualified_type_str = get_object_attribute(obj_or_identifier, "qualified_type") + elif hasattr(obj_or_identifier, "content_type"): + dor_qualified_type_str = content_type_to_qualified_type( + get_object_attribute(obj_or_identifier, "content_type") + ) + + if hasattr(obj_or_identifier, "qualified_type"): + dor_content_type_str = qualified_type_to_content_type( + get_object_attribute(obj_or_identifier, "qualified_type") + ) + elif hasattr(obj_or_identifier, "content_type"): + dor_content_type_str = get_object_attribute(obj_or_identifier, "content_type") + + dor_title = get_object_attribute(obj_or_identifier, "Title") + dor_uuid = get_obj_uuid(obj_or_identifier) + dor_version = get_obj_version(obj_or_identifier) + else: + # For etp Resource object + if hasattr(obj_or_identifier, "uri"): + dor = as_dor(obj_or_identifier.uri, dor_qualified_type) + if hasattr(obj_or_identifier, "name") and hasattr(dor, "title"): + setattr(dor, "title", getattr(obj_or_identifier, "name")) + return dor + else: + # Regular EnergyML object + try: + dor_qualified_type_str = get_qualified_type_from_class(obj_or_identifier) + except Exception as e: + logging.error(f"Failed to set qualified_type for DOR {e}") + + try: + dor_content_type_str = get_content_type_from_class(obj_or_identifier) + except Exception as e: + logging.error(f"Failed to set content_type for DOR {e}") + + dor_title = get_object_attribute(obj_or_identifier, "Citation.Title") + dor_uuid = get_obj_uuid(obj_or_identifier) + dor_version = get_obj_version(obj_or_identifier) + + # Unified attribute setting section - applies collected data to DOR + if dor_qualified_type_str and hasattr(dor, "qualified_type"): + dor.qualified_type = dor_qualified_type_str + + if dor_content_type_str and hasattr(dor, "content_type"): + dor.content_type = dor_content_type_str + + if dor_title and hasattr(dor, "title"): + 
setattr(dor, "title", dor_title) + + if dor_uuid: + if hasattr(dor, "uuid"): + setattr(dor, "uuid", dor_uuid) + if hasattr(dor, "uid"): + setattr(dor, "uid", dor_uuid) + + if dor_version: + if hasattr(dor, "object_version"): + setattr(dor, "object_version", dor_version) + if hasattr(dor, "version_string"): + setattr(dor, "version_string", dor_version) + + if dor_energistics_uri and hasattr(dor, "energistics_uri"): + setattr(dor, "energistics_uri", dor_energistics_uri) + + return dor + + +# ____ __ _ __ ______ __ _ +# / __ \/ /_ (_)__ _____/ /_ / ____/_______ ____ _/ /_(_)___ ____ +# / / / / __ \ / / _ \/ ___/ __/ / / / ___/ _ \/ __ `/ __/ / __ \/ __ \ +# / /_/ / /_/ // / __/ /__/ /_ / /___/ / / __/ /_/ / /_/ / /_/ / / / / +# \____/_.___// /\___/\___/\__/ \____/_/ \___/\__,_/\__/_/\____/_/ /_/ +# /___/ + + +def create_energyml_object( + content_or_qualified_type: str, + citation: Optional[Any] = None, + uuid: Optional[str] = None, +): + """ + Create an energyml object instance depending on the content-type or qualified-type given in parameter. + The SchemaVersion is automatically assigned. + If no citation is given default one will be used. + If no uuid is given, a random uuid will be used. 
+ :param content_or_qualified_type: + :param citation: + :param uuid: + :return: + """ + if citation is None: + citation = { + "title": "New_Object", + "Creation": epoch_to_date(epoch()), + "LastUpdate": epoch_to_date(epoch()), + "Format": "energyml-utils", + "Originator": "energyml-utils python module", + } + cls = get_class_from_qualified_type(content_or_qualified_type) + obj = cls() + cit = get_obj_attribute_class(cls, "citation")() + copy_attributes( + obj_in=citation, + obj_out=cit, + only_existing_attributes=True, + ignore_case=True, + ) + set_attribute_from_path(obj, "citation", cit) + set_attribute_value(obj, "uuid", uuid or gen_uuid()) + set_attribute_value(obj, "SchemaVersion", get_class_pkg_version(obj)) + + return obj + + +def create_external_part_reference( + eml_version: str, + h5_file_path: str, + citation: Optional[Any] = None, + uuid: Optional[str] = None, +): + """ + Create an EpcExternalPartReference depending on the energyml version (should be ["2.0", "2.1", "2.2"]). + The MimeType, ExistenceKind and Filename will be automatically filled. 
def get_reverse_dor_list(obj_list: List[Any], key_func: Callable = get_obj_identifier) -> Dict[str, List[Any]]:
    """
    Compute a dict with 'OBJ_UUID.OBJ_VERSION' as key, and the list of objects that reference it as value.
    If the object version is None, key is 'OBJ_UUID.'
    An object is listed once per DataObjectReference it holds towards the same target.
    :param obj_list: the objects in which the DataObjectReference are searched
    :param key_func: a callable to create the key of the dict from the DOR instance
    :return: a dict mapping each referenced key to the objects of :param:`obj_list` that reference it
    """
    rels: Dict[str, List[Any]] = {}
    for obj in obj_list:
        for dor in search_attribute_matching_type(obj, "DataObjectReference", return_self=False):
            # Append in place: no list is rebuilt on each new reference.
            rels.setdefault(key_func(dor), []).append(obj)
    return rels


def get_dor_uris_from_obj(obj: Any) -> Set[Uri]:
    """
    Get uri of all Data Object References (DORs) directly referenced by the given object.
    Failures are logged as warnings and never raised: a DOR whose uri cannot be extracted is skipped.
    :param obj: the object whose direct DORs are read
    :return: the set of object uris found
    """
    uri_set: Set[Uri] = set()
    try:
        dor_list = get_direct_dor_list(obj)
        for dor in dor_list:
            try:
                uri = get_obj_uri(dor)
                if uri and uri.is_object_uri():
                    uri_set.add(uri)
            except Exception as e:
                logging.warning(f"Failed to extract uri from DOR: {e}")
    except Exception as e:
        logging.warning(f"Failed to get DOR list from object: {e}")
    return uri_set
+ + :return: A tuple containing: + - A set of URIs for all DORs found (internal references to other EnergyML objects) + - A set of tuples for external references, where each tuple contains (external URI, MIME type) + + :raises: Does not raise exceptions. Logs warnings for any extraction failures and continues processing. + + Example: + >>> from energyml.resqml.v2_2.resqmlv2 import TriangulatedSetRepresentation + >>> trset = load_triangulated_set() # Has DOR to interpretation + external HDF5 arrays + >>> dor_uris, external_uris = get_dor_or_external_uris_from_obj(trset) + >>> for uri in dor_uris: + ... print(f"Internal reference: {uri}") + >>> for ext_uri, mime_type in external_uris: + ... print(f"External file: {ext_uri} (type: {mime_type})") + + ... print(f"Internal reference: {uri}") + ... else: + ... print(f"External file: {uri[0]} (type: {uri[1]})") + Internal reference: eml:///resqml22.HorizonInterpretation(abc-123-def) + External file: my_hdf5_file.h5 (type: application/x-hdf5) + + Note: + - The search pattern matches both 'DataObjectReference' and 'ExternalDataArrayPart' types + - DORs are identified by having 'uid' or 'uuid' attributes + - External references are identified by having 'uri' and optionally 'mime_type' attributes + - For complete relationship analysis including reverse relationships, use EpcRelsCache instead + + See Also: + - `get_dor_uris_from_obj()`: Similar function but only returns internal DOR references + - `get_direct_dor_list()`: Returns the actual DOR objects rather than their URIs + """ + dor_uris = set() + external_uris = set() + try: + dor_list = search_attribute_matching_type(obj, "DataObjectReference|ExternalDataArrayPart") + for dor_or_ext in dor_list: + if hasattr(dor_or_ext, "uid") or hasattr(dor_or_ext, "uuid"): + # DOR case + try: + uri = get_obj_uri(dor_or_ext) + if uri and uri.is_object_uri(): + dor_uris.add(uri) + except Exception as e: + logging.warning(f"Failed to extract uri from DOR: {e}") + else: + # External 
def get_file_folder_and_name_from_path(path: str) -> Tuple[str, str]:
    """
    Split a '/'-separated path at its last '/'.

    The folder part keeps its trailing '/'; it is an empty string when the path contains no '/'.

    :param path: a path using '/' as separator
    :return: a tuple (FOLDER_PATH, FILE_NAME)
    """
    folder, separator, file_name = path.rpartition("/")
    # rpartition yields ("", "", path) when there is no '/', so the folder collapses to ""
    return folder + separator, file_name
@dataclass
class ValidationResult:
    """Outcome of an EPC validation run.

    Attributes:
        is_valid: Whether the EPC file passed validation.
        errors: Critical problems; recording one marks the result as failed.
        warnings: Non-critical issues that should be reviewed.
        info: Informational messages collected during the validation.
    """

    is_valid: bool = True
    errors: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    info: List[str] = field(default_factory=list)

    def add_error(self, message: str) -> None:
        """Record an error and flag the validation as failed.

        Args:
            message: Error message to add.
        """
        self.is_valid = False
        self.errors.append(message)

    def add_warning(self, message: str) -> None:
        """Record a warning; the validity flag is left untouched.

        Args:
            message: Warning message to add.
        """
        self.warnings.append(message)

    def add_info(self, message: str) -> None:
        """Record an informational message.

        Args:
            message: Info message to add.
        """
        self.info.append(message)

    def __str__(self) -> str:
        """Return the status line followed by one titled section per non-empty message list."""
        status = "PASSED" if self.is_valid else "FAILED"
        report = [f"Validation Result: {status}"]
        sections = (("Errors", self.errors), ("Warnings", self.warnings), ("Info", self.info))
        for title, messages in sections:
            if not messages:
                continue
            report.append(f"\n{title} ({len(messages)}):")
            report.extend(f" - {message}" for message in messages)
        return "\n".join(report)
def read_file(self, path: str) -> bytes:
    """Return the raw content of one member of the EPC archive.

    Args:
        path: Path to file within the archive.

    Returns:
        File contents as bytes.

    Raises:
        ZipIntegrityError: If the ZIP file is not open.
        MissingRequiredFileError: If the file doesn't exist in the archive.
    """
    archive = self._zip_file
    if not archive:
        raise ZipIntegrityError("ZIP file is not open")

    try:
        payload = archive.read(path)
    except KeyError as missing:
        # ZipFile.read signals an unknown member name with KeyError
        raise MissingRequiredFileError(f"File not found in archive: {path}") from missing
    return payload
def parse_core_properties(self, core_props_path: str = "docProps/core.xml") -> Optional[CoreProperties]:
    """Parse core properties XML file.

    The parsed object is cached together with the path it was read from. The
    cache is only served when the same path is requested again, so asking for
    another path reads that other file instead of returning the first result.

    Args:
        core_props_path: Path to core properties file.

    Returns:
        Parsed CoreProperties object or None if file doesn't exist.

    Raises:
        InvalidXmlStructureError: If the file cannot be read or parsed for any
            reason other than being absent from the archive.
    """
    # The path marker is created by this method only; when it is absent (cache
    # filled some other way) the cached object is trusted for any path, as before.
    cached_path = getattr(self, "_core_properties_path", core_props_path)
    if self._core_properties is not None and cached_path == core_props_path:
        return self._core_properties

    try:
        xml_content = self.read_file(core_props_path)
        core_properties = self._xml_parser.from_bytes(xml_content, CoreProperties)
    except MissingRequiredFileError:
        # An absent file is not an error here: the caller may probe several paths
        return None
    except (ParserError, Exception) as e:
        raise InvalidXmlStructureError(
            f"Failed to parse {core_props_path}: {e}",
            details={"file": core_props_path},
        ) from e

    self._core_properties = core_properties
    self._core_properties_path = core_props_path
    return core_properties
def find_all_rels_files(self) -> List[str]:
    """List the archive members whose name ends with ".rels".

    Returns:
        List of paths to .rels files, in archive order.

    Raises:
        ZipIntegrityError: If the ZIP file is not open.
    """
    archive = self._zip_file
    if not archive:
        raise ZipIntegrityError("ZIP file is not open")

    rels_paths = []
    for member_name in archive.namelist():
        if member_name.endswith(".rels"):
            rels_paths.append(member_name)
    return rels_paths
def validate(self) -> ValidationResult:
    """Perform comprehensive EPC validation.

    Every call starts from a fresh ``ValidationResult``: running the validation
    a second time on the same validator neither duplicates the messages of the
    previous run nor inherits its failure flag.

    The checks run in order: ZIP integrity, required files, content types, core
    properties, relationships (only if ``check_relationships`` is set) and naming
    conventions. An ``EpcValidationError`` raised by a check stops the run and is
    recorded as an error; any other exception is recorded as an unexpected error
    and logged with its traceback.

    Returns:
        ValidationResult with validation outcome and any issues found.
    """
    self.result = ValidationResult()

    try:
        with self.parser:
            self._validate_zip_integrity()
            self._validate_required_files()
            self._validate_content_types()
            self._validate_core_properties()

            if self.check_relationships:
                self._validate_relationships()

            self._validate_naming_conventions()

            if self.result.is_valid:
                self.result.add_info("EPC file validation passed successfully")

    except EpcValidationError as e:
        self.result.add_error(str(e))
    except Exception as e:
        self.result.add_error(f"Unexpected error during validation: {e}")
        logger.exception("Unexpected validation error")

    return self.result
def _validate_required_files(self) -> None:
    """Check that every file listed in ``REQUIRED_FILES`` is present in the archive.

    Names are compared case-insensitively. One info message is recorded per
    required file found; the first missing one aborts the check.

    Raises:
        MissingRequiredFileError: If required files are missing.
    """
    present = {name.lower() for name in self.parser.list_files()}

    for expected in self.REQUIRED_FILES:
        if expected.lower() in present:
            self.result.add_info(f"Found required file: {expected}")
            continue
        raise MissingRequiredFileError(
            f"Required file missing: {expected}",
            details={"file": expected},
        )
" + f"Expected: {self.RELS_CONTENT_TYPE_CORRECT}" + ) + break + + if not rels_default: + self.result.add_warning("No default content type defined for .rels files") + + # Validate overrides + if content_types.override: + self.result.add_info(f"Found {len(content_types.override)} content type overrides") + + energyml_objects = 0 + core_props_found = False + + for override in content_types.override: + # Check for core properties + if override.content_type == self.CORE_PROPS_CONTENT_TYPE: + core_props_found = True + self.result.add_info(f"Core properties defined at: {override.part_name}") + + # Check for Energyml objects + elif override.content_type and self.ENERGYML_CONTENT_TYPE_PATTERN.match(override.content_type): + energyml_objects += 1 + + # Validate part name format + if override.part_name and not override.part_name.startswith("/"): + if self.strict: + self.result.add_error(f"Part name must start with '/': {override.part_name}") + else: + self.result.add_warning(f"Part name should start with '/': {override.part_name}") + + self.result.add_info(f"Found {energyml_objects} Energyml objects") + + if not core_props_found: + self.result.add_warning("No core properties override found in content types") + + else: + self.result.add_warning("No content type overrides defined") + + except InvalidXmlStructureError: + raise + except Exception as e: + raise ContentTypeValidationError(f"Content types validation failed: {e}") from e + + def _validate_core_properties(self) -> None: + """Validate core properties file. + + Raises: + CorePropertiesValidationError: If core properties are invalid. 
def _validate_relationships(self) -> None:
    """Validate relationships consistency.

    Every ``.rels`` file of the archive is parsed and each relationship is
    checked for its ``Id``, ``Type`` and ``Target`` attributes. The target of an
    internal relationship must exist in the archive. It is accepted when it is
    found either relative to the package root (with or without a leading '/')
    or, for a target not starting with '/', relative to the folder of the part
    owning the ``.rels`` file.

    External relationships are not looked up in the archive. A relationship is
    considered external when its target mode is "External" or when its target
    starts with "http".

    Raises:
        RelationshipValidationError: If an unexpected error occurs during the check.
    """
    import posixpath  # ZIP member names use '/' separators on every platform

    try:
        rels_files = self.parser.find_all_rels_files()
        self.result.add_info(f"Found {len(rels_files)} relationship files")

        all_files = set(self.parser.list_files())

        for rels_file in rels_files:
            # "<dir>/_rels/<part>.rels" belongs to a part stored in "<dir>":
            # that folder is the base for relative targets.
            rels_dir = posixpath.dirname(rels_file)
            if posixpath.basename(rels_dir) == "_rels":
                source_dir = posixpath.dirname(rels_dir)
            else:
                source_dir = rels_dir

            try:
                relationships = self.parser.parse_relationships(rels_file)

                if not relationships.relationship:
                    self.result.add_warning(f"Empty relationships file: {rels_file}")
                    continue

                for rel in relationships.relationship:
                    # Validate relationship has required attributes
                    if not rel.id or rel.id.strip() == "":
                        self.result.add_error(f"Relationship missing or has empty 'Id' in {rels_file}")

                    if not rel.type_value:
                        self.result.add_error(f"Relationship missing 'Type' in {rels_file}")

                    if not rel.target:
                        self.result.add_error(f"Relationship missing 'Target' in {rels_file}")
                        continue

                    # NOTE(review): 'target_mode' is assumed to be the attribute holding the OPC
                    # TargetMode (enum or plain string) - confirm against energyml.opc.Relationship.
                    # getattr keeps the check harmless if the attribute does not exist.
                    target_mode = getattr(rel, "target_mode", None)
                    mode_name = str(getattr(target_mode, "value", target_mode) or "")
                    if mode_name.lower() == "external" or rel.target.startswith("http"):
                        # External resources (e.g. HDF5 files) live outside the package
                        continue

                    root_relative = rel.target.lstrip("/")
                    candidates = {root_relative, "/" + root_relative}
                    if not rel.target.startswith("/"):
                        candidates.add(posixpath.normpath(posixpath.join(source_dir, rel.target)))

                    if candidates.isdisjoint(all_files):
                        self.result.add_error(f"Relationship target not found: {rel.target} (from {rels_file})")

            except InvalidXmlStructureError as e:
                self.result.add_error(f"Invalid relationships file {rels_file}: {e}")

    except Exception as e:
        raise RelationshipValidationError(f"Relationships validation failed: {e}") from e
+ """ + try: + files = self.parser.list_files() + + # Check for invalid characters in file names + invalid_chars = ["\\", ":", "*", "?", '"', "<", ">", "|"] + + for file_path in files: + # Check for invalid characters + for char in invalid_chars: + if char in file_path: + self.result.add_error(f"Invalid character '{char}' in file path: {file_path}") + + # Check _rels folder naming + if "_rels" in file_path and not file_path.startswith("_rels/"): + parts = file_path.split("/") + valid_rels = any(i > 0 and parts[i] == "_rels" and parts[i - 1] != "" for i in range(len(parts))) + if not valid_rels and file_path != "_rels/.rels": + self.result.add_warning(f"Unusual _rels folder location: {file_path}") + + # Check .rels file naming + if file_path.endswith(".rels"): + if not file_path.endswith("/.rels"): + # Should be in _rels folder with corresponding source file + if "/_rels/" not in file_path: + self.result.add_warning(f"Relationship file not in _rels folder: {file_path}") + + except Exception as e: + raise NamingConventionError(f"Naming convention validation failed: {e}") from e + + +def validate_epc_file( + epc_path: Union[str, Path, BytesIO], + strict: bool = True, + check_relationships: bool = True, +) -> ValidationResult: + """Convenience function to validate an EPC file. + + Args: + epc_path: Path to EPC file or BytesIO object. + strict: If True, enforce strict validation rules. + check_relationships: If True, validate relationship consistency. + + Returns: + ValidationResult with validation outcome. 
class EpcValidationError(Exception):
    """Base exception for EPC validation errors."""

    def __init__(self, message: str, details: Optional[dict] = None):
        """Store the message and the optional context of the failure.

        Args:
            message: Error message describing the validation failure.
            details: Optional dictionary with additional error context;
                a missing or empty value is stored as an empty dict.
        """
        self.message = message
        self.details = details if details else {}
        super().__init__(message)

    def __str__(self) -> str:
        """Return the message, followed by the details as "(key=value, ...)" when there are any."""
        if not self.details:
            return self.message
        pairs = [f"{key}={value}" for key, value in self.details.items()]
        return f"{self.message} ({', '.join(pairs)})"
@lru_cache(maxsize=None)
def _is_abstract_cls(cls: type) -> bool:
    """
    Cached implementation behind :func:`is_abstract`.

    A class is reported as abstract when it is not a primitive and either its name starts with
    "Abstract", or it is a dataclass without any field for which `get_class_methods` returns no method.
    """
    # An instance was given: delegate to is_abstract with its type
    if not isinstance(cls, type):
        return is_abstract(type(cls))

    # Primitives are never abstract
    if is_primitive(cls):
        return False

    # Naming criterion
    if cls.__name__.startswith("Abstract"):
        return True

    # "Empty class" criterion: a dataclass (the attribute exists) declaring zero fields ...
    dataclass_fields = getattr(cls, "__dataclass_fields__", None)
    # ... and no method
    method_names = get_class_methods(cls)

    is_empty_dataclass = dataclass_fields is not None and len(dataclass_fields) == 0
    return is_empty_dataclass and len(method_names) == 0
@lru_cache(maxsize=None)
def _collect_module_classes(mod: ModuleType) -> Tuple[Tuple[str, type], ...]:
    """Return the (name, class) pairs of the classes defined in :param:`mod` itself (cached per module)."""
    mod_name = mod.__name__
    return tuple(
        (name, value)
        for name, value in mod.__dict__.items()
        if isinstance(value, type) and value.__module__ == mod_name
    )


def get_module_classes(mod: Union[ModuleType, str]) -> List[Tuple[str, type]]:
    """
    List the classes defined in a module (classes imported from another module are ignored).

    Only successful lookups are cached: a module name that is not in `sys.modules` yet gives an
    empty list now, and the real classes once the module has been imported.
    A new list is returned at each call, so the caller may modify it freely.

    :param mod: a module, or the name of an already imported module
    :return: a list of tuples (CLASS_NAME, CLASS); an empty list if the module is not found
    """
    if isinstance(mod, str):
        mod = sys.modules.get(mod)
    if not mod:
        return []
    return list(_collect_module_classes(mod))


# Keep the cache helpers that callers could use on the previously lru_cache-decorated function
get_module_classes.cache_clear = _collect_module_classes.cache_clear
get_module_classes.cache_info = _collect_module_classes.cache_info
module using a partial name. @@ -110,27 +187,58 @@ def search_class_in_module_from_partial_name(module_name: str, class_partial_nam """ try: - import_module(module_name) - # module = import_module(module_name) - classes = get_module_classes_from_name(module_name) - matching_classes = [cls for cls_name, cls in classes if class_partial_name.lower() in cls_name.lower()] + import_related_module(module_name) + + # 2. Récupération de l'index pré-calculé (O(1) grâce au cache) + search_index = _get_module_search_index(module_name) + + # 3. Recherche floue + search_term = class_partial_name.lower() + matching_classes = [cls for name_lower, cls in search_index if search_term in name_lower] + return matching_classes - except ImportError as e: - logging.error(f"Module '{module_name}' not found: {e}") + except Exception as e: + logging.error(f"Error searching in module '{module_name}': {e}") return None -def get_class_methods(cls: Union[type, Any]) -> List[str]: +@lru_cache(maxsize=None) +def _get_class_methods(cls: Union[type, Any]) -> List[str]: """ - Returns the list of the methods names for a specific class. - :param cls: - :return: + Return a list of method names defined directly in the given class (not inherited). + + Args: + cls: The class or instance to inspect. + + Returns: + List of method names defined in the class (excluding dunder methods). + + Notes: + - Always works on the type for caching efficiency. + - Uses __dict__ to scan only methods defined in THIS class (not inherited). + If you want inherited methods, use dir(), but __dict__ is ~10x faster for EnergyML classes. + - Only checks if the attribute is a function or routine (more precise than callable(), + which includes the class itself). 
""" - return [ - func - for func in dir(cls) - if callable(getattr(cls, func)) and not func.startswith("__") and not isinstance(getattr(cls, func), type) - ] + # Always work on the type for cache efficiency + if not isinstance(cls, type): + return _get_class_methods(type(cls)) + + methods = [] + # Use __dict__ to scan only methods defined in THIS class + for name, attr in cls.__dict__.items(): + if name.startswith("__"): + continue + # Only check if it's a function or routine (not just callable) + if inspect.isroutine(attr): + methods.append(name) + + return methods + + +def get_class_methods(cls: Union[type, Any]) -> List[str]: + t = cls if isinstance(cls, type) else type(cls) + return _get_class_methods(t) def get_class_from_name(class_name_and_module: str) -> Optional[type]: @@ -184,23 +292,7 @@ def get_class_from_name(class_name_and_module: str) -> Optional[type]: return None -def get_energyml_module_dev_version(pkg: str, current_version: str): - accessible_modules = dict_energyml_modules() - if not current_version.startswith("v"): - current_version = "v" + current_version - - current_version = current_version.replace("-", "_").replace(".", "_") - res = [] - if pkg in accessible_modules: - # logging.debug("\t", pkg, current_version) - for am_pkg_version in accessible_modules[pkg]: - if am_pkg_version != current_version and am_pkg_version.startswith(current_version): - # logging.debug("\t\t", am_pkg_version) - res.append(get_module_name(pkg, am_pkg_version)) - - return res - - +@lru_cache(maxsize=None) def get_energyml_class_in_related_dev_pkg(cls: type): class_name = cls.__name__ class_pkg = get_class_pkg(cls) @@ -219,6 +311,23 @@ def get_energyml_class_in_related_dev_pkg(cls: type): return res +def get_energyml_module_dev_version(pkg: str, current_version: str): + accessible_modules = dict_energyml_modules() + if not current_version.startswith("v"): + current_version = "v" + current_version + + current_version = current_version.replace("-", "_").replace(".", 
"_") + res = [] + if pkg in accessible_modules: + # logging.debug("\t", pkg, current_version) + for am_pkg_version in accessible_modules[pkg]: + if am_pkg_version != current_version and am_pkg_version.startswith(current_version): + # logging.debug("\t\t", am_pkg_version) + res.append(get_module_name(pkg, am_pkg_version)) + + return res + + def get_module_name_and_type_from_content_or_qualified_type(cqt: str) -> Tuple[str, str]: """ Return a tuple (module_name, type) from a content-type or qualified-type string. @@ -280,6 +389,9 @@ def get_module_name(domain: str, domain_version: str): ns = ENERGYML_NAMESPACES[domain] if not domain_version.startswith("v"): domain_version = "v" + domain_version + + if "." in domain_version: + domain_version = domain_version.replace(".", "_") return f"energyml.{domain}.{domain_version}.{ns[ns.rindex('/') + 1:]}" @@ -289,21 +401,20 @@ def get_module_name(domain: str, domain_version: str): def import_related_module(energyml_module_name: str) -> None: """ - Import related modules for a specific energyml module. (See. :const:`RELATED_MODULES`) + Import related modules for a specific energyml module. (See. 
:const:`RELATED_MODULES_MAP`) :param energyml_module_name: :return: """ - for related in RELATED_MODULES: - if energyml_module_name in related: - for m in related: - try: - import_module(m) - except Exception as e: - # Only log once per unique module - if m not in _FAILED_IMPORT_MODULES: - _FAILED_IMPORT_MODULES.add(m) - logging.debug(f"Could not import related module {m}: {e}") - # logging.error(e) + group = get_related_energyml_modules_name(energyml_module_name) + for m in group: + try: + import_module(m) + except Exception as e: + # Only log once per unique module + if m not in _FAILED_IMPORT_MODULES: + _FAILED_IMPORT_MODULES.add(m) + logging.debug(f"Could not import related module {m}: {e}") + # logging.error(e) def list_function_parameters_with_types(func, is_class_function: bool = False) -> Dict[str, Any]: @@ -367,6 +478,9 @@ def get_class_attributes(cls: Union[type, Any]) -> List[str]: def get_class_attribute_type(cls: Union[type, Any], attribute_name: str): + """ + Return the type of an attribute of a class. + """ fields = get_class_fields(cls) try: return fields[attribute_name].type @@ -379,24 +493,26 @@ def get_class_attribute_type(cls: Union[type, Any], attribute_name: str): return None -def get_matching_class_attribute_name( +def get_all_matching_class_attribute_name( cls: Union[type, Any], attribute_name: str, re_flags=re.IGNORECASE, -) -> Optional[str]: +) -> List[str]: """ - From an object and an attribute name, returns the correct attribute name of the class. - Example : "ObjectVersion" --> object_version. - This method doesn't only transform to snake case but search into the obj class attributes (or dict keys) + From an object and an attribute name, returns all the correct attribute names of the class matching with the attribute_name. + Example : "\\w*.Version" --> ["object_version", "ObjectVersion", "obj_version", ...] 
+ This method doesn't only transform to snake case but search into the obj class attributes (or dict keys) """ + matching_names = set() if isinstance(cls, dict): for name in cls.keys(): if snake_case(name) == snake_case(attribute_name): - return name + matching_names.add(name) pattern = re.compile(attribute_name, flags=re_flags) for name in cls.keys(): if pattern.match(name): - return name + matching_names.add(name) + return list(matching_names) else: class_fields = get_class_fields(cls) try: @@ -405,7 +521,7 @@ def get_matching_class_attribute_name( if snake_case(name) == snake_case(attribute_name) or ( hasattr(cf, "metadata") and "name" in cf.metadata and cf.metadata["name"] == attribute_name ): - return name + matching_names.add(name) # search regex after to avoid shadowing perfect match pattern = re.compile(attribute_name, flags=re_flags) @@ -414,11 +530,27 @@ def get_matching_class_attribute_name( if pattern.match(name) or ( hasattr(cf, "metadata") and "name" in cf.metadata and pattern.match(cf.metadata["name"]) ): - return name + matching_names.add(name) except Exception as e: logging.error(f"Failed to get attribute {attribute_name} from class {cls}") logging.error(e) + return list(matching_names) + + +def get_matching_class_attribute_name( + cls: Union[type, Any], + attribute_name: str, + re_flags=re.IGNORECASE, +) -> Optional[str]: + """ + From an object and an attribute name, returns the correct attribute name of the class. + Example : "ObjectVersion" --> object_version. 
+ This method doesn't only transform to snake case but search into the obj class attributes (or dict keys) + """ + matched = get_all_matching_class_attribute_name(cls, attribute_name, re_flags) + if len(matched) > 0: + return matched[0] return None @@ -429,7 +561,7 @@ def get_object_attribute(obj: Any, attr_dot_path: str, force_snake_case=True) -> :param obj: :param attr_dot_path: - :param force_snake_case: + :param force_snake_case: if True, the method will try to find the attribute name in snake case (only for class attribute, not for dict keys nor list index) :return: """ current_attrib_name, path_next = path_next_attribute(attr_dot_path) @@ -438,9 +570,6 @@ def get_object_attribute(obj: Any, attr_dot_path: str, force_snake_case=True) -> logging.error(f"Attribute path '{attr_dot_path}' is invalid.") return None - if force_snake_case: - current_attrib_name = snake_case(current_attrib_name) - value = None if isinstance(obj, list): value = obj[int(current_attrib_name)] @@ -448,6 +577,8 @@ def get_object_attribute(obj: Any, attr_dot_path: str, force_snake_case=True) -> value = obj.get(current_attrib_name, None) else: try: + if force_snake_case: + current_attrib_name = snake_case(current_attrib_name) value = getattr(obj, current_attrib_name) except AttributeError: return None @@ -608,7 +739,8 @@ def get_object_attribute_no_verif(obj: Any, attr_name: str, default: Optional[An else: raise AttributeError(obj, name=attr_name) else: - res = getattr(obj, attr_name) + res = operator.attrgetter(attr_name)(obj) + # res = getattr(obj, attr_name) if res is None: # we did not used the "default" of getattr to keep raising AttributeError return default return res @@ -628,15 +760,39 @@ def get_object_attribute_rgx(obj: Any, attr_dot_path_rgx: str) -> Any: # unescape Dot current_attrib_name = current_attrib_name.replace("\\.", ".") + if isinstance(obj, list): + # current_attrib may be a regex for list index. 
+ # first, test if it's a simple int + # print("TRY INDEX", current_attrib_name, obj) + try: + idx = int(current_attrib_name) + if idx < len(obj) and idx >= 0: + return obj[idx] + else: + raise AttributeError(obj, name=current_attrib_name) + except ValueError: + accumulator = [] + for i in range(len(obj)): + if re.match(current_attrib_name, str(i)): + accumulator.append(obj[i]) + # print("ACCUMULATOR", accumulator) + if accumulator: + if len(attrib_list) > 1: + return [ + get_object_attribute_rgx(v, attr_dot_path_rgx[len(current_attrib_name) + 1 :]) + for v in accumulator + ] + else: + return accumulator + else: + real_attrib_name = get_matching_class_attribute_name(obj, current_attrib_name) + if real_attrib_name is not None: + value = get_object_attribute_no_verif(obj, real_attrib_name) - real_attrib_name = get_matching_class_attribute_name(obj, current_attrib_name) - if real_attrib_name is not None: - value = get_object_attribute_no_verif(obj, real_attrib_name) - - if len(attrib_list) > 1: - return get_object_attribute_rgx(value, attr_dot_path_rgx[len(current_attrib_name) + 1 :]) - else: - return value + if len(attrib_list) > 1: + return get_object_attribute_rgx(value, attr_dot_path_rgx[len(current_attrib_name) + 1 :]) + else: + return value return None @@ -777,7 +933,8 @@ def search_attribute_matching_type_with_path( elif not is_primitive(obj): for att_name in get_class_attributes(obj): res = res + search_attribute_matching_type_with_path( - obj=get_object_attribute_rgx(obj, att_name), + obj=get_object_attribute_no_verif(obj, att_name), + # obj=get_object_attribute_rgx(obj, att_name), type_rgx=type_rgx, re_flags=re_flags, return_self=True, @@ -811,9 +968,10 @@ def search_attribute_in_upper_matching_name( return elt_list if len(current_path) != 0: # obj != root_obj: - upper_path = current_path[: current_path.rindex(".")] + upper_path = path_parent_attribute(current_path) + # upper_path = current_path[: current_path.rindex(".")] # print(f"\t {upper_path} ") - if 
len(upper_path) > 0: + if upper_path is not None and len(upper_path) > 0: return search_attribute_in_upper_matching_name( obj=get_object_attribute(root_obj, upper_path), name_rgx=name_rgx, @@ -881,7 +1039,7 @@ def search_attribute_matching_name_with_path( :param current_path: :param deep_search: :param search_in_sub_obj: - :return: + :return: a list of tuple (path, value) for each sub attribute with type matching param "name_rgx". The path is a dot-version like ".Citation.Title" """ # while name_rgx.startswith("."): # name_rgx = name_rgx[1:] @@ -893,7 +1051,7 @@ def search_attribute_matching_name_with_path( # next_match = ".".join(attrib_list[1:]) current_match, next_match = path_next_attribute(name_rgx) if current_match is None: - logging.error(f"Attribute name regex '{name_rgx}' is invalid.") + # logging.error(f"Attribute name regex '{name_rgx}' is invalid.") return [] res = [] @@ -923,22 +1081,27 @@ def search_attribute_matching_name_with_path( else: not_match_path_and_obj.append((f"{current_path}{k}", s_o)) elif not is_primitive(obj): - match_value = get_matching_class_attribute_name(obj, current_match.replace("\\.", ".")) - if match_value is not None: + current_match = current_match.replace("\\.", ".") + # logging.debug(f"searching {current_match} in {type(obj)} with path {current_path} and next match {next_match}") + match_values = get_all_matching_class_attribute_name(obj, current_match, re_flags) + for match_value in match_values: + # logging.debug(f"\tmatch found : {match_value}") match_path_and_obj.append( ( f"{current_path}{match_value}", get_object_attribute_no_verif(obj, match_value), ) ) + # logging.debug("f------") for att_name in get_class_attributes(obj): - if att_name != match_value: + if att_name not in match_values: not_match_path_and_obj.append( ( f"{current_path}{att_name}", get_object_attribute_no_verif(obj, att_name), ) ) + # logging.debug(f"\tmatch_path_and_obj: {match_path_and_obj}") for matched_path, matched in match_path_and_obj: if 
next_match is not None: # next_match is different, match is not final @@ -959,7 +1122,7 @@ def search_attribute_matching_name_with_path( re_flags=re_flags, current_path=matched_path, deep_search=deep_search, # no deep with partial - search_in_sub_obj=True, + search_in_sub_obj=search_in_sub_obj, ) if search_in_sub_obj: for not_matched_path, not_matched in not_match_path_and_obj: @@ -988,8 +1151,8 @@ def search_attribute_matching_name( :param obj: :param name_rgx: :param re_flags: - :param deep_search: - :param search_in_sub_obj: + :param deep_search: if True, the method will search for matching attribute in the sub attributes of a matching attribute (recursive search). If False, only the first level of attributes will be searched for a match. + :param search_in_sub_obj: if True, the method will search for matching attribute in the sub attributes of a non-matching attribute (recursive search). If False, only the first level of attributes will be searched for a match. :return: """ return [ @@ -1129,13 +1292,21 @@ def copy_attributes( # Utility functions -def get_obj_uuid(obj: Any) -> str: +def get_obj_uuid(obj: Any) -> Optional[str]: """ Return the object uuid (attribute must match the following regex : "[Uu]u?id|UUID"). 
:param obj: :return: """ - return get_object_attribute_rgx(obj, "[Uu]u?id|UUID") + try: + return getattr(obj, "uuid", None) or getattr(obj, "uid") + except AttributeError: + if isinstance(obj, dict): + for k in obj.keys(): + if re.match(r"[Uu]u?id|UUID", k): + return obj[k] + return None + # return get_object_attribute_rgx(obj, "[Uu]u?id|UUID") def get_obj_version(obj: Any) -> Optional[str]: @@ -1145,14 +1316,29 @@ def get_obj_version(obj: Any) -> Optional[str]: :return: """ try: - return get_object_attribute_no_verif(obj, "object_version") + return ( + getattr(obj, "object_version", None) + or getattr(obj, "version_string", None) + or getattr(getattr(obj, "citation"), "version_string", None) + ) except AttributeError: - try: - return get_object_attribute_no_verif(obj, "version_string") - except Exception: - logging.error(f"Error with {type(obj)}") - return None - # raise e + # Log with full call stack to see WHO called this function + # logging.error( + # f"Error getting version for {type(obj)} -- {obj}", + # exc_info=True, + # stack_info=True, # This shows the full call stack including caller + # ) + if isinstance(obj, dict): + for k in obj.keys(): + if re.match(r"object_version|version_string", k, re.IGNORECASE): + return obj[k] + elif re.match(r"citation", k, re.IGNORECASE) and isinstance(obj[k], dict): + for ck in obj[k].keys(): + if re.match(r"version_string", ck, re.IGNORECASE): + return obj[k][ck] + pass + return None + # raise e def get_obj_title(obj: Any) -> Optional[str]: @@ -1162,9 +1348,33 @@ def get_obj_title(obj: Any) -> Optional[str]: :return: """ try: - return get_object_attribute_advanced(obj, "citation.title") + return getattr(getattr(obj, "citation"), "title", None) except AttributeError: - return None + if isinstance(obj, dict): + for k in obj.keys(): + if re.match(r"citation", k, re.IGNORECASE) and isinstance(obj[k], dict): + for ck in obj[k].keys(): + if re.match(r"title", ck, re.IGNORECASE): + return obj[k][ck] + # search for title or name 
if not classical citation.title found + + for k in obj.keys(): + if re.match(r"title", k, re.IGNORECASE): + return obj[k] + elif re.match(r"name", k, re.IGNORECASE): + return obj[k] + + else: + # DOR : + try: + return getattr(obj, "title") + except AttributeError: + # etp resource meta : + try: + return getattr(obj, "name") + except AttributeError: + pass + return None def get_obj_pkg_pkgv_type_uuid_version( @@ -1423,6 +1633,9 @@ def get_content_type_from_class(cls: Union[type, Any], print_dev_version=True, n def get_object_type_for_file_path_from_class(cls) -> str: + """ + Return the object type to use in file path or content type. It is not always the same as the class name, for example for resqml201, the class "TriangulatedSetRepresentation" has to be written "obj_TriangulatedSetRepresentation" in file path and content type. + """ if not isinstance(cls, type): cls = type(cls) classic_type = get_obj_type(cls) @@ -1668,6 +1881,10 @@ def get_all_possible_instanciable_classes_for_attribute(parent_obj: Any, attribu return [] +def get_enum_values(cls: Any) -> List[str]: + return cls._member_names_ if is_enum(cls) else [] + + def _random_value_from_class( cls: Any, energyml_module_context: List[str], diff --git a/energyml-utils/src/energyml/utils/manager.py b/energyml-utils/src/energyml/utils/manager.py index 10644ad..caafc42 100644 --- a/energyml-utils/src/energyml/utils/manager.py +++ b/energyml-utils/src/energyml/utils/manager.py @@ -1,5 +1,6 @@ # Copyright (c) 2023-2024 Geosiris. 
# SPDX-License-Identifier: Apache-2.0 +from functools import lru_cache import importlib import inspect import logging @@ -9,7 +10,7 @@ from energyml.utils.constants import ( ENERGYML_MODULES_NAMES, - RELATED_MODULES, + RELATED_MODULES_MAP, RGX_ENERGYML_MODULE_NAME, RGX_PROJECT_VERSION, ) @@ -22,15 +23,16 @@ def get_related_energyml_modules_name(cls: Union[type, Any]) -> List[str]: :param cls: :return: """ - if isinstance(cls, type): - for related in RELATED_MODULES: - if cls.__module__ in related: - return related + if isinstance(cls, str): + return RELATED_MODULES_MAP.get(cls, []) + elif isinstance(cls, type): + return RELATED_MODULES_MAP.get(str(cls.__module__), []) else: return get_related_energyml_modules_name(type(cls)) return [] +@lru_cache(maxsize=None) def dict_energyml_modules() -> Dict: """ List all accessible energyml python modules @@ -182,6 +184,11 @@ def get_class_pkg(cls): return match.group("pkg") # type: ignore except AttributeError as e: logging.error(f"Exception to get class package for '{cls}'") + logging.error( + f"Error getting package for {type(cls)} -- {cls}", + exc_info=True, + stack_info=True, # This shows the full call stack including caller + ) raise e diff --git a/energyml-utils/src/energyml/utils/serialization.py b/energyml-utils/src/energyml/utils/serialization.py index 54a105d..1380099 100644 --- a/energyml-utils/src/energyml/utils/serialization.py +++ b/energyml-utils/src/energyml/utils/serialization.py @@ -20,8 +20,8 @@ from xsdata.formats.dataclass.serializers import XmlSerializer from xsdata.formats.dataclass.serializers.config import SerializerConfig -from .exception import UnknownTypeFromQualifiedType, NotParsableType -from .introspection import ( +from energyml.utils.exception import UnknownTypeFromQualifiedType, NotParsableType +from energyml.utils.introspection import ( as_obj_prefixed_class_if_possible, get_class_from_name, get_energyml_class_in_related_dev_pkg, @@ -35,13 +35,20 @@ get_matching_class_attribute_name, 
is_enum, ) -from .xml import ( +from energyml.utils.xml_utils import ( get_class_name_from_xml, get_tree, get_xml_encoding, ENERGYML_NAMESPACES, ) +from xsdata.formats.dataclass.parsers.handlers import LxmlEventHandler + +GLOBAL_XML_CONTEXT = XmlContext( + # element_name_generator=text.camel_case, + # attribute_name_generator=text.kebab_case +) + class JSON_VERSION(Enum): XSDATA = "XSDATA" @@ -65,7 +72,7 @@ def _read_energyml_xml_bytes_as_class( fail_on_unknown_attributes=fail_on_unknown_attributes, # process_xinclude=True, ) - parser = XmlParser(config=config) + parser = XmlParser(config=config, context=GLOBAL_XML_CONTEXT, handler=LxmlEventHandler) try: return parser.from_bytes(file, obj_class) except ParserError as e: @@ -81,11 +88,6 @@ def _read_energyml_xml_bytes_as_class( def read_energyml_xml_tree(file: etree, obj_type: Optional[type] = None) -> Any: - # if obj_type is None: - # obj_type = get_class_from_name(get_class_name_from_xml(file)) - # parser = XmlParser(handler=XmlEventHandler) - # # parser = XmlParser(handler=LxmlEventHandler) - # return parser.parse(file, obj_type) return read_energyml_xml_bytes(etree.tostring(file, encoding="utf8")) @@ -155,7 +157,7 @@ def _read_energyml_json_bytes_as_class(file: bytes, json_version: JSON_VERSION, # fail_on_unknown_attributes=False, # process_xinclude=True, ) - parser = JsonParser(config=config) + parser = JsonParser(config=config, context=GLOBAL_XML_CONTEXT) try: return parser.from_bytes(file, obj_class) except ParserError as e: @@ -269,12 +271,8 @@ def serialize_xml(obj, check_obj_prefixed_classes: bool = True) -> str: # logging.debug(f"[1] Serializing object of type {type(obj)}") obj = as_obj_prefixed_class_if_possible(obj) if check_obj_prefixed_classes else obj # logging.debug(f"[2] Serializing object of type {type(obj)}") - context = XmlContext( - # element_name_generator=text.camel_case, - # attribute_name_generator=text.kebab_case - ) serializer_config = SerializerConfig(indent=" ") - serializer = 
XmlSerializer(context=context, config=serializer_config) + serializer = XmlSerializer(context=GLOBAL_XML_CONTEXT, config=serializer_config) # res = serializer.render(obj) res = serializer.render(obj, ns_map=ENERGYML_NAMESPACES) # logging.debug(f"[3] Serialized XML with meta namespace : {obj.Meta.namespace}: {serialize_json(obj)}") @@ -286,12 +284,8 @@ def serialize_json( ) -> str: obj = as_obj_prefixed_class_if_possible(obj) if check_obj_prefixed_classes else obj if json_version == JSON_VERSION.XSDATA: - context = XmlContext( - # element_name_generator=text.camel_case, - # attribute_name_generator=text.kebab_case - ) serializer_config = SerializerConfig(indent=" ") - serializer = JsonSerializer(context=context, config=serializer_config) + serializer = JsonSerializer(context=GLOBAL_XML_CONTEXT, config=serializer_config) return serializer.render(obj) elif json_version == JSON_VERSION.OSDU_OFFICIAL: return json.dumps(to_json_dict(obj), indent=4, sort_keys=True) diff --git a/energyml-utils/src/energyml/utils/storage_interface.py b/energyml-utils/src/energyml/utils/storage_interface.py index 99a58d1..f2bbcda 100644 --- a/energyml-utils/src/energyml/utils/storage_interface.py +++ b/energyml-utils/src/energyml/utils/storage_interface.py @@ -93,6 +93,42 @@ def identifier(self) -> str: return f"{self.uuid}.{self.version}" return self.uuid + def __str__(self): + return f"{'[' + self.title + '] ' if self.title else ''}{self.uri}" + + +def create_resource_metadata_from_uri( + uri: Uri, + title: Optional[str] = None, + last_changed: Optional[datetime] = None, + custom_data: Optional[Dict[str, Any]] = None, + source_count: Optional[int] = None, + target_count: Optional[int] = None, +) -> ResourceMetadata: + """ + Create ResourceMetadata from an ETP URI. 
+ + Args: + uri: ETP URI (e.g., 'eml:///dataspace('default')/resqml22.TriangulatedSetRepresentation('uuid.version')') + Returns: + ResourceMetadata instance with fields extracted from the URI + """ + if not uri.is_object_uri(): + raise ValueError("URI must be an object URI to create ResourceMetadata") + return ResourceMetadata( + uri=str(uri), + uuid=uri.uuid or "", + title=title or "", + object_type=uri.object_type or "", + content_type=uri.get_content_type(), + version=uri.version, + dataspace=uri.dataspace, + custom_data=custom_data or {}, + source_count=source_count, + target_count=target_count, + last_changed=last_changed, + ) + @dataclass class DataArrayMetadata: @@ -100,17 +136,30 @@ class DataArrayMetadata: Metadata for a data array in an energyml object. This provides information about arrays stored in HDF5 or other external storage, - similar to ETP DataArrayMetadata. + similar to ETP DataArrayMetadata. Supports RESQML v2.2 ExternalDataArrayPart attributes. + + The dimensions field represents the shape of the array that would be returned: + - For full arrays: the complete array dimensions from the external file + - For sub-selections: the size of the selected portion (determined by start_indices + counts) """ path_in_resource: Optional[str] - """Path to the array within the HDF5 file""" + """Path to the array within the HDF5 file (PathInExternalFile)""" array_type: str """Data type of the array (e.g., 'double', 'int', 'string')""" dimensions: List[int] - """Array dimensions/shape""" + """Array dimensions/shape. For sub-selections, this reflects the selected portion size.""" + + start_indices: Optional[List[int]] = None + """Start index for each dimension (RESQML v2.2 StartIndex). If None, starts at 0.""" + + external_uri: Optional[str] = None + """URI where the DataArrayPart is stored (RESQML v2.2 URI). 
Can override default file path.""" + + mime_type: Optional[str] = None + """MIME type of the external file (RESQML v2.2 MimeType)""" custom_data: Dict[str, Any] = field(default_factory=dict) """Additional custom metadata""" @@ -203,6 +252,18 @@ def get_object_by_uuid(self, uuid: str) -> List[Any]: """ pass + def get_object_by_uuid_versioned(self, uuid: str, version: Optional[str] = None) -> Optional[Any]: + """ + Retrieve a specific version of an object by UUID and optional version. + + Args: + uuid: Object UUID + version: Optional version string. If None, returns the latest version. + Returns: + The deserialized energyml object matching the UUID and version, or None if not found + """ + return self.get_object(f"{uuid}.{version}" if version else f"{uuid}.") + @abstractmethod def put_object(self, obj: Any, dataspace: Optional[str] = None) -> Optional[str]: """ @@ -231,33 +292,84 @@ def delete_object(self, identifier: Union[str, Uri]) -> bool: pass @abstractmethod - def read_array(self, proxy: Union[str, Uri, Any], path_in_external: str) -> Optional[np.ndarray]: + def read_array( + self, + proxy: Union[str, Uri, Any], + path_in_external: str, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + external_uri: Optional[str] = None, + ) -> Optional[np.ndarray]: """ - Read a data array from external storage (HDF5). + Read a data array from external storage (HDF5) with optional sub-selection. Args: proxy: The object identifier/URI or the object itself that references the array path_in_external: Path within the HDF5 file (e.g., 'values/0') + start_indices: Optional start index for each dimension (RESQML v2.2 StartIndex) + counts: Optional count of elements for each dimension (RESQML v2.2 Count) + external_uri: Optional URI to override default file path (RESQML v2.2 URI) Returns: - The data array as a numpy array, or None if not found + The data array as a numpy array, or None if not found. 
+ If start_indices and counts are provided, returns the sub-selected portion. """ pass + def read_array_view( + self, + proxy: Union[str, Uri, Any], + path_in_external: str, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, + external_uri: Optional[str] = None, + ) -> Optional[np.ndarray]: + """ + Read a data array as a zero-copy view when possible. + + For HDF5 datasets that are contiguous and uncompressed, returns a numpy array + backed directly by the memory-mapped file buffer (no copy). For chunked or + compressed datasets it transparently falls back to a copy, identical to + :meth:`read_array`. + + The caller **must not mutate** the returned array; use ``arr.copy()`` first if + in-place modification is required. + + Default implementation delegates to :meth:`read_array` so that any third-party + subclass that does not override this method retains correct behaviour. + + Args: + proxy: The object identifier/URI or the object itself that references the array + path_in_external: Path within the HDF5 file (e.g., 'values/0') + start_indices: Optional start index for each dimension (RESQML v2.2 StartIndex) + counts: Optional count of elements for each dimension (RESQML v2.2 Count) + external_uri: Optional URI to override default file path (RESQML v2.2 URI) + + Returns: + The data array as a numpy array (view if possible, copy otherwise), or None if not found. + """ + return self.read_array(proxy, path_in_external, start_indices, counts, external_uri) + @abstractmethod def write_array( self, proxy: Union[str, Uri, Any], path_in_external: str, array: np.ndarray, + start_indices: Optional[List[int]] = None, + external_uri: Optional[str] = None, + **kwargs, ) -> bool: """ - Write a data array to external storage (HDF5). + Write a data array to external storage (HDF5) with optional offset. 
Args: proxy: The object identifier/URI or the object itself that references the array path_in_external: Path within the HDF5 file (e.g., 'values/0') array: The numpy array to write + start_indices: Optional start index for each dimension for partial writes + external_uri: Optional URI to override default file path (RESQML v2.2 URI) + **kwargs: Additional format-specific parameters Returns: True if successfully written, False otherwise @@ -266,18 +378,25 @@ def write_array( @abstractmethod def get_array_metadata( - self, proxy: Union[str, Uri, Any], path_in_external: Optional[str] = None + self, + proxy: Union[str, Uri, Any], + path_in_external: Optional[str] = None, + start_indices: Optional[List[int]] = None, + counts: Optional[List[int]] = None, ) -> Union[DataArrayMetadata, List[DataArrayMetadata], None]: """ - Get metadata for data array(s). + Get metadata for data array(s) with optional sub-selection. Args: proxy: The object identifier/URI or the object itself that references the array path_in_external: Optional specific path. If None, returns all array metadata for the object + start_indices: Optional start index for each dimension (RESQML v2.2 StartIndex) + counts: Optional count of elements for each dimension (RESQML v2.2 Count). + When provided, the returned dimensions will reflect the sub-selected size. Returns: DataArrayMetadata if path specified, List[DataArrayMetadata] if no path, - or None if not found + or None if not found. The dimensions field reflects sub-selection when counts provided. """ pass diff --git a/energyml-utils/src/energyml/utils/uri.py b/energyml-utils/src/energyml/utils/uri.py index da05b1d..ffa8689 100644 --- a/energyml-utils/src/energyml/utils/uri.py +++ b/energyml-utils/src/energyml/utils/uri.py @@ -1,8 +1,14 @@ # Copyright (c) 2023-2024 Geosiris. 
# SPDX-License-Identifier: Apache-2.0 +import re from typing import Optional from dataclasses import dataclass, field -from .constants import ( + +from energyml.utils.exception import ContentTypeValidationError, NotUriError +from energyml.utils.constants import ( + RGX_CT_ENERGYML_DOMAIN, + RGX_CT_TOKEN_TYPE, + RGX_CT_TOKEN_VERSION, URI_RGX_GRP_DATASPACE, URI_RGX_GRP_DOMAIN, URI_RGX_GRP_DOMAIN_VERSION, @@ -15,6 +21,7 @@ URI_RGX_GRP_COLLECTION_TYPE, URI_RGX_GRP_QUERY, OptimizedRegex, + parse_content_or_qualified_type, ) @@ -39,7 +46,7 @@ class Uri: query: Optional[str] = field(default=None) @classmethod - def parse(cls, uri: str): + def parse(cls, uri: str) -> "Uri": m = OptimizedRegex.URI.match(uri) if m is not None: res = Uri() @@ -57,7 +64,7 @@ def parse(cls, uri: str): res.query = m.group(URI_RGX_GRP_QUERY) return res else: - return None + raise NotUriError(uri) def is_dataspace_uri(self): return ( @@ -75,12 +82,21 @@ def is_object_uri(self): and self.uuid is not None ) - def get_qualified_type(self): + def get_qualified_type(self) -> str: + if self.domain is None or self.domain_version is None or self.object_type is None: + raise ValueError("The URI must have a domain, domain version and object type to get the qualified type") return f"{self.domain}{self.domain_version}.{self.object_type}" - def as_identifier(self): + def get_content_type(self) -> str: + if self.domain is None or self.domain_version is None or self.object_type is None: + raise ValueError("The URI must have a domain, domain version and object type to get the content type") + # Format version with dots + formatted_version = re.sub(r"(\d)(\d)", r"\1.\2", self.domain_version) + return f"application/x-{self.domain}+xml;" f"version={formatted_version};" f"type={self.object_type}" + + def as_identifier(self) -> str: if not self.is_object_uri(): - return None + raise ValueError("Only object URIs can be converted to identifiers") return f"{self.uuid}.{self.version if self.version is not None else ''}" 
def __str__(self): @@ -108,8 +124,47 @@ def __str__(self): return res + def __hash__(self) -> int: + return hash(str(self)) -def parse_uri(uri: str) -> Optional[Uri]: + +def parse_uri_raise_if_failed(uri: str) -> Uri: if uri is None or len(uri) <= 0: - return None + raise NotUriError(uri) return Uri.parse(uri.strip()) + + +def parse_uri(uri: str) -> Optional[Uri]: + try: + return parse_uri_raise_if_failed(uri) + except NotUriError: + return None + + +def create_uri_from_content_type_or_qualified_type(ct_or_qt: str, uuid: str, version: Optional[str] = None) -> Uri: + """Create a URI from a content type or a qualified type and a uuid (and optionally an object version) + :param ct_or_qt: the content type or qualified type to create the URI from + :param uuid: the uuid of the object + :param version: the version of the object (optional) + :return: the created URI + """ + if ct_or_qt is None or len(ct_or_qt) <= 0: + raise ContentTypeValidationError("Content type or qualified type cannot be null or empty") + if uuid is None or len(uuid) <= 0: + raise ValueError("UUID cannot be null or empty") + m = parse_content_or_qualified_type(ct_or_qt) + if m is not None: + try: + domain = m.group("domain") + domain_version = m.group("domainVersion") + # ensure domaine version has no dots and is in the format of digits only + formatted_version = re.sub(r"(\d)[^\d]+(\d)", r"\1\2", domain_version) + object_type = m.group("type") + return Uri( + domain=domain, domain_version=formatted_version, object_type=object_type, uuid=uuid, version=version + ) + except Exception as e: + raise ContentTypeValidationError( + f"Failed to parse content type or qualified type: {ct_or_qt} -- {m}" + ) from e + raise NotUriError(f"Unable to parse content type: {ct_or_qt}") diff --git a/energyml-utils/src/energyml/utils/validation.py b/energyml-utils/src/energyml/utils/validation.py index 6420573..4ea509c 100644 --- a/energyml-utils/src/energyml/utils/validation.py +++ 
b/energyml-utils/src/energyml/utils/validation.py @@ -1,17 +1,20 @@ # Copyright (c) 2023-2024 Geosiris. # SPDX-License-Identifier: Apache-2.0 +import logging import re from dataclasses import dataclass, field, Field from enum import Enum import traceback from typing import Any, Dict, List, Optional, Union -from .epc import ( - get_obj_identifier, +from energyml.utils.epc import ( Epc, +) +from energyml.utils.epc_utils import ( + get_obj_identifier, get_property_kind_by_uuid, ) -from .introspection import ( +from energyml.utils.introspection import ( get_class_fields, get_object_attribute, is_primitive, @@ -34,6 +37,9 @@ class ErrorType(Enum): INFO = "info" WARNING = "warning" + def __str__(self): + return self.value + @dataclass class ValidationError: @@ -49,6 +55,7 @@ def toJson(self): return { "msg": self.msg, "error_type": self.error_type.value, + "err_class": self.__class__.__name__, } @property @@ -89,7 +96,6 @@ def __str__(self): @dataclass class MandatoryError(ValidationObjectError): - @property def msg(self) -> str: return f"Mandatory value is None for {get_obj_identifier(self.target_obj)} : '{self.attribute_dot_path}'" @@ -201,6 +207,8 @@ def dor_validation_object( _msg=f"[DOR ERR] has wrong information. Unknown object with uuid '{dor_uuid}'", ) ) + + object_version_list_failed = False if target_uuid is not None and target_identifier is None: accessible_version = [get_obj_version(ref_obj) for ref_obj in dict_obj_uuid[dor_uuid]] errs.append( @@ -212,6 +220,7 @@ def dor_validation_object( f"Version must be one of {accessible_version}", ) ) + object_version_list_failed = True if target_prop is not None and target_uuid is None: errs.append( @@ -224,6 +233,32 @@ def dor_validation_object( ) target = target_identifier or target_uuid or target_prop + + # debug + if isinstance(target, list): + # logging.error( + # f"Multiple objects found with uuid '{dor_uuid}' for DOR in object '{get_obj_identifier(obj)}' at path '{dor_path}'. 
This should not happen and can lead to wrong validation results.", + # exc_info=True, + # stack_info=True, # This shows the full call stack including caller + # ) + # logging.error( + # f'\t{target} => Object ct and qt {get_object_attribute_rgx(dor, "content_type")} : {get_object_attribute_rgx(dor, "qualified_type")}' + # ) + if len(target) == 0: + target = None + else: + if len(target) > 1: + errs.append( + ValidationObjectError( + error_type=ErrorType.WARNING, + target_obj=obj, + attribute_dot_path=dor_path, + _msg=f"[DOR ERR] Multiple objects found with uuid '{dor_uuid}' for DOR in object '{get_obj_identifier(obj)}' at path '{dor_path}'. This should not happen and can lead to wrong validation results.", + ) + ) + target = target[0] + + # ==== if target is not None: # target = dict_obj_identifier[dor_target_id] target_title = get_object_attribute_rgx(target, "citation.title") @@ -265,7 +300,8 @@ def dor_validation_object( ) ) - if target_version != dor_version: + if not object_version_list_failed and target_version != dor_version: + # checking object_version_list_failed to avoid multiple version errors errs.append( ValidationObjectError( error_type=ErrorType.WARNING, diff --git a/energyml-utils/src/energyml/utils/xml.py b/energyml-utils/src/energyml/utils/xml_utils.py similarity index 97% rename from energyml-utils/src/energyml/utils/xml.py rename to energyml-utils/src/energyml/utils/xml_utils.py index 94e02ee..05e88a1 100644 --- a/energyml-utils/src/energyml/utils/xml.py +++ b/energyml-utils/src/energyml/utils/xml_utils.py @@ -7,7 +7,7 @@ from lxml import etree as ETREE # type: Any -from .constants import ENERGYML_NAMESPACES, ENERGYML_NAMESPACES_PACKAGE, OptimizedRegex, parse_content_type +from energyml.utils.constants import ENERGYML_NAMESPACES, ENERGYML_NAMESPACES_PACKAGE, OptimizedRegex, parse_content_type def get_pkg_from_namespace(namespace: str) -> Optional[str]: diff --git a/energyml-utils/tests/test_array_handlers.py 
b/energyml-utils/tests/test_array_handlers.py new file mode 100644 index 0000000..3c99588 --- /dev/null +++ b/energyml-utils/tests/test_array_handlers.py @@ -0,0 +1,147 @@ +import os +import tempfile +import numpy as np +import pytest + +from energyml.utils.data.datasets_io import ( + HDF5ArrayHandler, + ParquetArrayHandler, + CSVArrayHandler, + LASArrayHandler, + SEGYArrayHandler, + get_handler_registry, +) + + +def is_h5py_file_closed(h5file): + """Check if an h5py file handle is closed.""" + try: + return not getattr(h5file, "id", None) or not h5file.id.valid + except Exception: + return True + + +def test_default_handler_from_registry_is_h5(): + """Test that the default handler for .h5 is HDF5ArrayHandler.""" + handler = get_handler_registry().get_handler_for_file("") # no extension, should return default .h5 handler + assert isinstance(handler, HDF5ArrayHandler), "Default handler for .h5 should be HDF5ArrayHandler" + + +def test_default_dat_handler_from_registry_is_h5(): + """Test that the default handler for .h5 is HDF5ArrayHandler.""" + handler = get_handler_registry().get_handler_for_file(".dat") # no extension, should return default .h5 handler + assert isinstance(handler, HDF5ArrayHandler), "Default handler for .h5 should be HDF5ArrayHandler" + + +def test_hdf5_array_handler_read_write(): + """Test HDF5ArrayHandler read/write and file closure.""" + arr = np.arange(6).reshape(2, 3) + handler = HDF5ArrayHandler() + with tempfile.NamedTemporaryFile(suffix=".h5", delete=False) as tmp: + fname = tmp.name + try: + # Write + assert handler.write_array(fname, arr, "/data"), "HDF5 write failed" + # Read + out = handler.read_array(fname, "/data") + np.testing.assert_array_equal(arr, out) + # Check file closed after handler deletion + f = handler.file_cache.get_or_open(fname, handler, "r") + del handler + import gc + + gc.collect() + assert is_h5py_file_closed(f), "HDF5 file not closed after handler deletion" + finally: + try: + os.remove(fname) + except Exception: 
+ pass + + +@pytest.mark.skip(reason="Requires 'parquet' extra: pip install energyml-utils[parquet]") +def test_parquet_array_handler_read_write(): + """Test ParquetArrayHandler read/write.""" + arr = np.arange(6).reshape(2, 3) + handler = ParquetArrayHandler() + with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp: + fname = tmp.name + try: + assert handler.write_array(fname, arr, column_titles=["a", "b", "c"]), "Parquet write failed" + out = handler.read_array(fname) + np.testing.assert_array_equal(arr, out) + finally: + try: + os.remove(fname) + except Exception: + pass + + +def test_csv_array_handler_read_write(): + """Test CSVArrayHandler read/write.""" + arr = np.arange(6).reshape(2, 3) + handler = CSVArrayHandler() + with tempfile.NamedTemporaryFile(suffix=".csv", delete=False, mode="w+") as tmp: + fname = tmp.name + try: + assert handler.write_array(fname, arr), "CSV write failed" + out = handler.read_array(fname) + # CSV may return strings, so cast to int + np.testing.assert_array_equal(np.array(out, dtype=int), arr) + finally: + try: + os.remove(fname) + except Exception: + pass + + +def test_las_array_handler_read_write(): + """Test LASArrayHandler read/write if supported.""" + arr = np.arange(6).reshape(2, 3) + handler = LASArrayHandler() + with tempfile.NamedTemporaryFile(suffix=".las", delete=False, mode="w+") as tmp: + fname = tmp.name + try: + write_ok = False + try: + handler.write_array(fname, arr) + write_ok = True + except Exception as e: + print(f"LAS write not supported: {e}") + try: + out = handler.read_array(fname) + if write_ok and out is not None: + np.testing.assert_array_equal(np.array(out, dtype=arr.dtype), arr) + except Exception as e: + print(f"LAS read not supported: {e}") + finally: + try: + os.remove(fname) + except Exception: + pass + + +def test_segy_array_handler_read_write(): + """Test SEGYArrayHandler read/write if supported.""" + arr = np.arange(6).reshape(2, 3) + handler = SEGYArrayHandler() + with 
tempfile.NamedTemporaryFile(suffix=".sgy", delete=False, mode="w+b") as tmp: + fname = tmp.name + try: + write_ok = False + try: + handler.write_array(fname, arr) + write_ok = True + except Exception as e: + print(f"SEGY write not supported: {e}") + try: + out = handler.read_array(fname) + if write_ok and out is not None: + np.testing.assert_array_equal(np.array(out, dtype=arr.dtype), arr) + except Exception as e: + print(f"SEGY read not supported: {e}") + finally: + try: + os.remove(fname) + except Exception: + pass diff --git a/energyml-utils/tests/test_crs_info.py b/energyml-utils/tests/test_crs_info.py new file mode 100644 index 0000000..449ded6 --- /dev/null +++ b/energyml-utils/tests/test_crs_info.py @@ -0,0 +1,801 @@ +# Copyright (c) 2023-2024 Geosiris. +# SPDX-License-Identifier: Apache-2.0 +""" +Integration tests for :mod:`energyml.utils.data.crs`. + +Real energyml objects are loaded from the EPC test fixtures shipped in +``rc/epc/``. No mock dataclasses — the installed ``energyml-resqml2*`` and +``energyml-eml*`` packages provide the actual xsdata-generated classes. + +EPC fixtures +───────────── +* ``testingPackageCpp.epc`` — RESQML v2.0.1 (also contains mixed v2.3 CRS) +* ``testingPackageCpp22.epc`` — RESQML v2.2 / EML v2.3 + +Both files are committed to ``rc/epc/`` and are available in CI once the +parent workspace dev-dependencies are installed. 
+""" +from __future__ import annotations + +import math +from pathlib import Path +from typing import Any, Optional + +import pytest + +from energyml.utils.data.crs import CrsInfo, _uom_to_str, extract_crs_info +from energyml.utils.data.helper import ( + get_crs_offsets_and_angle, + get_projected_epsg_code, + get_projected_uom, + get_vertical_epsg_code, + is_z_reversed, +) +from energyml.utils.epc import Epc +from energyml.utils.introspection import get_obj_uuid, get_object_attribute_rgx + +# --------------------------------------------------------------------------- +# EPC file paths +# --------------------------------------------------------------------------- + +_RC = Path(__file__).parent.parent / "rc" / "epc" +EPC20_PATH = _RC / "testingPackageCpp.epc" +EPC22_PATH = _RC / "testingPackageCpp22.epc" + + +# --------------------------------------------------------------------------- +# Session-scoped fixtures (EPC loaded once per test session) +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def epc20() -> Epc: + if not EPC20_PATH.exists(): + pytest.skip(f"EPC fixture not found: {EPC20_PATH}") + return Epc.read_file(str(EPC20_PATH)) + + +@pytest.fixture(scope="session") +def epc22() -> Epc: + if not EPC22_PATH.exists(): + pytest.skip(f"EPC fixture not found: {EPC22_PATH}") + return Epc.read_file(str(EPC22_PATH)) + + +# --------------------------------------------------------------------------- +# Shared helper — walk representation → CRS DOR → resolved object +# --------------------------------------------------------------------------- + + +def _resolve_crs_from_grid(grid_obj: Any, epc: Epc) -> Optional[Any]: + """Resolve the CRS object linked from a Grid2DRepresentation.""" + # v2.0.1: local_crs sits on the geometry patch + dor = get_object_attribute_rgx(grid_obj, "[Ll]ocal[_]?[Cc]rs") + if dor is None: + dor = get_object_attribute_rgx( + grid_obj, + 
"[Gg]rid2[Dd][Pp]atch.[Gg]eometry.[Ll]ocal[_]?[Cc]rs", + ) + if dor is None: + return None + uuid = get_obj_uuid(dor) + candidates = epc.get_object_by_uuid(uuid) if uuid else [] + return candidates[0] if candidates else None + + +# =========================================================================== +# DTO and pure-function tests (no EPC required) +# =========================================================================== + + +class TestCrsInfoDto: + def test_defaults(self): + info = CrsInfo() + assert info.x_offset == 0.0 + assert info.y_offset == 0.0 + assert info.z_offset == 0.0 + assert info.projected_epsg_code is None + assert info.projected_uom is None + assert info.vertical_epsg_code is None + assert info.vertical_uom is None + assert info.z_increasing_downward is False + assert info.areal_rotation_value == 0.0 + assert info.areal_rotation_uom == "rad" + assert info.azimuth_reference is None + assert info.source_type is None + + def test_areal_rotation_rad_already_radians(self): + info = CrsInfo(areal_rotation_value=1.5708, areal_rotation_uom="rad") + assert info.areal_rotation_rad() == pytest.approx(1.5708) + + def test_areal_rotation_rad_degrees(self): + info = CrsInfo(areal_rotation_value=90.0, areal_rotation_uom="degr") + assert info.areal_rotation_rad() == pytest.approx(math.pi / 2) + + def test_areal_rotation_rad_zero(self): + assert CrsInfo(areal_rotation_value=0.0, areal_rotation_uom="degr").areal_rotation_rad() == 0.0 + + def test_as_transform_args(self): + info = CrsInfo( + x_offset=100.0, + y_offset=200.0, + z_offset=-50.0, + areal_rotation_value=0.5, + areal_rotation_uom="rad", + z_increasing_downward=True, + ) + kwargs = info.as_transform_args() + assert kwargs["x_offset"] == 100.0 + assert kwargs["y_offset"] == 200.0 + assert kwargs["z_offset"] == -50.0 + assert kwargs["areal_rotation"] == 0.5 + assert kwargs["rotation_uom"] == "rad" + assert kwargs["z_is_up"] is True # z_increasing_downward=True → z_is_up=True (negate to z-up output) 
+ + def test_none_returns_default(self): + info = extract_crs_info(None) + assert info.x_offset == 0.0 + assert info.z_increasing_downward is False + assert info.source_type is None + + +class TestUomToStr: + def test_plain_string(self): + assert _uom_to_str("m") == "m" + + def test_enum_like_with_dot(self): + assert _uom_to_str("LengthUom.ft") == "ft" + + def test_none_returns_none(self): + assert _uom_to_str(None) is None + + def test_empty_after_split_returns_none(self): + assert _uom_to_str("") is None + + +# =========================================================================== +# RESQML v2.0.1 — testingPackageCpp.epc +# =========================================================================== + + +class TestV201LocalTime3DCrs: + """ + LocalTime3DCrs uuid=dbd637d5-4528-4145-908b-5f7136824f6d + xoffset=1.0 yoffset=0.1 zoffset=15.0 projected_uom=M z_down=True + + ZIncreasingDownward=true in the raw file. The linked VerticalCrs is an + inline ``VerticalUnknownCrs`` placeholder that carries no direction field, + so the sentinel (None) correctly leaves the top-level value unchanged. 
+ """ + + UUID = "dbd637d5-4528-4145-908b-5f7136824f6d" + + @pytest.fixture(scope="class") + def info(self, epc20): + obj = epc20.get_object_by_uuid(self.UUID)[0] + return extract_crs_info(obj) + + def test_source_type(self, info): + assert "LocalTime3DCrs" in info.source_type + + def test_offsets(self, info): + assert info.x_offset == pytest.approx(1.0) + assert info.y_offset == pytest.approx(0.1) + assert info.z_offset == pytest.approx(15.0) + + def test_projected_uom(self, info): + # xsdata v2.0.1 enum → _uom_to_str keeps the enum member name casing + assert info.projected_uom is not None + assert info.projected_uom.lower() == "m" + + def test_vertical_uom(self, info): + assert info.vertical_uom is not None + assert info.vertical_uom.lower() == "m" + + def test_z_increasing_downward(self, info): + # ZIncreasingDownward=true in the raw file; VerticalUnknownCrs has no + # direction field so the sentinel leaves the parent value unchanged. + assert info.z_increasing_downward is True + + def test_no_epsg(self, info): + assert info.projected_epsg_code is None + assert info.vertical_epsg_code is None + + def test_rotation_zero(self, info): + assert info.areal_rotation_value == pytest.approx(0.0) + + +class TestV201LocalDepth3DCrs: + """ + LocalDepth3DCrs uuid=0ae56ef3-fc79-405b-8deb-6942e0f2e77c + projected_epsg=23031 projected_uom=M z_down=True offsets all zero + + ZIncreasingDownward=true in the raw file. The linked VerticalCrs is an + inline ``VerticalUnknownCrs`` placeholder that carries no direction field, + so the sentinel (None) correctly leaves the top-level value unchanged. 
+ """ + + UUID = "0ae56ef3-fc79-405b-8deb-6942e0f2e77c" + + @pytest.fixture(scope="class") + def info(self, epc20): + obj = epc20.get_object_by_uuid(self.UUID)[0] + return extract_crs_info(obj) + + def test_source_type(self, info): + assert "LocalDepth3DCrs" in info.source_type + + def test_projected_epsg(self, info): + assert info.projected_epsg_code == 23031 + + def test_projected_uom(self, info): + assert info.projected_uom is not None + assert info.projected_uom.lower() == "m" + + def test_z_increasing_downward(self, info): + # ZIncreasingDownward=true in the raw file; VerticalUnknownCrs has no + # direction field so the sentinel leaves the parent value unchanged. + assert info.z_increasing_downward is True + + def test_offsets_zero(self, info): + assert info.x_offset == pytest.approx(0.0) + assert info.y_offset == pytest.approx(0.0) + assert info.z_offset == pytest.approx(0.0) + + +class TestV201LocalEngineeringCompoundCrs: + """ + LocalEngineeringCompoundCrs uuid=95330cec-164c-4165-9fb9-c56477ae7f8a + (EML v2.3 object inside the v2.0.1 EPC) + projected_epsg=23031 (only when workspace provided) + z_down=True azref=grid north + """ + + UUID = "95330cec-164c-4165-9fb9-c56477ae7f8a" + + def test_z_down_inline_no_workspace(self, epc20): + """VerticalAxis direction is readable without workspace.""" + obj = epc20.get_object_by_uuid(self.UUID)[0] + info = extract_crs_info(obj, workspace=None) + assert info.z_increasing_downward is True + + def test_projected_epsg_requires_workspace(self, epc20): + """EPSG is on linked LocalEngineering2DCrs — only available via workspace.""" + obj = epc20.get_object_by_uuid(self.UUID)[0] + info_no_ws = extract_crs_info(obj, workspace=None) + assert info_no_ws.projected_epsg_code is None + + info_ws = extract_crs_info(obj, workspace=epc20) + assert info_ws.projected_epsg_code == 23031 + + def test_full_resolution_with_workspace(self, epc20): + obj = epc20.get_object_by_uuid(self.UUID)[0] + info = extract_crs_info(obj, workspace=epc20) 
+ assert info.projected_epsg_code == 23031 + assert info.projected_uom is not None + assert info.projected_uom.lower() == "m" + assert info.vertical_uom is not None + assert info.vertical_uom.lower() == "m" + assert info.z_increasing_downward is True + assert info.azimuth_reference == "grid north" + + +class TestV201LocalEngineering2DCrs: + """ + LocalEngineering2DCrs uuid=811f8e68-c0e4-5f90-b9cf-03f7e3d53ca4 + (EML v2.3 object inside the v2.0.1 EPC) + projected_epsg=23031 projected_uom=M azref=grid north offsets zero + """ + + UUID = "811f8e68-c0e4-5f90-b9cf-03f7e3d53ca4" + + @pytest.fixture(scope="class") + def info(self, epc20): + obj = epc20.get_object_by_uuid(self.UUID)[0] + return extract_crs_info(obj) + + def test_projected_epsg(self, info): + assert info.projected_epsg_code == 23031 + + def test_projected_uom(self, info): + assert info.projected_uom is not None + assert info.projected_uom.lower() == "m" + + def test_no_vertical_uom(self, info): + # 2D CRS carries no Z information + assert info.vertical_uom is None + + def test_z_increasing_downward(self, info): + assert info.z_increasing_downward is False + + def test_azimuth_reference(self, info): + assert info.azimuth_reference == "grid north" + + def test_offsets_zero(self, info): + assert info.x_offset == pytest.approx(0.0) + assert info.y_offset == pytest.approx(0.0) + + +class TestV201VerticalCrs: + """ + VerticalCrs uuid=1f6cf904-336c-5202-a13d-7c9b142cd406 + (EML v2.3 object inside the v2.0.1 EPC) + vertical_uom=M z_down=True no projected info + """ + + UUID = "1f6cf904-336c-5202-a13d-7c9b142cd406" + + @pytest.fixture(scope="class") + def info(self, epc20): + obj = epc20.get_object_by_uuid(self.UUID)[0] + return extract_crs_info(obj) + + def test_vertical_uom(self, info): + assert info.vertical_uom is not None + assert info.vertical_uom.lower() == "m" + + def test_z_increasing_downward(self, info): + assert info.z_increasing_downward is True + + def test_no_projected_info(self, info): + assert 
info.projected_epsg_code is None + assert info.projected_uom is None + + +class TestV201Grid2DCrsResolution: + """ + Grid2DRepresentation → local_crs DOR → resolved CRS → extract_crs_info. + """ + + def test_grid_030a82f6_resolves_to_local_time_crs(self, epc20): + grid = epc20.get_object_by_uuid("030a82f6-10a7-4ecf-af03-54749e098624")[0] + crs = _resolve_crs_from_grid(grid, epc20) + assert crs is not None + assert crs.uuid == "dbd637d5-4528-4145-908b-5f7136824f6d" + assert "LocalTime3DCrs" in type(crs).__name__ + info = extract_crs_info(crs, workspace=epc20) + assert info.x_offset == pytest.approx(1.0) + assert info.y_offset == pytest.approx(0.1) + assert info.z_offset == pytest.approx(15.0) + + def test_grid_aa5b90f1_resolves_to_local_depth_crs(self, epc20): + grid = epc20.get_object_by_uuid("aa5b90f1-2eab-4fa6-8720-69dd4fd51a4d")[0] + crs = _resolve_crs_from_grid(grid, epc20) + assert crs is not None + assert crs.uuid == "0ae56ef3-fc79-405b-8deb-6942e0f2e77c" + info = extract_crs_info(crs, workspace=epc20) + assert info.projected_epsg_code == 23031 + + def test_grid_4e56b0e4_resolves_to_same_depth_crs(self, epc20): + grid = epc20.get_object_by_uuid("4e56b0e4-2cd1-4efa-97dd-95f72bcf9f80")[0] + crs = _resolve_crs_from_grid(grid, epc20) + assert crs is not None + assert crs.uuid == "0ae56ef3-fc79-405b-8deb-6942e0f2e77c" + + +# =========================================================================== +# RESQML v2.2 / EML v2.3 — testingPackageCpp22.epc +# =========================================================================== + + +class TestV22LocalEngineering2DCrsNoEpsg: + """ + LocalEngineering2DCrs uuid=997796f5-da9d-5175-9fb7-e592957b73fb + x=1.0 y=0.1 projected_uom=M no EPSG azref=grid north + """ + + UUID = "997796f5-da9d-5175-9fb7-e592957b73fb" + + @pytest.fixture(scope="class") + def info(self, epc22): + obj = epc22.get_object_by_uuid(self.UUID)[0] + return extract_crs_info(obj) + + def test_offsets(self, info): + assert info.x_offset == 
pytest.approx(1.0) + assert info.y_offset == pytest.approx(0.1) + assert info.z_offset == pytest.approx(0.0) + + def test_no_epsg(self, info): + assert info.projected_epsg_code is None + + def test_projected_uom(self, info): + assert info.projected_uom is not None + assert info.projected_uom.lower() == "m" + + def test_azimuth_reference(self, info): + assert info.azimuth_reference == "grid north" + + def test_z_increasing_downward(self, info): + assert info.z_increasing_downward is False + + +class TestV22LocalEngineering2DCrsWithEpsg: + """ + LocalEngineering2DCrs uuid=671ffdeb-f25c-513a-a4a2-1774d3ac20c6 + projected_epsg=23031 projected_uom=M azref=grid north offsets zero + """ + + UUID = "671ffdeb-f25c-513a-a4a2-1774d3ac20c6" + + @pytest.fixture(scope="class") + def info(self, epc22): + obj = epc22.get_object_by_uuid(self.UUID)[0] + return extract_crs_info(obj) + + def test_projected_epsg(self, info): + assert info.projected_epsg_code == 23031 + + def test_projected_uom(self, info): + assert info.projected_uom is not None + assert info.projected_uom.lower() == "m" + + def test_azimuth_reference(self, info): + assert info.azimuth_reference == "grid north" + + def test_offsets_zero(self, info): + assert info.x_offset == pytest.approx(0.0) + assert info.y_offset == pytest.approx(0.0) + + +class TestV22CompoundCrsWithOffsets: + """ + LocalEngineeringCompoundCrs uuid=f0e9f421-b902-4392-87d8-6495c02f2fbe + Links to LocalEngineering2DCrs (997796f5) with x=1.0, y=0.1. + z=15.0 z_down=True no projected EPSG. + Note: the inline VerticalAxis uses a time UOM (S), the resolved + VerticalCrs uses depth UOM (M) — demonstrates with/without workspace. 
+ """ + + UUID = "f0e9f421-b902-4392-87d8-6495c02f2fbe" + + def test_inline_z_offset_without_workspace(self, epc22): + obj = epc22.get_object_by_uuid(self.UUID)[0] + info = extract_crs_info(obj, workspace=None) + assert info.z_offset == pytest.approx(15.0) + + def test_inline_z_direction_without_workspace(self, epc22): + obj = epc22.get_object_by_uuid(self.UUID)[0] + info = extract_crs_info(obj, workspace=None) + assert info.z_increasing_downward is True + + def test_no_horizontal_info_without_workspace(self, epc22): + obj = epc22.get_object_by_uuid(self.UUID)[0] + info = extract_crs_info(obj, workspace=None) + assert info.projected_epsg_code is None + assert info.x_offset == pytest.approx(0.0) + assert info.y_offset == pytest.approx(0.0) + + def test_full_resolution_with_workspace(self, epc22): + obj = epc22.get_object_by_uuid(self.UUID)[0] + info = extract_crs_info(obj, workspace=epc22) + # Horizontal from linked LocalEngineering2DCrs (997796f5) + assert info.x_offset == pytest.approx(1.0) + assert info.y_offset == pytest.approx(0.1) + assert info.z_offset == pytest.approx(15.0) + assert info.projected_uom is not None + assert info.projected_uom.lower() == "m" + assert info.projected_epsg_code is None + assert info.z_increasing_downward is True + assert info.azimuth_reference == "grid north" + + def test_vertical_uom_resolved_from_vertical_crs(self, epc22): + """With workspace the vertical UOM comes from the linked VerticalCrs (M), not the inline time axis (S).""" + obj = epc22.get_object_by_uuid(self.UUID)[0] + info = extract_crs_info(obj, workspace=epc22) + assert info.vertical_uom is not None + assert info.vertical_uom.lower() == "m" + + +class TestV22CompoundCrsWithEpsg: + """ + LocalEngineeringCompoundCrs uuid=6a18c177-93be-41ac-9084-f84bbb31f46d + projected_epsg=23031 z_down=True all offsets zero vertical_uom=M + """ + + UUID = "6a18c177-93be-41ac-9084-f84bbb31f46d" + + @pytest.fixture(scope="class") + def info(self, epc22): + obj = 
epc22.get_object_by_uuid(self.UUID)[0] + return extract_crs_info(obj, workspace=epc22) + + def test_projected_epsg(self, info): + assert info.projected_epsg_code == 23031 + + def test_projected_uom(self, info): + assert info.projected_uom is not None + assert info.projected_uom.lower() == "m" + + def test_vertical_uom(self, info): + assert info.vertical_uom is not None + assert info.vertical_uom.lower() == "m" + + def test_z_increasing_downward(self, info): + assert info.z_increasing_downward is True + + def test_offsets_zero(self, info): + assert info.x_offset == pytest.approx(0.0) + assert info.y_offset == pytest.approx(0.0) + assert info.z_offset == pytest.approx(0.0) + + def test_azimuth_reference(self, info): + assert info.azimuth_reference == "grid north" + + +class TestV22VerticalCrs: + """ + Two standalone VerticalCrs objects in the v2.2 EPC. + Both: vertical_uom=M z_down=True + """ + + @pytest.mark.parametrize("uuid", [ + "65cd199f-156b-5112-ad3e-b4f54a2aa77b", + "355174db-6226-57ae-a5a6-92f33825fed4", + ]) + def test_vertical_uom_and_direction(self, uuid, epc22): + obj = epc22.get_object_by_uuid(uuid)[0] + info = extract_crs_info(obj) + assert info.vertical_uom is not None + assert info.vertical_uom.lower() == "m" + assert info.z_increasing_downward is True + assert info.projected_epsg_code is None + assert info.projected_uom is None + + +# =========================================================================== +# Legacy delegate functions (helper.py forwards to extract_crs_info) +# =========================================================================== + + +class TestDelegateFunctions: + """ + Verify the five legacy helpers in ``helper.py`` still work correctly now + that they delegate to ``extract_crs_info``. + + Uses LocalDepth3DCrs (0ae56ef3) and LocalTime3DCrs (dbd637d5) from epc20. 
+ """ + + def test_is_z_reversed_depth_crs_true(self, epc20): + # LocalDepth3DCrs has ZIncreasingDownward=true; VerticalUnknownCrs sub-object + # carries no direction so the sentinel leaves the top-level value intact. + crs = epc20.get_object_by_uuid("0ae56ef3-fc79-405b-8deb-6942e0f2e77c")[0] + assert is_z_reversed(crs) is True + + def test_is_z_reversed_compound_crs_true(self, epc20): + # CompoundCrs 95330cec has z_increasing_downward=True + crs = epc20.get_object_by_uuid("95330cec-164c-4165-9fb9-c56477ae7f8a")[0] + assert is_z_reversed(crs) is True + + def test_is_z_reversed_none(self): + assert is_z_reversed(None) is False + + def test_get_projected_epsg_code(self, epc20): + crs = epc20.get_object_by_uuid("0ae56ef3-fc79-405b-8deb-6942e0f2e77c")[0] + assert get_projected_epsg_code(crs) == 23031 + + def test_get_projected_epsg_code_no_epsg(self, epc20): + crs = epc20.get_object_by_uuid("dbd637d5-4528-4145-908b-5f7136824f6d")[0] + assert get_projected_epsg_code(crs) is None + + def test_get_projected_uom(self, epc20): + crs = epc20.get_object_by_uuid("0ae56ef3-fc79-405b-8deb-6942e0f2e77c")[0] + uom = get_projected_uom(crs) + assert uom is not None + assert uom.lower() == "m" + + def test_get_vertical_epsg_code_none(self, epc20): + # Neither CRS in this EPC has a vertical EPSG code + crs = epc20.get_object_by_uuid("0ae56ef3-fc79-405b-8deb-6942e0f2e77c")[0] + assert get_vertical_epsg_code(crs) is None + + def test_get_crs_offsets_and_angle_local_time(self, epc20): + crs = epc20.get_object_by_uuid("dbd637d5-4528-4145-908b-5f7136824f6d")[0] + x, y, z, (angle, uom) = get_crs_offsets_and_angle(crs) + assert x == pytest.approx(1.0) + assert y == pytest.approx(0.1) + assert z == pytest.approx(15.0) + assert angle == pytest.approx(0.0) + + def test_get_crs_offsets_and_angle_none(self): + x, y, z, (angle, uom) = get_crs_offsets_and_angle(None) + assert x == 0.0 + assert y == 0.0 + assert z == 0.0 + assert angle == 0.0 + assert uom == "rad" + + +# 
--------------------------------------------------------------------------- +# Tests for apply_axis_order_swap +# --------------------------------------------------------------------------- + + +import numpy as np +from energyml.utils.data.crs import apply_axis_order_swap, apply_from_crs_info + + +class TestApplyAxisOrderSwap: + """Unit tests for :func:`apply_axis_order_swap`.""" + + def _pts(self) -> np.ndarray: + return np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float64) + + def test_none_axis_order_no_swap(self): + pts = self._pts() + result = apply_axis_order_swap(pts, None) + np.testing.assert_array_equal(result[:, 0], [1.0, 4.0]) + np.testing.assert_array_equal(result[:, 1], [2.0, 5.0]) + + def test_easting_northing_no_swap(self): + pts = self._pts() + result = apply_axis_order_swap(pts, "easting northing") + np.testing.assert_array_equal(result[:, 0], [1.0, 4.0]) + np.testing.assert_array_equal(result[:, 1], [2.0, 5.0]) + + def test_northing_easting_swaps_xy(self): + pts = self._pts() + result = apply_axis_order_swap(pts, "northing easting") + np.testing.assert_array_equal(result[:, 0], [2.0, 5.0]) + np.testing.assert_array_equal(result[:, 1], [1.0, 4.0]) + np.testing.assert_array_equal(result[:, 2], [3.0, 6.0]) + + def test_north_east_swaps_xy(self): + pts = self._pts() + result = apply_axis_order_swap(pts, "north east") + np.testing.assert_array_equal(result[:, 0], [2.0, 5.0]) + np.testing.assert_array_equal(result[:, 1], [1.0, 4.0]) + + def test_latitude_longitude_swaps_xy(self): + pts = self._pts() + result = apply_axis_order_swap(pts, "latitude longitude") + np.testing.assert_array_equal(result[:, 0], [2.0, 5.0]) + np.testing.assert_array_equal(result[:, 1], [1.0, 4.0]) + + def test_inplace_modification(self): + pts = self._pts() + original_id = id(pts) + result = apply_axis_order_swap(pts, "northing easting") + assert id(result) == original_id # same array object + + def test_z_column_untouched(self): + pts = self._pts() + 
apply_axis_order_swap(pts, "northing easting") + np.testing.assert_array_equal(pts[:, 2], [3.0, 6.0]) + + +# --------------------------------------------------------------------------- +# Tests for apply_from_crs_info +# --------------------------------------------------------------------------- + + +class TestApplyFromCrsInfo: + """Unit tests for :func:`apply_from_crs_info`.""" + + def _pts(self, x=1.0, y=2.0, z=3.0) -> np.ndarray: + return np.array([[x, y, z]], dtype=np.float64) + + # --- Translation ------------------------------------------------------- + + def test_translation_only(self): + # Translation is applied in the local z-down space, then Z is negated + # to produce z-up output (z_increasing_downward=True → flip). + info = CrsInfo(x_offset=10.0, y_offset=20.0, z_offset=5.0, z_increasing_downward=True) + pts = self._pts(1.0, 2.0, 3.0) + result = apply_from_crs_info(pts, info) + assert result[0, 0] == pytest.approx(11.0) + assert result[0, 1] == pytest.approx(22.0) + assert result[0, 2] == pytest.approx(-8.0) # 3+5=8, then flipped to z-up + + # --- Z-flip ------------------------------------------------------------ + + def test_no_z_flip_when_z_up_input(self): + """z_increasing_downward=False means the input is already z-up: no flip needed.""" + info = CrsInfo(z_increasing_downward=False) + pts = self._pts(0.0, 0.0, 5.0) + result = apply_from_crs_info(pts, info) + assert result[0, 2] == pytest.approx(5.0) + + def test_z_flip_when_z_down_input(self): + """z_increasing_downward=True means input is depth-positive: negate to z-up output.""" + info = CrsInfo(z_increasing_downward=True) + pts = self._pts(0.0, 0.0, 5.0) + result = apply_from_crs_info(pts, info) + assert result[0, 2] == pytest.approx(-5.0) + + # --- Clockwise rotation ------------------------------------------------ + + def test_rotation_90_degrees_cw(self): + """90° CW rotation: (1, 0) → (0, -1) [y' = -x·sin + y·cos].""" + info = CrsInfo(areal_rotation_value=90.0, areal_rotation_uom="degr", 
z_increasing_downward=True) + pts = self._pts(1.0, 0.0, 0.0) + result = apply_from_crs_info(pts, info) + # CW 90°: x' = x·cos(90) + y·sin(90) = 0 + 0 = 0 + # y' = -x·sin(90) + y·cos(90) = -1 + 0 = -1 + assert result[0, 0] == pytest.approx(0.0, abs=1e-10) + assert result[0, 1] == pytest.approx(-1.0, abs=1e-10) + + def test_rotation_45_degrees_cw(self): + """45° CW rotation of (1, 1) → (√2, 0) in depth z convention.""" + info = CrsInfo(areal_rotation_value=45.0, areal_rotation_uom="degr", z_increasing_downward=True) + pts = self._pts(1.0, 1.0, 0.0) + result = apply_from_crs_info(pts, info) + sqrt2 = math.sqrt(2.0) + assert result[0, 0] == pytest.approx(sqrt2, abs=1e-10) + assert result[0, 1] == pytest.approx(0.0, abs=1e-10) + + def test_zero_rotation_no_change(self): + info = CrsInfo(areal_rotation_value=0.0, z_increasing_downward=True) + pts = self._pts(3.0, 4.0, 0.0) + result = apply_from_crs_info(pts, info) + assert result[0, 0] == pytest.approx(3.0) + assert result[0, 1] == pytest.approx(4.0) + + # --- Axis-order swap --------------------------------------------------- + + def test_northing_first_axis_order_swaps_xy(self): + info = CrsInfo(projected_axis_order="northing easting", z_increasing_downward=True) + pts = self._pts(10.0, 20.0, 0.0) + result = apply_from_crs_info(pts, info) + assert result[0, 0] == pytest.approx(20.0) + assert result[0, 1] == pytest.approx(10.0) + + # --- inplace=False ---------------------------------------------------- + + def test_inplace_false_returns_copy(self): + info = CrsInfo(x_offset=5.0, z_increasing_downward=True) + pts = self._pts(1.0, 2.0, 3.0) + original = pts.copy() + result = apply_from_crs_info(pts, info, inplace=False) + # Original must be unchanged + np.testing.assert_array_equal(pts, original) + # Result must be translated + assert result[0, 0] == pytest.approx(6.0) + + # --- AzimuthReference warning ----------------------------------------- + + def test_true_north_azimuth_reference_warns(self, caplog): + import 
logging + info = CrsInfo(azimuth_reference="true north", z_increasing_downward=True) + pts = self._pts() + with caplog.at_level(logging.WARNING, logger="energyml.utils.data.crs"): + apply_from_crs_info(pts, info) + assert any("true north" in r.message.lower() for r in caplog.records) + + def test_magnetic_north_azimuth_reference_warns(self, caplog): + import logging + info = CrsInfo(azimuth_reference="magnetic north", z_increasing_downward=True) + pts = self._pts() + with caplog.at_level(logging.WARNING, logger="energyml.utils.data.crs"): + apply_from_crs_info(pts, info) + assert any("magnetic north" in r.message.lower() for r in caplog.records) + + def test_grid_north_no_warning(self, caplog): + import logging + info = CrsInfo(azimuth_reference="grid north", z_increasing_downward=True) + pts = self._pts() + with caplog.at_level(logging.WARNING, logger="energyml.utils.data.crs"): + apply_from_crs_info(pts, info) + assert not any("north" in r.message.lower() for r in caplog.records) + + # --- Full pipeline order verification --------------------------------- + + def test_pipeline_order_rotation_then_translation(self): + """Rotation must be applied BEFORE translation. + + Rotate (0, 1) by 90° CW → (1, 0), then translate by (10, 0): + result should be (11, 0), NOT (0, -1 + 10) = (0, 9). + """ + info = CrsInfo( + areal_rotation_value=90.0, + areal_rotation_uom="degr", + x_offset=10.0, + z_increasing_downward=True, + ) + pts = np.array([[0.0, 1.0, 0.0]], dtype=np.float64) + result = apply_from_crs_info(pts, info) + # CW 90°: x' = 0*cos90 + 1*sin90 = 1, then +10 → 11 + # y' = -0*sin90 + 1*cos90 = 0, then +0 → 0 + assert result[0, 0] == pytest.approx(11.0, abs=1e-10) + assert result[0, 1] == pytest.approx(0.0, abs=1e-10) + diff --git a/energyml-utils/tests/test_epc.py b/energyml-utils/tests/test_epc.py index de6ea53..643a6c3 100644 --- a/energyml-utils/tests/test_epc.py +++ b/energyml-utils/tests/test_epc.py @@ -1,197 +1,1068 @@ # Copyright (c) 2023-2024 Geosiris. 
# SPDX-License-Identifier: Apache-2.0 +""" +Comprehensive unit tests for Epc class functionality. + +Tests cover: +1. Object lifecycle (add, get, remove) +2. Export functionality (export_file, export_io) +3. Relationship computation (compute_rels) - only at export time +4. HDF5 array operations (write_array, read_array) +5. DOR creation and handling (as_dor) +6. File path generation (gen_energyml_object_path) +7. External files and raw files handling +""" +import os +import tempfile + +import pytest +import numpy as np + from energyml.eml.v2_0.commonv2 import Citation as Citation20 -from energyml.eml.v2_0.commonv2 import ( - DataObjectReference as DataObjectReference201, +from energyml.eml.v2_0.commonv2 import DataObjectReference as DataObjectReference201, EpcExternalPartReference +from energyml.eml.v2_3.commonv2 import Citation, DataObjectReference, ExternalDataArray, ExternalDataArrayPart +from energyml.resqml.v2_0_1.resqmlv2 import ( + FaultInterpretation, + TriangulatedSetRepresentation as TriangulatedSetRepresentation20, + TrianglePatch as TrianglePatch20, + PointGeometry as PointGeometry20, +) +from energyml.resqml.v2_2.resqmlv2 import ( + TriangulatedSetRepresentation, + BoundaryFeatureInterpretation, + BoundaryFeature, + HorizonInterpretation, + TrianglePatch, + PointGeometry, + Point3DExternalArray, ) -from energyml.eml.v2_3.commonv2 import Citation -from energyml.eml.v2_3.commonv2 import DataObjectReference -from energyml.resqml.v2_0_1.resqmlv2 import FaultInterpretation -from energyml.resqml.v2_2.resqmlv2 import TriangulatedSetRepresentation from energyml.utils.epc import ( + Epc, as_dor, - get_obj_identifier, - gen_energyml_object_path, EpcExportVersion, ) +from energyml.utils.epc_utils import gen_energyml_object_path from energyml.utils.introspection import ( epoch_to_date, epoch, gen_uuid, get_content_type_from_class, - get_obj_pkg_pkgv_type_uuid_version, get_obj_uri, get_qualified_type_from_class, - set_attribute_from_path, -) - -fi_cit = Citation20( 
- title="An interpretation", - originator="Valentin", - creation=epoch_to_date(epoch()), - editor="test", - format="Geosiris", - last_update=epoch_to_date(epoch()), + get_obj_identifier, ) +from energyml.utils.constants import EPCRelsRelationshipType, MimeType -fi = FaultInterpretation( - citation=fi_cit, - uuid=gen_uuid(), - object_version="0", -) +CST_H5_PATH = "my_h5_filepath.h5" -tr_cit = Citation( - title="--", - # title="test title", - originator="Valentin", - creation=epoch_to_date(epoch()), - editor="test", - format="Geosiris", - last_update=epoch_to_date(epoch()), -) -dor = DataObjectReference( - uuid=fi.uuid, - title="a DOR title", - object_version="0", - qualified_type="a wrong qualified type", -) +@pytest.fixture +def temp_epc_file(): + """Create a temporary EPC file path for testing.""" + fd, temp_path = tempfile.mkstemp(suffix=".epc") + os.close(fd) + os.unlink(temp_path) -dor_correct20 = DataObjectReference201( - uuid=fi.uuid, - title="a DOR title", - content_type="application/x-resqml+xml;version=2.0;type=obj_FaultInterpretation", - version_string="0", -) + yield temp_path -dor_correct23 = DataObjectReference( - uuid=fi.uuid, - title="a DOR title", - object_version="0", - qualified_type="resqml20.obj_FaultInterpretation", -) + if os.path.exists(temp_path): + os.unlink(temp_path) -tr = TriangulatedSetRepresentation( - citation=tr_cit, - uuid=gen_uuid(), - represented_object=dor_correct23, -) -tr_versioned = TriangulatedSetRepresentation( - citation=tr_cit, - uuid=gen_uuid(), - represented_object=dor_correct23, - object_version="3", -) +@pytest.fixture +def sample_objects(): + """Create sample EnergyML objects for testing.""" + # Create a BoundaryFeature + bf = BoundaryFeature( + citation=Citation( + title="Test Boundary Feature", + originator="Test", + creation=epoch_to_date(epoch()), + ), + uuid="25773477-ffee-4cc2-867d-000000000001", + object_version="1.0", + ) -def test_get_obj_identifier(): - assert get_obj_identifier(tr) == tr.uuid + "." 
- assert get_obj_identifier(fi) == fi.uuid + ".0" - assert get_obj_identifier(dor_correct20) == dor_correct20.uuid + ".0" - assert get_obj_identifier(dor_correct23) == dor_correct23.uuid + ".0" - - -def test_get_obj_pkg_pkgv_type_uuid_version_obj_201(): - ( - domain, - domain_version, - object_type, - obj_uuid, - obj_version, - ) = get_obj_pkg_pkgv_type_uuid_version(fi) - assert domain == "resqml" - assert domain_version == "20" - assert object_type == "obj_FaultInterpretation" - assert obj_uuid == fi.uuid - assert obj_version == fi.object_version - - -def test_get_obj_pkg_pkgv_type_uuid_version_obj_22(): - ( - domain, - domain_version, - object_type, - obj_uuid, - obj_version, - ) = get_obj_pkg_pkgv_type_uuid_version(tr) - assert domain == "resqml" - assert domain_version == "22" - assert object_type == "TriangulatedSetRepresentation" - assert obj_uuid == tr.uuid - assert obj_version == tr.object_version - - -def test_get_obj_uri(): - assert str(get_obj_uri(tr)) == f"eml:///resqml22.TriangulatedSetRepresentation({tr.uuid})" - assert ( - str(get_obj_uri(tr, "/MyDataspace/")) - == f"eml:///dataspace('/MyDataspace/')/resqml22.TriangulatedSetRepresentation({tr.uuid})" + # Create a BoundaryFeatureInterpretation + bfi = BoundaryFeatureInterpretation( + citation=Citation( + title="Test Boundary Feature Interpretation", + originator="Test", + creation=epoch_to_date(epoch()), + ), + uuid="25773477-ffee-4cc2-867d-000000000002", + object_version="1.0", + interpreted_feature=as_dor(bf), ) - assert ( - str(get_obj_uri(fi)) == f"eml:///resqml20.obj_FaultInterpretation(uuid={fi.uuid},version='{fi.object_version}')" + # Create a HorizonInterpretation + horizon_interp = HorizonInterpretation( + citation=Citation( + title="Test HorizonInterpretation", + originator="Test", + creation=epoch_to_date(epoch()), + ), + interpreted_feature=as_dor(bf), + uuid="25773477-ffee-4cc2-867d-000000000003", + object_version="1.0", ) - assert ( - str(get_obj_uri(fi, "/MyDataspace/")) - == 
f"eml:///dataspace('/MyDataspace/')/resqml20.obj_FaultInterpretation(uuid={fi.uuid},version='{fi.object_version}')" + + # Create a TriangulatedSetRepresentation + trset = TriangulatedSetRepresentation( + citation=Citation( + title="Test TriangulatedSetRepresentation", + originator="Test", + creation=epoch_to_date(epoch()), + ), + uuid="25773477-ffee-4cc2-867d-000000000004", + object_version="1.0", + represented_object=as_dor(horizon_interp), + triangle_patch=[ + TrianglePatch( + geometry=PointGeometry( + points=Point3DExternalArray( + coordinates=ExternalDataArray( + external_data_array_part=[ + ExternalDataArrayPart( + path_in_external_file="/points", uri=CST_H5_PATH, mime_type=MimeType.HDF5.value + ) + ] + ) + ) + ) + ) + ], ) + # Resqml 2.0.1 FaultInterpretation for additional tests + fi_cit = Citation20( + title="An interpretation", + originator="Valentin", + creation=epoch_to_date(epoch()), + editor="test", + format="Geosiris", + last_update=epoch_to_date(epoch()), + ) -def test_gen_energyml_object_path(): - assert gen_energyml_object_path(tr) == f"TriangulatedSetRepresentation_{tr.uuid}.xml" - assert ( - gen_energyml_object_path(tr, EpcExportVersion.EXPANDED) - == f"namespace_resqml22/TriangulatedSetRepresentation_{tr.uuid}.xml" + fi = FaultInterpretation( + citation=fi_cit, + uuid=gen_uuid(), + object_version="0", ) + # 201 + external_ref = EpcExternalPartReference( + uuid="25773477-ffee-4cc2-867d-000000000005", + citation=Citation20(title="An external reference", originator="Valentin", creation=epoch_to_date(epoch())), + ) -def test_gen_energyml_object_path_versioned(): - assert gen_energyml_object_path(tr_versioned) == f"TriangulatedSetRepresentation_{tr_versioned.uuid}.xml" - assert ( - gen_energyml_object_path(tr_versioned, EpcExportVersion.EXPANDED) - == f"namespace_resqml22/version_{tr_versioned.object_version}/TriangulatedSetRepresentation_{tr_versioned.uuid}.xml" + tr_set_20 = TriangulatedSetRepresentation20( + citation=Citation20( + title="Test 
TriangulatedSetRepresentation 2.0", originator="Test", creation=epoch_to_date(epoch()) + ), + uuid="25773477-ffee-4cc2-867d-000000000006", + object_version="1.0", + represented_interpretation=as_dor(horizon_interp, "eml20.DataObjectReference"), + triangle_patch=[ + TrianglePatch20(geometry=PointGeometry20(local_crs=as_dor(external_ref, "eml20.DataObjectReference"))) + ], ) + return { + "bf": bf, + "bfi": bfi, + "trset": trset, + "trset20": tr_set_20, + "external_ref": external_ref, + "horizon_interp": horizon_interp, + "fi": fi, + } -def test_as_dor_object(): - dor_fi = as_dor(fi) - assert dor_fi.title == fi.citation.title - assert dor_fi.uuid == fi.uuid - assert dor_fi.qualified_type == get_qualified_type_from_class(fi) +class TestObjectLifecycle: + """Test basic object lifecycle operations.""" + def test_add_object(self, sample_objects): + """Test adding objects to Epc.""" + epc = Epc() + bf = sample_objects["bf"] -def test_as_dor_another_dor(): - dor_dor20 = as_dor(dor_correct20, "eml20.DataObjectReference") - assert dor_dor20.title == dor_correct20.title - assert dor_dor20.uuid == fi.uuid - assert dor_dor20.content_type == get_content_type_from_class(fi) + result = epc.add_object(bf) + assert result is True + assert len(epc.energyml_objects) == 1 + assert bf in epc.energyml_objects - dor_dor20_bis = as_dor(dor_correct23, "eml20.DataObjectReference") - assert dor_dor20_bis.title == dor_correct23.title - assert dor_dor20_bis.uuid == fi.uuid - assert dor_dor20_bis.content_type == get_content_type_from_class(fi) + def test_add_multiple_objects(self, sample_objects): + """Test adding multiple objects.""" + epc = Epc() - dor_dor23 = as_dor(dor_correct20, "eml23.DataObjectReference") - assert dor_dor23.title == dor_correct20.title - assert dor_dor23.uuid == fi.uuid - assert dor_dor23.qualified_type == get_qualified_type_from_class(fi) + epc.add_object(sample_objects["bf"]) + epc.add_object(sample_objects["bfi"]) + epc.add_object(sample_objects["horizon_interp"]) + 
epc.add_object(sample_objects["trset"]) + assert len(epc) == 4 + assert len(epc.energyml_objects) == 4 -def test_as_dor_uri(): - dor_dor20 = as_dor( - "eml:///dataspace('test')/resqml22.TriangulatedSetRepresentation(0a2ba9e1-1018-4bfd-8fec-1c8cef13fa52)", - "eml20.DataObjectReference", - ) - assert dor_dor20.title is None - assert dor_dor20.uuid == "0a2ba9e1-1018-4bfd-8fec-1c8cef13fa52" - assert dor_dor20.content_type == "application/x-resqml+xml;version=2.2;type=TriangulatedSetRepresentation" + def test_get_object_by_identifier(self, sample_objects): + """Test retrieving object by identifier.""" + epc = Epc() + bf = sample_objects["bf"] - dor_dor23 = as_dor( - "eml:///dataspace('test')/resqml22.TriangulatedSetRepresentation(0a2ba9e1-1018-4bfd-8fec-1c8cef13fa52)", - "eml23.DataObjectReference", - ) - assert dor_dor23.title is None - assert dor_dor23.uuid == "0a2ba9e1-1018-4bfd-8fec-1c8cef13fa52" - assert dor_dor23.qualified_type == "resqml22.TriangulatedSetRepresentation" + epc.add_object(bf) + identifier = get_obj_identifier(bf) + + retrieved = epc.get_object_by_identifier(identifier) + assert retrieved is not None + assert retrieved.uuid == bf.uuid + + def test_get_object_by_uuid(self, sample_objects): + """Test retrieving objects by UUID.""" + epc = Epc() + bf = sample_objects["bf"] + bfi = sample_objects["bfi"] + + epc.add_object(bf) + epc.add_object(bfi) + + results = epc.get_object_by_uuid(bf.uuid) + assert len(results) == 1 + assert results[0].uuid == bf.uuid + + def test_remove_object(self, sample_objects): + """Test removing objects from Epc.""" + epc = Epc() + bf = sample_objects["bf"] + + epc.add_object(bf) + assert len(epc) == 1 + + identifier = get_obj_identifier(bf) + epc.remove_object(identifier) + + assert len(epc) == 0 + assert bf not in epc.energyml_objects + + def test_len(self, sample_objects): + """Test __len__ method.""" + epc = Epc() + assert len(epc) == 0 + + epc.add_object(sample_objects["bf"]) + assert len(epc) == 1 + + 
epc.add_object(sample_objects["bfi"]) + assert len(epc) == 2 + + +class TestExportFunctionality: + """Test export operations.""" + + def test_export_file(self, temp_epc_file, sample_objects): + """Test exporting Epc to file.""" + epc = Epc() + epc.add_object(sample_objects["bf"]) + epc.add_object(sample_objects["bfi"]) + + epc.export_file(temp_epc_file) + + assert os.path.exists(temp_epc_file) + assert os.path.getsize(temp_epc_file) > 0 + + def test_export_and_reload(self, temp_epc_file, sample_objects): + """Test exporting and reloading an Epc file.""" + epc = Epc() + bf = sample_objects["bf"] + bfi = sample_objects["bfi"] + + epc.add_object(bf) + epc.add_object(bfi) + epc.export_file(temp_epc_file) + + # Reload + epc2 = Epc.read_file(temp_epc_file) + assert len(epc2) == 2 + + # Verify objects are present + bf_retrieved = epc2.get_object_by_uuid(bf.uuid) + assert len(bf_retrieved) == 1 + assert bf_retrieved[0].citation.title == bf.citation.title + + def test_export_io(self, sample_objects): + """Test exporting to BytesIO.""" + epc = Epc() + epc.add_object(sample_objects["bf"]) + epc.add_object(sample_objects["bfi"]) + + io = epc.export_io() + + assert io is not None + assert io.tell() > 0 # Check that data was written + + # Try to read it back + io.seek(0) + epc2 = Epc.read_stream(io) + assert len(epc2) == 2 + + +class TestRelationships: + """Test relationship computation - Epc only computes rels at export time.""" + + def test_compute_rels_basic(self, sample_objects): + """Test basic relationship computation.""" + epc = Epc() + bf = sample_objects["bf"] + bfi = sample_objects["bfi"] + + epc.add_object(bf) + epc.add_object(bfi) + + # Compute relationships + rels_dict = epc.compute_rels() + + assert rels_dict is not None + assert len(rels_dict) > 0 + + # Check that relationships were computed + # compute_rels returns dict with rels paths as keys, not identifiers + assert any("BoundaryFeatureInterpretation" in key for key in rels_dict.keys()) + + def 
test_compute_rels_complex_chain(self, sample_objects): + """Test relationship computation with object chain.""" + epc = Epc() + bf = sample_objects["bf"] + bfi = sample_objects["bfi"] + horizon_interp = sample_objects["horizon_interp"] + trset = sample_objects["trset"] + + epc.add_object(bf) + epc.add_object(bfi) + epc.add_object(horizon_interp) + epc.add_object(trset) + + rels_dict = epc.compute_rels() + + # Verify relationships exist + assert len(rels_dict) >= 3 # At least 3 objects should have rels + + # Check specific relationships + bfi_id = get_obj_identifier(bfi) + if bfi_id in rels_dict: + bfi_rels = rels_dict[bfi_id] + dest_rels = [ + r for r in bfi_rels.relationship if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT) + ] + assert len(dest_rels) >= 1 + + def test_trset_to_interpretation_destination_relationship(self, sample_objects): + """Test that TriangulatedSetRepresentation has DESTINATION_OBJECT relationship to interpretation.""" + epc = Epc() + bf = sample_objects["bf"] + horizon_interp = sample_objects["horizon_interp"] + trset = sample_objects["trset"] + + epc.add_object(bf) + epc.add_object(horizon_interp) + epc.add_object(trset) + + rels_dict = epc.compute_rels() + + # Get trset rels path + trset_path = gen_energyml_object_path(trset, epc.export_version) + trset_rels_path = f"_rels/{trset_path}.rels" + + assert trset_rels_path in rels_dict + trset_rels = rels_dict[trset_rels_path] + + # Check for DESTINATION_OBJECT relationship to horizon_interp + dest_rels = [ + r for r in trset_rels.relationship if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT) + ] + assert len(dest_rels) >= 1 + + # Verify target points to horizon_interp + horizon_interp_path = gen_energyml_object_path(horizon_interp, epc.export_version) + assert any(horizon_interp_path in r.target for r in dest_rels) + + def test_interpretation_has_source_and_destination_relationships(self, sample_objects): + """Test that interpretation has SOURCE_OBJECT from 
trset and DESTINATION_OBJECT to feature.""" + epc = Epc() + bf = sample_objects["bf"] + horizon_interp = sample_objects["horizon_interp"] + trset = sample_objects["trset"] + + epc.add_object(bf) + epc.add_object(horizon_interp) + epc.add_object(trset) + + rels_dict = epc.compute_rels() + + # Get interpretation rels path + interp_path = gen_energyml_object_path(horizon_interp, epc.export_version) + interp_rels_path = f"_rels/{interp_path}.rels" + + assert interp_rels_path in rels_dict + interp_rels = rels_dict[interp_rels_path] + + # Check for SOURCE_OBJECT relationship from trset + source_rels = [ + r for r in interp_rels.relationship if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT) + ] + assert len(source_rels) >= 1 + + # Verify source points to trset + trset_path = gen_energyml_object_path(trset, epc.export_version) + assert any(trset_path in r.target for r in source_rels) + + # Check for DESTINATION_OBJECT relationship to feature + dest_rels = [ + r for r in interp_rels.relationship if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT) + ] + assert len(dest_rels) >= 1 + + # Verify target points to bf + bf_path = gen_energyml_object_path(bf, epc.export_version) + assert any(bf_path in r.target for r in dest_rels) + + def test_feature_has_source_relationship_from_interpretation(self, sample_objects): + """Test that feature has SOURCE_OBJECT relationship from interpretation.""" + epc = Epc() + bf = sample_objects["bf"] + horizon_interp = sample_objects["horizon_interp"] + + epc.add_object(bf) + epc.add_object(horizon_interp) + + rels_dict = epc.compute_rels() + + # Get feature rels path + bf_path = gen_energyml_object_path(bf, epc.export_version) + bf_rels_path = f"_rels/{bf_path}.rels" + + assert bf_rels_path in rels_dict + bf_rels = rels_dict[bf_rels_path] + + # Check for SOURCE_OBJECT relationship from interpretation + source_rels = [r for r in bf_rels.relationship if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT)] + assert 
len(source_rels) >= 1 + + # Verify source points to horizon_interp + interp_path = gen_energyml_object_path(horizon_interp, epc.export_version) + assert any(interp_path in r.target for r in source_rels) + + def test_external_part_reference_relationships(self, sample_objects): + """Test external part reference has EXTERNAL_PART_PROXY_TO_ML to trset20.""" + epc = Epc() + external_ref = sample_objects["external_ref"] + trset20 = sample_objects["trset20"] + horizon_interp = sample_objects["horizon_interp"] + bf = sample_objects["bf"] + + epc.add_object(bf) + epc.add_object(horizon_interp) + epc.add_object(external_ref) + epc.add_object(trset20) + + rels_dict = epc.compute_rels() + + # Get external_ref rels path + external_ref_path = gen_energyml_object_path(external_ref, epc.export_version) + external_ref_rels_path = f"_rels/{external_ref_path}.rels" + + assert external_ref_rels_path in rels_dict + external_ref_rels = rels_dict[external_ref_rels_path] + + # Check for EXTERNAL_PART_PROXY_TO_ML relationship + proxy_to_ml_rels = [ + r + for r in external_ref_rels.relationship + if r.type_value == str(EPCRelsRelationshipType.EXTERNAL_PART_PROXY_TO_ML) + ] + assert len(proxy_to_ml_rels) >= 1 + + # Verify target points to trset20 + trset20_path = gen_energyml_object_path(trset20, epc.export_version) + assert any(trset20_path in r.target for r in proxy_to_ml_rels) + + def test_external_data_array_part_rels_detection(self, sample_objects): + """Test that ExternalDataArrayPart relationships are detected.""" + from energyml.opc.opc import TargetMode + + epc = Epc() + trset22 = sample_objects["trset"] + horizon_interp = sample_objects["horizon_interp"] + + epc.add_object(horizon_interp) + epc.add_object(trset22) + + trset22_external_rels = [ + r for r in epc.get_obj_rels(trset22) if r.type_value == str(EPCRelsRelationshipType.EXTERNAL_RESOURCE) + ] + assert len(trset22_external_rels) == 1 + assert trset22_external_rels[0].target == CST_H5_PATH + assert 
trset22_external_rels[0].target_mode == TargetMode.EXTERNAL + + def test_trset20_has_ml_to_external_part_proxy_relationship(self, sample_objects): + """Test that trset20 has ML_TO_EXTERNAL_PART_PROXY relationship to external_ref.""" + epc = Epc() + external_ref = sample_objects["external_ref"] + trset20 = sample_objects["trset20"] + horizon_interp = sample_objects["horizon_interp"] + bf = sample_objects["bf"] + + epc.add_object(bf) + epc.add_object(horizon_interp) + epc.add_object(external_ref) + epc.add_object(trset20) + + rels_dict = epc.compute_rels() + + # Get trset20 rels path + trset20_path = gen_energyml_object_path(trset20, epc.export_version) + trset20_rels_path = f"_rels/{trset20_path}.rels" + + assert trset20_rels_path in rels_dict + trset20_rels = rels_dict[trset20_rels_path] + + # Check for ML_TO_EXTERNAL_PART_PROXY relationship + ml_to_proxy_rels = [ + r + for r in trset20_rels.relationship + if r.type_value == str(EPCRelsRelationshipType.ML_TO_EXTERNAL_PART_PROXY) + ] + assert len(ml_to_proxy_rels) >= 1 + + # Verify target points to external_ref + external_ref_path = gen_energyml_object_path(external_ref, epc.export_version) + assert any(external_ref_path in r.target for r in ml_to_proxy_rels) + + def test_complete_relationship_chain(self, sample_objects): + """Test complete relationship chain: trset -> interp -> feature.""" + epc = Epc() + bf = sample_objects["bf"] + horizon_interp = sample_objects["horizon_interp"] + trset = sample_objects["trset"] + + epc.add_object(bf) + epc.add_object(horizon_interp) + epc.add_object(trset) + + rels_dict = epc.compute_rels() + + # Verify all three objects have relationship files + trset_path = gen_energyml_object_path(trset, epc.export_version) + interp_path = gen_energyml_object_path(horizon_interp, epc.export_version) + bf_path = gen_energyml_object_path(bf, epc.export_version) + + assert f"_rels/{trset_path}.rels" in rels_dict + assert f"_rels/{interp_path}.rels" in rels_dict + assert f"_rels/{bf_path}.rels" 
in rels_dict + + def test_get_obj_rels_after_compute(self, sample_objects): + """Test get_obj_rels after explicit compute_rels call.""" + epc = Epc() + bf = sample_objects["bf"] + bfi = sample_objects["bfi"] + + epc.add_object(bf) + epc.add_object(bfi) + + # Compute rels explicitly + epc.compute_rels() + + # Now we can get rels + bfi_rels = epc.get_obj_rels(bfi) + assert bfi_rels is not None + + def test_relationships_in_exported_file(self, temp_epc_file, sample_objects): + """Test that relationships are correctly written to exported file.""" + epc = Epc() + bf = sample_objects["bf"] + bfi = sample_objects["bfi"] + + epc.add_object(bf) + epc.add_object(bfi) + epc.export_file(temp_epc_file) + + # Reload and check relationships + epc2 = Epc.read_file(temp_epc_file) + + # After reload, relationships are stored in additional_rels + assert len(epc2) == 2 + + def test_relationships_in_exported_file_parallel(self, temp_epc_file, sample_objects): + """Test that relationships are correctly written to exported file.""" + epc = Epc() + bf = sample_objects["bf"] + bfi = sample_objects["bfi"] + + epc.add_object(bf) + epc.add_object(bfi) + epc.export_file(temp_epc_file) + + # Reload and check relationships + epc2 = Epc.read_file(temp_epc_file, read_parallel=True) + + # After reload, relationships are stored in additional_rels + assert len(epc2) == 2 + + +class TestDORCreation: + """Test DataObjectReference creation.""" + + def test_as_dor_from_object(self, sample_objects): + """Test creating DOR from energyml object.""" + bf = sample_objects["bf"] + dor = as_dor(bf) + + assert dor.uuid == bf.uuid + assert dor.title == bf.citation.title + assert dor.qualified_type == get_qualified_type_from_class(bf) + + def test_as_dor_v20_from_object(self, sample_objects): + """Test creating v2.0 DOR from energyml object.""" + bf = sample_objects["bf"] + dor = as_dor(bf, "eml20.DataObjectReference") + + assert isinstance(dor, DataObjectReference201) + assert dor.uuid == bf.uuid + assert 
dor.content_type == get_content_type_from_class(bf) + + def test_as_dor_from_dor(self): + """Test creating DOR from another DOR.""" + dor_correct20 = DataObjectReference201( + uuid="25773477-ffee-4cc2-867d-000000000001", + title="a DOR title", + content_type="application/x-resqml+xml;version=2.2;type=BoundaryFeature", + version_string="1.0", + ) + + dor_23 = as_dor(dor_correct20, "eml23.DataObjectReference") + assert dor_23.uuid == dor_correct20.uuid + assert dor_23.title == dor_correct20.title + assert isinstance(dor_23, DataObjectReference) + + def test_as_dor_from_uri(self): + """Test creating DOR from URI string.""" + uri_str = "eml:///resqml22.TriangulatedSetRepresentation(0a2ba9e1-1018-4bfd-8fec-1c8cef13fa52)" + + dor_20 = as_dor(uri_str, "eml20.DataObjectReference") + assert dor_20.uuid == "0a2ba9e1-1018-4bfd-8fec-1c8cef13fa52" + assert dor_20.content_type == "application/x-resqml+xml;version=2.2;type=TriangulatedSetRepresentation" + + dor_23 = as_dor(uri_str, "eml23.DataObjectReference") + assert dor_23.uuid == "0a2ba9e1-1018-4bfd-8fec-1c8cef13fa52" + assert dor_23.qualified_type == "resqml22.TriangulatedSetRepresentation" + + +class TestFilePathGeneration: + """Test file path generation for objects.""" + + def test_gen_energyml_object_path_classic(self, sample_objects): + """Test path generation with CLASSIC export version.""" + trset = sample_objects["trset"] + + path = gen_energyml_object_path(trset, EpcExportVersion.CLASSIC) + assert path == f"TriangulatedSetRepresentation_{trset.uuid}.xml" + + def test_gen_energyml_object_path_expanded(self, sample_objects): + """Test path generation with EXPANDED export version.""" + trset = sample_objects["trset"] + + path = gen_energyml_object_path(trset, EpcExportVersion.EXPANDED) + expected = f"namespace_resqml22/version_{trset.object_version}/TriangulatedSetRepresentation_{trset.uuid}.xml" + assert path == expected + + def test_gen_energyml_object_path_no_version(self, sample_objects): + """Test path generation 
for object without explicit version.""" + bf = sample_objects["bf"] + + # For objects with object_version + path = gen_energyml_object_path(bf, EpcExportVersion.CLASSIC) + assert path == f"BoundaryFeature_{bf.uuid}.xml" + + +class TestHDF5Operations: + """Test HDF5 array operations.""" + + def test_write_and_read_array(self, temp_epc_file, sample_objects): + """Test writing and reading arrays.""" + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".h5") as f: + h5_path = f.name + + try: + epc = Epc(force_h5_path=h5_path) + trset = sample_objects["trset"] + + epc.add_object(trset) + + # Write array + test_array = np.arange(20).reshape((4, 5)) + success = epc.write_array(trset, "/test_dataset", test_array) + assert success + + # Read array back + read_array = epc.read_array(trset, "/test_dataset") + assert read_array is not None + assert np.array_equal(read_array, test_array) + + finally: + import time + + time.sleep(0.1) + if os.path.exists(h5_path): + try: + os.unlink(h5_path) + except PermissionError: + pass + + def test_write_array_creates_h5_rel(self, sample_objects): + """Test that writing array creates proper H5 relationship.""" + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".h5") as f: + h5_path = f.name + + try: + epc = Epc(force_h5_path=h5_path) + trset = sample_objects["trset"] + + epc.add_object(trset) + + test_array = np.array([1, 2, 3, 4, 5]) + epc.write_array(trset, "/dataset", test_array) + + # Check H5 file paths + h5_paths = epc.get_h5_file_paths(trset) + assert len(h5_paths) > 0 + + finally: + import time + + time.sleep(0.1) + if os.path.exists(h5_path): + try: + os.unlink(h5_path) + except PermissionError: + pass + + def test_multiple_arrays(self, sample_objects): + """Test writing multiple arrays.""" + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".h5") as f: + h5_path = f.name + + try: + epc = Epc(force_h5_path=h5_path) + trset = sample_objects["trset"] + + epc.add_object(trset) + + array1 = 
np.arange(10) + array2 = np.arange(20).reshape((4, 5)) + array3 = np.arange(12).reshape((3, 4)) + + epc.write_array(trset, "/array1", array1) + epc.write_array(trset, "/array2", array2) + epc.write_array(trset, "/array3", array3) + + # Read them back + assert np.array_equal(epc.read_array(trset, "/array1"), array1) + assert np.array_equal(epc.read_array(trset, "/array2"), array2) + assert np.array_equal(epc.read_array(trset, "/array3"), array3) + + finally: + import time + + time.sleep(0.1) + if os.path.exists(h5_path): + try: + os.unlink(h5_path) + except PermissionError: + pass + + +class TestExternalFilesHandling: + """Test handling of external files.""" + + def test_add_external_file_path(self): + """Test adding external file paths.""" + epc = Epc() + + epc.external_files_path.append("/path/to/external/file.h5") + epc.external_files_path.append("/path/to/another/file.h5") + + assert len(epc.external_files_path) == 2 + assert "/path/to/external/file.h5" in epc.external_files_path + + def test_force_h5_path(self): + """Test force_h5_path parameter.""" + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".h5") as f: + h5_path = f.name + + try: + epc = Epc(force_h5_path=h5_path) + assert epc.force_h5_path == h5_path + + finally: + if os.path.exists(h5_path): + try: + os.unlink(h5_path) + except PermissionError: + pass + + +class TestExportVersions: + """Test different export versions.""" + + def test_classic_export_version(self, temp_epc_file, sample_objects): + """Test export with CLASSIC version.""" + epc = Epc(export_version=EpcExportVersion.CLASSIC) + epc.add_object(sample_objects["bf"]) + + epc.export_file(temp_epc_file) + + assert os.path.exists(temp_epc_file) + + # Reload and verify + epc2 = Epc.read_file(temp_epc_file) + assert len(epc2) == 1 + + def test_expanded_export_version(self, temp_epc_file, sample_objects): + """Test export with EXPANDED version.""" + epc = Epc(export_version=EpcExportVersion.EXPANDED) + 
epc.add_object(sample_objects["bf"]) + epc.add_object(sample_objects["bfi"]) + + epc.export_file(temp_epc_file) + + assert os.path.exists(temp_epc_file) + + # Reload and verify + epc2 = Epc.read_file(temp_epc_file) + assert len(epc2) == 2 + + +class TestAdditionalRels: + """Test additional relationships handling.""" + + def test_add_rels_for_object(self, sample_objects): + """Test adding additional relationships for an object.""" + from energyml.opc.opc import Relationship + + epc = Epc() + bf = sample_objects["bf"] + epc.add_object(bf) + + identifier = get_obj_uri(bf) + + # Add external resource relationship + h5_rel = Relationship( + target="data/external.h5", + type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), + id=f"_external_{identifier}", + ) + + epc.add_rels_for_object(identifier, [h5_rel]) + + assert identifier in epc._rels_cache._supplemental_rels + assert len(epc._rels_cache._supplemental_rels[identifier]) == 1 + + def test_get_h5_file_paths(self, sample_objects): + """Test retrieving H5 file paths from relationships.""" + from energyml.opc.opc import Relationship + + epc = Epc() + trset = sample_objects["trset"] + epc.add_object(trset) + + identifier = get_obj_identifier(trset) + + # Add H5 relationships + h5_rel = Relationship( + target="data/geometry.h5", + type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), + id=f"_external_{identifier}_1", + ) + + epc.add_rels_for_object(identifier, [h5_rel]) + + h5_paths = epc.get_h5_file_paths(trset) + assert "data/geometry.h5" in h5_paths + + # Test persistence of additional relationships through export and reload + def test_additional_rels_persistence(self, temp_epc_file, sample_objects): + """Test that additional relationships persist through export and reload.""" + from energyml.opc.opc import Relationship + + epc = Epc() + bf = sample_objects["bf"] + epc.add_object(bf) + + identifier = get_obj_uri(bf) + + # Add external resource relationship + h5_rel = Relationship( + target="data/external.h5", 
+ type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), + id=f"_external_{identifier}", + ) + + epc.add_rels_for_object(identifier, [h5_rel]) + + # Export + epc.export_file(temp_epc_file) + + # Reload + epc2 = Epc.read_file(temp_epc_file) + + # Check that additional rels are still present + assert identifier in epc2._rels_cache._supplemental_rels + assert len(epc2._rels_cache._supplemental_rels[identifier]) == 1 + assert epc2._rels_cache._supplemental_rels[identifier][0].target == "data/external.h5" + + def test_additional_rels_persistence_parallel(self, temp_epc_file, sample_objects): + """Test that additional relationships persist through export and reload with parallel reading.""" + from energyml.opc.opc import Relationship + + epc = Epc() + bf = sample_objects["bf"] + epc.add_object(bf) + + identifier = get_obj_uri(bf) + + # Add external resource relationship + h5_rel = Relationship( + target="data/external.h5", + type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), + id=f"_external_{identifier}", + ) + + epc.add_rels_for_object(identifier, [h5_rel]) + + # Export + epc.export_file(temp_epc_file) + + # Reload with parallel reading + epc2 = Epc.read_file(temp_epc_file, read_parallel=True) + + # Check that additional rels are still present + assert identifier in epc2._rels_cache._supplemental_rels + assert len(epc2._rels_cache._supplemental_rels[identifier]) == 1 + assert epc2._rels_cache._supplemental_rels[identifier][0].target == "data/external.h5" + + +class TestEdgeCases: + """Test edge cases and error handling.""" + + def test_empty_epc(self): + """Test operations on empty Epc.""" + epc = Epc() + + assert len(epc) == 0 + assert len(epc.energyml_objects) == 0 + + # Should be able to export empty epc + io = epc.export_io() + assert io is not None + + def test_remove_nonexistent_object(self): + """Test removing non-existent object.""" + epc = Epc() + + # Should not raise error + epc.remove_object("nonexistent-uuid.0") + assert len(epc) == 0 + + def 
test_get_nonexistent_object(self): + """Test getting non-existent object.""" + epc = Epc() + + result = epc.get_object_by_identifier("nonexistent-uuid.0") + assert result is None + + results = epc.get_object_by_uuid("nonexistent-uuid") + assert len(results) == 0 + + def test_duplicate_add(self, sample_objects): + """Test adding the same object multiple times.""" + epc = Epc() + bf = sample_objects["bf"] + + epc.add_object(bf) + epc.add_object(bf) # Add same object again + + # Behavior: object appears only once in the list + assert len(epc.energyml_objects) >= 1 + + +class TestListObjects: + """Test list_objects functionality.""" + + def test_list_objects(self, sample_objects): + """Test listing objects.""" + epc = Epc() + + epc.add_object(sample_objects["bf"]) + epc.add_object(sample_objects["bfi"]) + epc.add_object(sample_objects["trset"]) + + objects_list = epc.list_objects() + assert len(objects_list) == 3 + + def test_list_objects_empty(self): + """Test listing objects from empty Epc.""" + epc = Epc() + + objects_list = epc.list_objects() + assert len(objects_list) == 0 + + +class TestCoreProperties: + """Test core properties handling.""" + + def test_core_props_creation(self, temp_epc_file, sample_objects): + """Test that core properties are created during export.""" + epc = Epc() + epc.add_object(sample_objects["bf"]) + + epc.export_file(temp_epc_file) + + # Verify core props exist after export + assert epc.core_props is not None + + def test_custom_core_props(self, temp_epc_file, sample_objects): + """Test setting custom core properties.""" + from energyml.opc.opc import CoreProperties, Creator + + core_props = CoreProperties( + creator=Creator(any_element="Test Creator"), + ) + + epc = Epc(core_props=core_props) + epc.add_object(sample_objects["bf"]) + + epc.export_file(temp_epc_file) + + # Reload and verify + epc2 = Epc.read_file(temp_epc_file) + assert epc2.core_props is not None + + def test_custom_core_props_parallel(self, temp_epc_file, 
sample_objects): + """Test setting custom core properties.""" + from energyml.opc.opc import CoreProperties, Creator + + core_props = CoreProperties( + creator=Creator(any_element="Test Creator"), + ) + + epc = Epc(core_props=core_props) + epc.add_object(sample_objects["bf"]) + + epc.export_file(temp_epc_file) + + # Reload and verify + epc2 = Epc.read_file(temp_epc_file, read_parallel=True) + assert epc2.core_props is not None + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/energyml-utils/tests/test_epc_rels_cache.py b/energyml-utils/tests/test_epc_rels_cache.py new file mode 100644 index 0000000..51fcaae --- /dev/null +++ b/energyml-utils/tests/test_epc_rels_cache.py @@ -0,0 +1,929 @@ +# Copyright (c) 2023-2024 Geosiris. +# SPDX-License-Identifier: Apache-2.0 +""" +Comprehensive unit tests for EpcRelsCache class functionality. + +Tests cover: +1. Basic relationship computation with forward and reverse relationships +2. Incremental updates (update_cache_for_object) +3. Late-arrival scenario (object referenced before it exists) +4. Parallel vs sequential computation +5. Reverse index functionality +6. Validation (duplicates, orphaned references) +7. Object removal from cache +8. 
Cache stats and debugging utilities +""" + +import pytest +from typing import Set + +from energyml.eml.v2_0.commonv2 import Citation as Citation20, EpcExternalPartReference +from energyml.eml.v2_3.commonv2 import Citation +from energyml.resqml.v2_0_1.resqmlv2 import ( + TriangulatedSetRepresentation as TriangulatedSetRepresentation20, + TrianglePatch as TrianglePatch20, + PointGeometry as PointGeometry20, +) +from energyml.resqml.v2_2.resqmlv2 import ( + BoundaryFeature, + HorizonInterpretation, +) + +from energyml.utils.epc import ( + EpcRelsCache, + EnergymlObjectCollection, + EpcRelsCacheErrorPolicy, + as_dor, + EpcExportVersion, +) +from energyml.utils.epc_utils import gen_energyml_object_path, gen_energyml_object_path, relationships_equal +from energyml.utils.introspection import ( + epoch_to_date, + epoch, + get_obj_uri, +) +from energyml.utils.constants import EPCRelsRelationshipType + + +@pytest.fixture +def sample_objects(): + """Create sample EnergyML objects for testing.""" + # Create a BoundaryFeature + bf = BoundaryFeature( + citation=Citation( + title="Test Boundary Feature", + originator="Test", + creation=epoch_to_date(epoch()), + ), + uuid="25773477-ffee-4cc2-867d-000000000001", + object_version="1.0", + ) + + # Create a HorizonInterpretation + horizon_interp = HorizonInterpretation( + citation=Citation( + title="Test HorizonInterpretation", + originator="Test", + creation=epoch_to_date(epoch()), + ), + interpreted_feature=as_dor(bf), + uuid="25773477-ffee-4cc2-867d-000000000003", + object_version="1.0", + ) + + # EpcExternalPartReference + external_ref = EpcExternalPartReference( + uuid="25773477-ffee-4cc2-867d-000000000005", + citation=Citation20(title="An external reference", originator="Test", creation=epoch_to_date(epoch())), + ) + + # TriangulatedSetRepresentation (2.0.1) with references + trset20 = TriangulatedSetRepresentation20( + citation=Citation20( + title="Test TriangulatedSetRepresentation 2.0", originator="Test", 
creation=epoch_to_date(epoch()) + ), + uuid="25773477-ffee-4cc2-867d-000000000006", + object_version="1.0", + represented_interpretation=as_dor(horizon_interp, "eml20.DataObjectReference"), + triangle_patch=[ + TrianglePatch20(geometry=PointGeometry20(local_crs=as_dor(external_ref, "eml20.DataObjectReference"))) + ], + ) + + return { + "bf": bf, + "horizon_interp": horizon_interp, + "external_ref": external_ref, + "trset20": trset20, + } + + +class TestEpcRelsCacheBasics: + """Test basic EpcRelsCache initialization and functionality.""" + + def test_initialization_with_collection(self, sample_objects): + """Test initializing cache with EnergymlObjectCollection.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC, EpcRelsCacheErrorPolicy.LOG) + + assert cache is not None + assert len(collection) == 2 + + def test_initialization_with_error_policy_string(self, sample_objects): + """Test backward compatibility with string error policy.""" + collection = EnergymlObjectCollection([sample_objects["bf"]]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC, "log") + + assert cache._error_policy == EpcRelsCacheErrorPolicy.LOG + + def test_uri_from_any_with_object(self, sample_objects): + """Test _uri_from_any with various input types.""" + collection = EnergymlObjectCollection([sample_objects["bf"]]) + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + + bf = sample_objects["bf"] + uri = cache._uri_from_any(bf) + + assert uri == get_obj_uri(bf) + + def test_uri_from_any_with_uri(self, sample_objects): + """Test _uri_from_any with Uri object.""" + collection = EnergymlObjectCollection([sample_objects["bf"]]) + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + + bf = sample_objects["bf"] + expected_uri = get_obj_uri(bf) + + uri = cache._uri_from_any(expected_uri) + + assert uri == expected_uri + + 
+class TestRelationshipComputation: + """Test relationship computation with forward and reverse rels.""" + + def test_compute_rels_basic(self, sample_objects): + """Test basic relationship computation.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + rels_dict = cache.compute_rels() + + assert rels_dict is not None + assert len(rels_dict) >= 2 + + def test_trset_has_destination_to_horizon(self, sample_objects): + """Test that trset has DESTINATION_OBJECT rel to horizonInterp.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + collection.append(sample_objects["trset20"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + trset_uri = get_obj_uri(sample_objects["trset20"]) + trset_rels = cache.get_object_rels(trset_uri) + + # Check for DESTINATION_OBJECT relationship + dest_rels = [r for r in trset_rels if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT)] + assert len(dest_rels) >= 1 + + # Verify target is horizon_interp + horizon_path = gen_energyml_object_path(sample_objects["horizon_interp"], EpcExportVersion.CLASSIC) + assert any(horizon_path in r.target for r in dest_rels) + + def test_trset_has_ml_to_external_part_proxy(self, sample_objects): + """Test that trset has ML_TO_EXTERNAL_PART_PROXY to external_ref.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + collection.append(sample_objects["external_ref"]) + collection.append(sample_objects["trset20"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + trset_uri = get_obj_uri(sample_objects["trset20"]) + trset_rels = cache.get_object_rels(trset_uri) + + # Check for 
ML_TO_EXTERNAL_PART_PROXY relationship + ml_to_proxy_rels = [ + r for r in trset_rels if r.type_value == str(EPCRelsRelationshipType.ML_TO_EXTERNAL_PART_PROXY) + ] + assert len(ml_to_proxy_rels) >= 1 + + # Verify target is external_ref + external_path = gen_energyml_object_path(sample_objects["external_ref"], EpcExportVersion.CLASSIC) + assert any(external_path in r.target for r in ml_to_proxy_rels) + + def test_external_ref_has_proxy_to_ml(self, sample_objects): + """Test that external_ref has EXTERNAL_PART_PROXY_TO_ML to trset (reverse rel).""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + collection.append(sample_objects["external_ref"]) + collection.append(sample_objects["trset20"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + external_uri = get_obj_uri(sample_objects["external_ref"]) + external_rels = cache.get_object_rels(external_uri) + + # Check for EXTERNAL_PART_PROXY_TO_ML relationship (reverse rel) + proxy_to_ml_rels = [ + r for r in external_rels if r.type_value == str(EPCRelsRelationshipType.EXTERNAL_PART_PROXY_TO_ML) + ] + assert len(proxy_to_ml_rels) >= 1 + + # Verify target is trset20 + trset_path = gen_energyml_object_path(sample_objects["trset20"], EpcExportVersion.CLASSIC) + assert any(trset_path in r.target for r in proxy_to_ml_rels) + + def test_horizon_has_source_from_trset(self, sample_objects): + """Test that horizonInterp has SOURCE_OBJECT from trset (reverse rel).""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + collection.append(sample_objects["trset20"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + horizon_uri = get_obj_uri(sample_objects["horizon_interp"]) + horizon_rels = cache.get_object_rels(horizon_uri) + + # Check for SOURCE_OBJECT relationship (reverse rel 
from trset) + source_rels = [r for r in horizon_rels if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT)] + assert len(source_rels) >= 1 + + # Verify source is trset20 + trset_path = gen_energyml_object_path(sample_objects["trset20"], EpcExportVersion.CLASSIC) + assert any(trset_path in r.target for r in source_rels) + + def test_compute_rels_parallel(self, sample_objects): + """Test parallel relationship computation with detailed verification.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + collection.append(sample_objects["external_ref"]) + collection.append(sample_objects["trset20"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + rels_dict = cache.compute_rels(parallel=True) + + assert rels_dict is not None + assert len(rels_dict) >= 4 + + # Verify trset relationships in detail + trset_uri = get_obj_uri(sample_objects["trset20"]) + trset_rels = cache.get_object_rels(trset_uri) + assert len(trset_rels) >= 2 # At least horizon_interp and external_ref + + # Verify each relationship has correct type and target + horizon_path = gen_energyml_object_path(sample_objects["horizon_interp"], EpcExportVersion.CLASSIC) + external_path = gen_energyml_object_path(sample_objects["external_ref"], EpcExportVersion.CLASSIC) + + # Find DESTINATION_OBJECT rel to horizon_interp + dest_rels = [ + r + for r in trset_rels + if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT) and horizon_path in r.target + ] + assert len(dest_rels) == 1, f"Expected 1 DESTINATION_OBJECT to horizon, found {len(dest_rels)}" + + # Find ML_TO_EXTERNAL_PART_PROXY rel to external_ref + ml_to_proxy_rels = [ + r + for r in trset_rels + if r.type_value == str(EPCRelsRelationshipType.ML_TO_EXTERNAL_PART_PROXY) and external_path in r.target + ] + assert ( + len(ml_to_proxy_rels) == 1 + ), f"Expected 1 ML_TO_EXTERNAL_PART_PROXY to external_ref, found {len(ml_to_proxy_rels)}" + + # 
Verify horizon_interp has SOURCE_OBJECT from trset (reverse rel) + horizon_uri = get_obj_uri(sample_objects["horizon_interp"]) + horizon_rels = cache.get_object_rels(horizon_uri) + + trset_path = gen_energyml_object_path(sample_objects["trset20"], EpcExportVersion.CLASSIC) + source_rels = [ + r + for r in horizon_rels + if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT) and trset_path in r.target + ] + assert len(source_rels) >= 1, "Expected at least 1 SOURCE_OBJECT from trset to horizon" + + # Verify external_ref has EXTERNAL_PART_PROXY_TO_ML from trset (reverse rel) + external_uri = get_obj_uri(sample_objects["external_ref"]) + external_rels = cache.get_object_rels(external_uri) + + proxy_to_ml_rels = [ + r + for r in external_rels + if r.type_value == str(EPCRelsRelationshipType.EXTERNAL_PART_PROXY_TO_ML) and trset_path in r.target + ] + assert len(proxy_to_ml_rels) >= 1, "Expected at least 1 EXTERNAL_PART_PROXY_TO_ML from trset to external_ref" + + def test_compute_rels_sequential_vs_parallel(self, sample_objects): + """Test that sequential and parallel computation produce identical results.""" + collection1 = EnergymlObjectCollection() + collection2 = EnergymlObjectCollection() + + for obj in [ + sample_objects["bf"], + sample_objects["horizon_interp"], + sample_objects["external_ref"], + sample_objects["trset20"], + ]: + collection1.append(obj) + collection2.append(obj) + + cache_seq = EpcRelsCache(collection1, EpcExportVersion.CLASSIC) + cache_par = EpcRelsCache(collection2, EpcExportVersion.CLASSIC) + + rels_seq = cache_seq.compute_rels(parallel=False) + rels_par = cache_par.compute_rels(parallel=True) + + # Same number of objects should have rels + assert len(rels_seq) == len( + rels_par + ), f"Different number of objects with rels: seq={len(rels_seq)}, par={len(rels_par)}" + + # Verify each object has the same number of relationships + for obj in [ + sample_objects["bf"], + sample_objects["horizon_interp"], + sample_objects["external_ref"], + 
sample_objects["trset20"], + ]: + obj_uri = get_obj_uri(obj) + seq_rels = cache_seq.get_object_rels(obj_uri) + par_rels = cache_par.get_object_rels(obj_uri) + + assert len(seq_rels) == len(par_rels), ( + f"Object {obj_uri} has different number of rels: " f"seq={len(seq_rels)}, par={len(par_rels)}" + ) + + # Verify each relationship from parallel is present in sequential + for par_rel in par_rels: + # Check if this relationship exists in sequential results + matching_rels = [seq_rel for seq_rel in seq_rels if relationships_equal(par_rel, seq_rel)] + assert len(matching_rels) > 0, ( + f"Relationship from parallel not found in sequential: " + f"target={par_rel.target}, type={par_rel.type_value}" + ) + + # Verify each relationship from sequential is present in parallel + for seq_rel in seq_rels: + matching_rels = [par_rel for par_rel in par_rels if relationships_equal(seq_rel, par_rel)] + assert len(matching_rels) > 0, ( + f"Relationship from sequential not found in parallel: " + f"target={seq_rel.target}, type={seq_rel.type_value}" + ) + + +class TestLateArrivalScenario: + """Test the critical late-arrival scenario where object B is added after A references it.""" + + def test_add_objects_out_of_order(self, sample_objects): + """Test adding trset before horizon_interp exists, then adding horizon_interp.""" + collection = EnergymlObjectCollection() + + # Add trset FIRST (references horizon_interp which doesn't exist yet) + collection.append(sample_objects["trset20"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + # trset should have forward rel to horizon_interp + trset_uri = get_obj_uri(sample_objects["trset20"]) + trset_rels = cache.get_object_rels(trset_uri) + assert len(trset_rels) >= 2 # To horizon and external_ref + + # NOW add horizon_interp and external_ref + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + collection.append(sample_objects["external_ref"]) + + # Recompute 
+ cache.compute_rels() + + # horizon_interp should NOW have reverse rel from trset + horizon_uri = get_obj_uri(sample_objects["horizon_interp"]) + horizon_rels = cache.get_object_rels(horizon_uri) + + source_rels = [r for r in horizon_rels if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT)] + assert len(source_rels) >= 1 + + # Verify reverse rel points to trset + trset_path = gen_energyml_object_path(sample_objects["trset20"], EpcExportVersion.CLASSIC) + assert any(trset_path in r.target for r in source_rels) + + def test_incremental_update_with_late_arrival(self, sample_objects): + """Test update_cache_for_object with late-arriving referenced object.""" + collection = EnergymlObjectCollection() + + # Add trset first + collection.append(sample_objects["trset20"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + # Add horizon_interp later + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + + # Use incremental update + cache.update_cache_for_object(sample_objects["horizon_interp"]) + + # horizon_interp should have reverse rel from trset + horizon_uri = get_obj_uri(sample_objects["horizon_interp"]) + horizon_rels = cache.get_object_rels(horizon_uri) + + source_rels = [r for r in horizon_rels if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT)] + assert len(source_rels) >= 1 + + def test_multiple_order_scenarios(self, sample_objects): + """Test various object addition orders all produce correct results.""" + orders = [ + ["bf", "horizon_interp", "external_ref", "trset20"], # Normal order + ["trset20", "external_ref", "horizon_interp", "bf"], # Reverse order + ["external_ref", "trset20", "bf", "horizon_interp"], # Mixed order + ] + + for order in orders: + collection = EnergymlObjectCollection() + + for obj_name in order: + collection.append(sample_objects[obj_name]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + # Verify 
all relationships are correct regardless of order + trset_uri = get_obj_uri(sample_objects["trset20"]) + trset_rels = cache.get_object_rels(trset_uri) + assert len(trset_rels) >= 2 + + horizon_uri = get_obj_uri(sample_objects["horizon_interp"]) + horizon_rels = cache.get_object_rels(horizon_uri) + source_rels = [r for r in horizon_rels if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT)] + assert len(source_rels) >= 1 + + +class TestIncrementalUpdates: + """Test incremental cache updates.""" + + def test_update_cache_for_single_object(self, sample_objects): + """Test updating cache for a single object.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + # Add new object and update incrementally + collection.append(sample_objects["trset20"]) + cache.update_cache_for_object(sample_objects["trset20"]) + + trset_uri = get_obj_uri(sample_objects["trset20"]) + trset_rels = cache.get_object_rels(trset_uri) + + assert len(trset_rels) >= 1 # At least horizon_interp + + def test_update_propagates_reverse_rels(self, sample_objects): + """Test that updating an object propagates reverse rels to targets.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + # Get initial horizon rels count + horizon_uri = get_obj_uri(sample_objects["horizon_interp"]) + initial_rels = cache.get_object_rels(horizon_uri) + initial_count = len(initial_rels) + + # Add trset and update + collection.append(sample_objects["trset20"]) + cache.update_cache_for_object(sample_objects["trset20"]) + + # horizon should now have reverse rel from trset + updated_rels = cache.get_object_rels(horizon_uri) + assert len(updated_rels) > initial_count + + def 
test_clear_cache(self, sample_objects): + """Test clearing the cache.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + # Verify cache has data + horizon_uri = get_obj_uri(sample_objects["horizon_interp"]) + assert len(cache.get_object_rels(horizon_uri)) > 0 + + # Clear cache + cache.clear_cache() + + # Cache should be empty + assert len(cache.get_object_rels(horizon_uri)) == 0 + + def test_recompute_cache(self, sample_objects): + """Test recomputing the entire cache.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + # Add new object + collection.append(sample_objects["trset20"]) + + # Recompute entire cache + cache.recompute_cache() + + trset_uri = get_obj_uri(sample_objects["trset20"]) + trset_rels = cache.get_object_rels(trset_uri) + assert len(trset_rels) >= 1 + + +class TestReverseIndex: + """Test reverse reference index functionality.""" + + def test_reverse_index_built_during_compute(self, sample_objects): + """Test that reverse index is built during compute_rels.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + collection.append(sample_objects["trset20"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + # Check reverse index exists + assert cache._reverse_index is not None + assert len(cache._reverse_index) > 0 + + # horizon_interp should be in reverse index (referenced by trset) + horizon_uri = get_obj_uri(sample_objects["horizon_interp"]) + assert horizon_uri in cache._reverse_index + + # trset should be in the sources for horizon + trset_uri = 
get_obj_uri(sample_objects["trset20"]) + assert trset_uri in cache._reverse_index[horizon_uri] + + def test_reverse_index_stats(self, sample_objects): + """Test get_reverse_index_stats method.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + collection.append(sample_objects["external_ref"]) + collection.append(sample_objects["trset20"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + stats = cache.get_reverse_index_stats() + + assert stats is not None + assert "total_targets" in stats + assert "total_references" in stats + assert "max_references_to_single_target" in stats + assert stats["total_targets"] > 0 + assert stats["total_references"] > 0 + + def test_reverse_index_cleared(self, sample_objects): + """Test that reverse index is cleared with cache.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + assert len(cache._reverse_index) > 0 + + cache.clear_cache() + + assert len(cache._reverse_index) == 0 + + +class TestSupplementalRels: + """Test supplemental relationships functionality.""" + + def test_add_supplemental_rels(self, sample_objects): + """Test adding supplemental relationships.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + # Create and add supplemental rel + from energyml.opc.opc import Relationship + + supplemental_rel = Relationship(target="test_target.xml", type_value="test_type", id="test_id") + + bf_uri = get_obj_uri(sample_objects["bf"]) + cache.add_supplemental_rels(sample_objects["bf"], supplemental_rel) + + # Get rels should include supplemental + rels = cache.get_object_rels(bf_uri) + assert 
any(r.target == "test_target.xml" for r in rels) + + def test_supplemental_rels_persist_across_clear(self, sample_objects): + """Test that supplemental rels persist across clear_cache.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + # Add supplemental rel + from energyml.opc.opc import Relationship + + supplemental_rel = Relationship(target="test_target.xml", type_value="test_type", id="test_id") + + cache.add_supplemental_rels(sample_objects["bf"], supplemental_rel) + + # Clear computed rels + cache.clear_cache() + + # Supplemental should still be there + bf_uri = get_obj_uri(sample_objects["bf"]) + rels = cache.get_object_rels(bf_uri) + assert any(r.target == "test_target.xml" for r in rels) + + +class TestValidation: + """Test validation functionality.""" + + def test_validate_rels_no_issues(self, sample_objects): + """Test validation with no issues.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + report = cache.validate_rels() + + assert report is not None + assert "duplicates" in report + assert "orphaned_references" in report + assert len(report["duplicates"]) == 0 + + def test_validate_detects_orphaned_references(self, sample_objects): + """Test that validation detects orphaned references.""" + collection = EnergymlObjectCollection() + # Only add trset (references horizon which doesn't exist) + collection.append(sample_objects["trset20"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + report = cache.validate_rels() + + # Should detect orphaned references to horizon_interp and external_ref + assert len(report["orphaned_references"]) > 0 + + def test_clean_rels_removes_duplicates(self, sample_objects): + """Test 
that clean_rels removes duplicate relationships.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + + # Manually add duplicate rels + bf_uri = get_obj_uri(sample_objects["bf"]) + from energyml.opc.opc import Relationship + + cache._computed_rels[bf_uri] = [ + Relationship(target="test.xml", type_value="type1", id="id1"), + Relationship(target="test.xml", type_value="type1", id="id2"), # Duplicate + ] + + # Clean should remove duplicate + cache.clean_rels(sample_objects["bf"]) + + rels = cache.get_object_rels(bf_uri) + assert len(rels) == 1 + + +class TestObjectRemoval: + """Test object removal from cache.""" + + def test_remove_object_from_cache(self, sample_objects): + """Test _remove_object_from_cache method.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + collection.append(sample_objects["trset20"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + # Verify horizon has rels + horizon_uri = get_obj_uri(sample_objects["horizon_interp"]) + assert len(cache.get_object_rels(horizon_uri)) > 0 + + # Remove horizon from cache + cache._remove_object_from_cache(sample_objects["horizon_interp"]) + + # horizon should have no rels + assert len(cache.get_object_rels(horizon_uri)) == 0 + + # Reverse index should not have horizon + assert horizon_uri not in cache._reverse_index + + def test_remove_cleans_reverse_index(self, sample_objects): + """Test that removal cleans up reverse index entries.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + collection.append(sample_objects["trset20"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + # trset should be in reverse index for horizon + horizon_uri = 
get_obj_uri(sample_objects["horizon_interp"]) + trset_uri = get_obj_uri(sample_objects["trset20"]) + assert trset_uri in cache._reverse_index.get(horizon_uri, set()) + + # Remove trset + cache._remove_object_from_cache(sample_objects["trset20"]) + + # trset should no longer be in reverse index for horizon + if horizon_uri in cache._reverse_index: + assert trset_uri not in cache._reverse_index[horizon_uri] + + +class TestErrorHandling: + """Test error handling with different policies.""" + + def test_error_policy_log(self, sample_objects): + """Test LOG error policy (should not raise).""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC, EpcRelsCacheErrorPolicy.LOG) + + # This should not raise even with invalid input + try: + cache._handle_error("Test error") + # Should not raise + except Exception: # not a bare except: KeyboardInterrupt/SystemExit must still propagate + pytest.fail("LOG policy should not raise exceptions") + + def test_error_policy_raise(self, sample_objects): + """Test RAISE error policy (should raise).""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC, EpcRelsCacheErrorPolicy.RAISE) + + # This should raise + with pytest.raises(RuntimeError): + cache._handle_error("Test error") + + def test_error_policy_skip(self, sample_objects): + """Test SKIP error policy (should do nothing).""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC, EpcRelsCacheErrorPolicy.SKIP) + + # This should do nothing + try: + cache._handle_error("Test error") + # Should not raise + except Exception: # not a bare except: KeyboardInterrupt/SystemExit must still propagate + pytest.fail("SKIP policy should not raise exceptions") + + +class TestDeduplication: + """Test relationship deduplication.""" + + def test_deduplicate_rels(self, sample_objects): + """Test _deduplicate_rels method.""" + collection = EnergymlObjectCollection() +
collection.append(sample_objects["bf"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + + from energyml.opc.opc import Relationship + + rels = [ + Relationship(target="test1.xml", type_value="type1", id="id1"), + Relationship(target="test1.xml", type_value="type1", id="id2"), # Duplicate + Relationship(target="test2.xml", type_value="type2", id="id3"), + ] + + deduped = cache._deduplicate_rels(rels) + + assert len(deduped) == 2 # Only 2 unique rels + + def test_get_object_rels_returns_deduplicated(self, sample_objects): + """Test that get_object_rels returns deduplicated results.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + horizon_uri = get_obj_uri(sample_objects["horizon_interp"]) + rels = cache.get_object_rels(horizon_uri) + + # Check no duplicates + seen = set() + for rel in rels: + key = (rel.target, rel.type_value) + assert key not in seen, "Found duplicate relationship" + seen.add(key) + + +class TestComplexScenarios: + """Test complex real-world scenarios.""" + + def test_full_workflow(self, sample_objects): + """Test complete workflow: add, compute, update, validate.""" + collection = EnergymlObjectCollection() + + # Step 1: Add initial objects + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + + # Step 2: Compute rels + cache.compute_rels() + + # Step 3: Add more objects incrementally + collection.append(sample_objects["external_ref"]) + cache.update_cache_for_object(sample_objects["external_ref"]) + + collection.append(sample_objects["trset20"]) + cache.update_cache_for_object(sample_objects["trset20"]) + + # Step 4: Validate + report = cache.validate_rels() + assert len(report["duplicates"]) == 0 + + # Step 5: Get stats + stats = 
cache.get_reverse_index_stats() + assert stats["total_targets"] > 0 + + # Step 6: Verify all relationships are correct + trset_uri = get_obj_uri(sample_objects["trset20"]) + trset_rels = cache.get_object_rels(trset_uri) + assert len(trset_rels) >= 2 + + def test_parallel_then_incremental(self, sample_objects): + """Test parallel compute followed by incremental updates.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + collection.append(sample_objects["horizon_interp"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + + # Parallel compute + cache.compute_rels(parallel=True) + + # Add object and update incrementally + collection.append(sample_objects["trset20"]) + cache.update_cache_for_object(sample_objects["trset20"]) + + # Verify correct + trset_uri = get_obj_uri(sample_objects["trset20"]) + horizon_uri = get_obj_uri(sample_objects["horizon_interp"]) + + trset_rels = cache.get_object_rels(trset_uri) + horizon_rels = cache.get_object_rels(horizon_uri) + + assert len(trset_rels) >= 1 + assert len(horizon_rels) >= 1 + + def test_recompute_after_many_updates(self, sample_objects): + """Test full recompute after many incremental updates.""" + collection = EnergymlObjectCollection() + collection.append(sample_objects["bf"]) + + cache = EpcRelsCache(collection, EpcExportVersion.CLASSIC) + cache.compute_rels() + + # Many incremental updates + collection.append(sample_objects["horizon_interp"]) + cache.update_cache_for_object(sample_objects["horizon_interp"]) + + collection.append(sample_objects["external_ref"]) + cache.update_cache_for_object(sample_objects["external_ref"]) + + collection.append(sample_objects["trset20"]) + cache.update_cache_for_object(sample_objects["trset20"]) + + # Full recompute + cache.recompute_cache(parallel=False) + + # Verify still correct + report = cache.validate_rels() + assert len(report["duplicates"]) == 0 + + stats = cache.get_reverse_index_stats() + assert stats["total_targets"] > 0 
diff --git a/energyml-utils/tests/test_epc_stream.py b/energyml-utils/tests/test_epc_stream.py index f22824c..2233de3 100644 --- a/energyml-utils/tests/test_epc_stream.py +++ b/energyml-utils/tests/test_epc_stream.py @@ -12,13 +12,18 @@ """ import os import tempfile -import zipfile -from pathlib import Path +from energyml.utils.epc_utils import gen_energyml_object_path import pytest import numpy as np -from energyml.eml.v2_3.commonv2 import Citation, DataObjectReference +from energyml.eml.v2_0.commonv2 import Citation as Citation20, EpcExternalPartReference +from energyml.eml.v2_3.commonv2 import Citation +from energyml.resqml.v2_0_1.resqmlv2 import ( + TriangulatedSetRepresentation as TriangulatedSetRepresentation20, + TrianglePatch as TrianglePatch20, + PointGeometry as PointGeometry20, +) from energyml.resqml.v2_2.resqmlv2 import ( TriangulatedSetRepresentation, BoundaryFeatureInterpretation, @@ -28,7 +33,7 @@ from energyml.opc.opc import Relationships from energyml.utils.epc_stream import EpcStreamReader, RelsUpdateMode -from energyml.utils.epc import create_energyml_object, as_dor, get_obj_identifier +from energyml.utils.epc_utils import as_dor, get_obj_identifier from energyml.utils.introspection import ( epoch_to_date, epoch, @@ -65,7 +70,7 @@ def sample_objects(): originator="Test", creation=epoch_to_date(epoch()), ), - uuid=gen_uuid(), + uuid="25773477-ffee-4cc2-867d-000000000001", object_version="1.0", ) @@ -76,11 +81,24 @@ def sample_objects(): originator="Test", creation=epoch_to_date(epoch()), ), - uuid=gen_uuid(), + uuid="25773477-ffee-4cc2-867d-000000000002", object_version="1.0", interpreted_feature=as_dor(bf), ) + # Create a HorizonInterpretation (independent object) + horizon_interp = HorizonInterpretation( + citation=Citation( + title="Test HorizonInterpretation", + originator="Test", + creation=epoch_to_date(epoch()), + ), + interpreted_feature=as_dor(bf), + uuid="25773477-ffee-4cc2-867d-000000000003", + object_version="1.0", + # domain="depth", 
+ ) + # Create a TriangulatedSetRepresentation trset = TriangulatedSetRepresentation( citation=Citation( @@ -88,27 +106,42 @@ def sample_objects(): originator="Test", creation=epoch_to_date(epoch()), ), - uuid=gen_uuid(), + uuid="25773477-ffee-4cc2-867d-000000000004", object_version="1.0", - represented_object=as_dor(bfi), + represented_object=as_dor(horizon_interp), ) - # Create a HorizonInterpretation (independent object) - horizon_interp = HorizonInterpretation( - citation=Citation( - title="Test HorizonInterpretation", + # Create an EpcExternalPartReference (RESQML 2.0.1) + external_ref = EpcExternalPartReference( + uuid="25773477-ffee-4cc2-867d-000000000005", + citation=Citation20( + title="An external reference", + originator="Test", + creation=epoch_to_date(epoch()), + ), + ) + + # Create a TriangulatedSetRepresentation 2.0.1 that references the external part + trset20 = TriangulatedSetRepresentation20( + citation=Citation20( + title="Test TriangulatedSetRepresentation 2.0", originator="Test", creation=epoch_to_date(epoch()), ), - uuid=gen_uuid(), + uuid="25773477-ffee-4cc2-867d-000000000006", object_version="1.0", - domain="depth", + represented_interpretation=as_dor(horizon_interp, "eml20.DataObjectReference"), + triangle_patch=[ + TrianglePatch20(geometry=PointGeometry20(local_crs=as_dor(external_ref, "eml20.DataObjectReference"))) + ], ) return { "bf": bf, "bfi": bfi, "trset": trset, + "trset20": trset20, + "external_ref": external_ref, "horizon_interp": horizon_interp, } @@ -139,21 +172,25 @@ def test_manual_mode_no_auto_rebuild(self, temp_epc_file, sample_objects): # Basic rels should exist (from _add_object_to_file) bfi_rels = reader2.get_obj_rels(get_obj_identifier(bfi)) - assert len(bfi_rels) > 0 # Should have SOURCE rels + assert len(bfi_rels) == 0, "Expected no relationships in MANUAL mode without explicit rebuild" reader2.close() - def test_update_on_close_mode(self, temp_epc_file, sample_objects): - """Test that UPDATE_ON_CLOSE mode rebuilds 
rels on close.""" - reader = EpcStreamReader(temp_epc_file, rels_update_mode=RelsUpdateMode.UPDATE_ON_CLOSE) + def test_update_on_close_mode_sequential(self, temp_epc_file, sample_objects): + """Test that UPDATE_ON_CLOSE mode rebuilds rels on close (sequential processing).""" + reader = EpcStreamReader( + temp_epc_file, rels_update_mode=RelsUpdateMode.UPDATE_ON_CLOSE, enable_parallel_rels=False + ) bf = sample_objects["bf"] bfi = sample_objects["bfi"] + horizon_interp = sample_objects["horizon_interp"] trset = sample_objects["trset"] - # Add objects + # Add objects (including horizon_interp that trset references) reader.add_object(bf) reader.add_object(bfi) + reader.add_object(horizon_interp) reader.add_object(trset) # Before closing, rels may not be complete @@ -162,15 +199,190 @@ def test_update_on_close_mode(self, temp_epc_file, sample_objects): # Reopen and verify relationships were built reader2 = EpcStreamReader(temp_epc_file) - # Check that bfi has a SOURCE relationship to bf + # Check that bfi has a DEST relationship to bf + bfi_rels = reader2.get_obj_rels(get_obj_identifier(bfi)) + dest_rels = [r for r in bfi_rels if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT)] + assert len(dest_rels) == 1, "Expected DESTINATION relationship from bfi to bf" + + # Check that bf has SOURCE relationships from bfi and horizon_interp + bf_rels = reader2.get_obj_rels(get_obj_identifier(bf)) + source_rels = [r for r in bf_rels if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT)] + assert len(source_rels) == 2, "Expected 2 SOURCE relationships in bf rels (from bfi and horizon_interp)" + + # Check that horizon_interp has a SOURCE relationship from trset + hi_rels = reader2.get_obj_rels(get_obj_identifier(horizon_interp)) + hi_source_rels = [r for r in hi_rels if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT)] + assert len(hi_source_rels) == 1, "Expected SOURCE relationship in horizon_interp rels from trset" + + # Check that trset has a 
DESTINATION relationship to horizon_interp + trset_rels = reader2.get_obj_rels(get_obj_identifier(trset)) + dest_rels = [r for r in trset_rels if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT)] + assert len(dest_rels) == 1, "Expected DESTINATION relationship in trset rels targeting horizon_interp" + + # Close to release file handles (important on Windows) + reader2.close() + + def test_update_on_close_mode_parallel(self, temp_epc_file, sample_objects): + """Test that UPDATE_ON_CLOSE mode rebuilds rels on close (parallel processing).""" + reader = EpcStreamReader( + temp_epc_file, rels_update_mode=RelsUpdateMode.UPDATE_ON_CLOSE, enable_parallel_rels=True + ) + + bf = sample_objects["bf"] + bfi = sample_objects["bfi"] + horizon_interp = sample_objects["horizon_interp"] + trset = sample_objects["trset"] + + # Add objects (including horizon_interp that trset references) + reader.add_object(bf) + reader.add_object(bfi) + reader.add_object(horizon_interp) + reader.add_object(trset) + + # Before closing, rels may not be complete + reader.close() + + # Reopen and verify relationships were built + reader2 = EpcStreamReader(temp_epc_file) + + # Check that bfi has a DEST relationship to bf + bfi_rels = reader2.get_obj_rels(get_obj_identifier(bfi)) + dest_rels = [r for r in bfi_rels if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT)] + assert len(dest_rels) == 1, "Expected DESTINATION relationship from bfi to bf" + + # Check that bf has SOURCE relationships from bfi and horizon_interp + bf_rels = reader2.get_obj_rels(get_obj_identifier(bf)) + source_rels = [r for r in bf_rels if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT)] + assert len(source_rels) == 2, "Expected 2 SOURCE relationships in bf rels (from bfi and horizon_interp)" + + # Check that horizon_interp has a SOURCE relationship from trset + hi_rels = reader2.get_obj_rels(get_obj_identifier(horizon_interp)) + hi_source_rels = [r for r in hi_rels if r.type_value == 
str(EPCRelsRelationshipType.SOURCE_OBJECT)] + assert len(hi_source_rels) == 1, "Expected SOURCE relationship in horizon_interp rels from trset" + + # Check that trset has a DESTINATION relationship to horizon_interp + trset_rels = reader2.get_obj_rels(get_obj_identifier(trset)) + dest_rels = [r for r in trset_rels if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT)] + assert len(dest_rels) == 1, "Expected DESTINATION relationship in trset rels targeting horizon_interp" + + # Close to release file handles (important on Windows) + reader2.close() + + def test_update_on_close_mode_metadata_before_close_sequential(self, temp_epc_file, sample_objects): + """Test that UPDATE_ON_CLOSE mode updates metadata immediately but delays rels until close (sequential).""" + reader = EpcStreamReader( + temp_epc_file, rels_update_mode=RelsUpdateMode.UPDATE_ON_CLOSE, enable_parallel_rels=False + ) + + bf = sample_objects["bf"] + bfi = sample_objects["bfi"] + horizon_interp = sample_objects["horizon_interp"] + trset = sample_objects["trset"] + + # Add objects + reader.add_object(bf) + reader.add_object(bfi) + reader.add_object(horizon_interp) + reader.add_object(trset) + + # BEFORE closing, verify metadata is updated + objects_list = reader.list_objects() + assert len(objects_list) == 4, "Expected 4 objects in metadata before close" + assert len(reader) == 4, "Expected length of reader to be 4 before close" + + # BEFORE closing, verify NO relationships exist yet (UPDATE_ON_CLOSE behavior) + bfi_rels = reader.get_obj_rels(get_obj_identifier(bfi)) + assert len(bfi_rels) == 0, "Expected no relationships before close in UPDATE_ON_CLOSE mode" + + bf_rels = reader.get_obj_rels(get_obj_identifier(bf)) + assert len(bf_rels) == 0, "Expected no relationships before close in UPDATE_ON_CLOSE mode" + + hi_rels = reader.get_obj_rels(get_obj_identifier(horizon_interp)) + assert len(hi_rels) == 0, "Expected no relationships before close in UPDATE_ON_CLOSE mode" + + trset_rels = 
reader.get_obj_rels(get_obj_identifier(trset)) + assert len(trset_rels) == 0, "Expected no relationships before close in UPDATE_ON_CLOSE mode" + + # Now close to trigger rels rebuild + reader.close() + + # Reopen and verify relationships were built AFTER close + reader2 = EpcStreamReader(temp_epc_file) + + # Verify metadata is still correct + assert len(reader2) == 4, "Expected 4 objects after reopen" + + # Verify relationships NOW exist bfi_rels = reader2.get_obj_rels(get_obj_identifier(bfi)) - source_rels = [r for r in bfi_rels if r.type_value == EPCRelsRelationshipType.SOURCE_OBJECT.get_type()] - assert len(source_rels) >= 1, "Expected SOURCE relationship from bfi to bf" + assert len(bfi_rels) == 1, "Expected relationships after close in UPDATE_ON_CLOSE mode" - # Check that bf has a DESTINATION relationship from bfi bf_rels = reader2.get_obj_rels(get_obj_identifier(bf)) - dest_rels = [r for r in bf_rels if r.type_value == EPCRelsRelationshipType.DESTINATION_OBJECT.get_type()] - assert len(dest_rels) >= 1, "Expected DESTINATION relationship from bfi to bf" + assert len(bf_rels) == 2, "Expected relationships after close in UPDATE_ON_CLOSE mode" + + hi_rels = reader2.get_obj_rels(get_obj_identifier(horizon_interp)) + assert len(hi_rels) == 2, "Expected relationships after close in UPDATE_ON_CLOSE mode" + + trset_rels = reader2.get_obj_rels(get_obj_identifier(trset)) + assert len(trset_rels) == 1, "Expected relationships after close in UPDATE_ON_CLOSE mode" + + reader2.close() + + def test_update_on_close_mode_metadata_before_close_parallel(self, temp_epc_file, sample_objects): + """Test that UPDATE_ON_CLOSE mode updates metadata immediately but delays rels until close (parallel).""" + reader = EpcStreamReader( + temp_epc_file, rels_update_mode=RelsUpdateMode.UPDATE_ON_CLOSE, enable_parallel_rels=True + ) + + bf = sample_objects["bf"] + bfi = sample_objects["bfi"] + horizon_interp = sample_objects["horizon_interp"] + trset = sample_objects["trset"] + + # Add objects 
+ reader.add_object(bf) + reader.add_object(bfi) + reader.add_object(horizon_interp) + reader.add_object(trset) + + # BEFORE closing, verify metadata is updated + objects_list = reader.list_objects() + assert len(objects_list) == 4, "Expected 4 objects in metadata before close" + assert len(reader) == 4, "Expected length of reader to be 4 before close" + + # BEFORE closing, verify NO relationships exist yet (UPDATE_ON_CLOSE behavior) + bfi_rels = reader.get_obj_rels(get_obj_identifier(bfi)) + assert len(bfi_rels) == 0, "Expected no relationships before close in UPDATE_ON_CLOSE mode" + + bf_rels = reader.get_obj_rels(get_obj_identifier(bf)) + assert len(bf_rels) == 0, "Expected no relationships before close in UPDATE_ON_CLOSE mode" + + hi_rels = reader.get_obj_rels(get_obj_identifier(horizon_interp)) + assert len(hi_rels) == 0, "Expected no relationships before close in UPDATE_ON_CLOSE mode" + + trset_rels = reader.get_obj_rels(get_obj_identifier(trset)) + assert len(trset_rels) == 0, "Expected no relationships before close in UPDATE_ON_CLOSE mode" + + # Now close to trigger rels rebuild + reader.close() + + # Reopen and verify relationships were built AFTER close + reader2 = EpcStreamReader(temp_epc_file) + + # Verify metadata is still correct + assert len(reader2) == 4, "Expected 4 objects after reopen" + + # Verify relationships NOW exist + bfi_rels = reader2.get_obj_rels(get_obj_identifier(bfi)) + assert len(bfi_rels) == 1, "Expected relationships after close in UPDATE_ON_CLOSE mode" + + bf_rels = reader2.get_obj_rels(get_obj_identifier(bf)) + assert len(bf_rels) == 2, "Expected relationships after close in UPDATE_ON_CLOSE mode" + + hi_rels = reader2.get_obj_rels(get_obj_identifier(horizon_interp)) + assert len(hi_rels) == 2, "Expected relationships after close in UPDATE_ON_CLOSE mode" + + trset_rels = reader2.get_obj_rels(get_obj_identifier(trset)) + assert len(trset_rels) == 1, "Expected relationships after close in UPDATE_ON_CLOSE mode" reader2.close() @@ 
-187,12 +399,37 @@ def test_update_at_modification_mode_add(self, temp_epc_file, sample_objects): # Check relationships immediately (without closing) bfi_rels = reader.get_obj_rels(get_obj_identifier(bfi)) - source_rels = [r for r in bfi_rels if r.type_value == EPCRelsRelationshipType.SOURCE_OBJECT.get_type()] - assert len(source_rels) >= 1, "Expected immediate SOURCE relationship from bfi to bf" + source_rels = [r for r in bfi_rels if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT)] + assert len(source_rels) == 0, "Expected no SOURCE relationships in bfi rels since bf does not refers to bfi" + + dest_rels = [r for r in bfi_rels if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT)] + assert len(dest_rels) >= 1, f"Expected immediate DESTINATION relationship from bfi to bf {bfi_rels}" bf_rels = reader.get_obj_rels(get_obj_identifier(bf)) - dest_rels = [r for r in bf_rels if r.type_value == EPCRelsRelationshipType.DESTINATION_OBJECT.get_type()] - assert len(dest_rels) >= 1, "Expected immediate DESTINATION relationship from bfi to bf" + source_rels = [r for r in bf_rels if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT)] + assert len(source_rels) >= 1, f"Expected immediate SOURCE relationship from bfi to bf {bf_rels}" + + reader.close() + + def test_update_at_modification_mode_add_reversed_order(self, temp_epc_file, sample_objects): + """Test that UPDATE_AT_MODIFICATION mode updates rels immediately on add even if objects are added in reversed order.""" + reader = EpcStreamReader(temp_epc_file, rels_update_mode=RelsUpdateMode.UPDATE_AT_MODIFICATION) + + bf = sample_objects["bf"] + bfi = sample_objects["bfi"] + + # Add objects in reversed order to test that relationships are created even if the interpreted feature is added after the interpretation + reader.add_object(bfi) + reader.add_object(bf) + + # Check relationships immediately (without closing) + bfi_rels = reader.get_obj_rels(get_obj_identifier(bfi)) + dest_rels = [r for r in 
bfi_rels if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT)] + assert len(dest_rels) >= 1, "Expected immediate DESTINATION relationship in bfi rels targeting bf" + + bf_rels = reader.get_obj_rels(get_obj_identifier(bf)) + source_rels = [r for r in bf_rels if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT)] + assert len(source_rels) >= 1, "Expected immediate SOURCE relationship in bf rels targeting bfi" reader.close() @@ -209,14 +446,14 @@ def test_update_at_modification_mode_remove(self, temp_epc_file, sample_objects) # Verify relationships exist bf_rels_before = reader.get_obj_rels(get_obj_identifier(bf)) - assert len(bf_rels_before) > 0, "Expected relationships before removal" + assert len(bf_rels_before) == 1, "Expected relationships before removal" # Remove bfi reader.remove_object(get_obj_identifier(bfi)) # Check that bf's rels no longer has references to bfi bf_rels_after = reader.get_obj_rels(get_obj_identifier(bf)) - bfi_refs = [r for r in bf_rels_after if get_obj_identifier(bfi) in r.id] + bfi_refs = [r for r in bf_rels_after if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT)] assert len(bfi_refs) == 0, "Expected no references to removed object" reader.close() @@ -256,23 +493,25 @@ def test_update_at_modification_mode_update(self, temp_epc_file, sample_objects) reader.update_object(bfi_modified) - # Check that bf no longer has DESTINATION relationship from bfi + # Check that bf no longer has SOURCE relationship from bfi bf_rels = reader.get_obj_rels(get_obj_identifier(bf)) - bfi_dest_rels = [ + bfi_source_rels = [ r for r in bf_rels - if r.type_value == EPCRelsRelationshipType.DESTINATION_OBJECT.get_type() and get_obj_identifier(bfi) in r.id + if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT) + and gen_energyml_object_path(bfi, reader.export_version) in r.target ] - assert len(bfi_dest_rels) == 0, "Expected old DESTINATION relationship to be removed" + assert len(bfi_source_rels) == 0, "Expected old SOURCE 
relationship to be removed" - # Check that bf2 now has DESTINATION relationship from bfi + # Check that bf2 now has SOURCE relationship from bfi bf2_rels = reader.get_obj_rels(get_obj_identifier(bf2)) - bfi_dest_rels2 = [ + bfi_source_rels2 = [ r for r in bf2_rels - if r.type_value == EPCRelsRelationshipType.DESTINATION_OBJECT.get_type() and get_obj_identifier(bfi) in r.id + if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT) + and gen_energyml_object_path(bfi, reader.export_version) in r.target ] - assert len(bfi_dest_rels2) >= 1, "Expected new DESTINATION relationship to be added" + assert len(bfi_source_rels2) >= 1, "Expected new SOURCE relationship to be added" reader.close() @@ -369,21 +608,23 @@ def test_bidirectional_relationships(self, temp_epc_file, sample_objects): # Check bfi -> bf (SOURCE) bfi_rels = reader.get_obj_rels(get_obj_identifier(bfi)) - bfi_source_to_bf = [ + bfi_dest_to_bf = [ r for r in bfi_rels - if r.type_value == EPCRelsRelationshipType.SOURCE_OBJECT.get_type() and get_obj_identifier(bf) in r.id + if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT) + and gen_energyml_object_path(bf, reader.export_version) in r.target ] - assert len(bfi_source_to_bf) >= 1 + assert len(bfi_dest_to_bf) >= 1 # Check bf -> bfi (DESTINATION) bf_rels = reader.get_obj_rels(get_obj_identifier(bf)) - bf_dest_from_bfi = [ + bf_source_from_bfi = [ r for r in bf_rels - if r.type_value == EPCRelsRelationshipType.DESTINATION_OBJECT.get_type() and get_obj_identifier(bfi) in r.id + if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT) + and gen_energyml_object_path(bfi, reader.export_version) in r.target ] - assert len(bf_dest_from_bfi) >= 1 + assert len(bf_source_from_bfi) >= 1 reader.close() @@ -392,35 +633,38 @@ def test_cascade_relationships(self, temp_epc_file, sample_objects): reader = EpcStreamReader(temp_epc_file, rels_update_mode=RelsUpdateMode.UPDATE_AT_MODIFICATION) bf = sample_objects["bf"] - bfi = sample_objects["bfi"] + hi = 
sample_objects["horizon_interp"] trset = sample_objects["trset"] reader.add_object(bf) - reader.add_object(bfi) + reader.add_object(hi) reader.add_object(trset) - # Check trset -> bfi - trset_rels = reader.get_obj_rels(get_obj_identifier(trset)) - trset_to_bfi = [ + # Check trset -> hi + trset_rels = reader.get_obj_rels(trset) + assert len(trset_rels) == 1, "Expected relationships in trset rels" + hi_dest_rels = [ r for r in trset_rels - if r.type_value == EPCRelsRelationshipType.SOURCE_OBJECT.get_type() and get_obj_identifier(bfi) in r.id + if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT) + and gen_energyml_object_path(hi, reader.export_version) in r.target ] - assert len(trset_to_bfi) >= 1 + assert len(hi_dest_rels) == 1, "Expected DESTINATION relationship from trset to hi" - # Check bfi -> bf - bfi_rels = reader.get_obj_rels(get_obj_identifier(bfi)) - bfi_to_bf = [ + # Check hi -> bf + hi_rels = reader.get_obj_rels(hi) + hi_to_bf = [ r - for r in bfi_rels - if r.type_value == EPCRelsRelationshipType.SOURCE_OBJECT.get_type() and get_obj_identifier(bf) in r.id + for r in hi_rels + if r.type_value == str(EPCRelsRelationshipType.DESTINATION_OBJECT) + and gen_energyml_object_path(bf, reader.export_version) in r.target ] - assert len(bfi_to_bf) >= 1 + assert len(hi_to_bf) == 1, "Expected DESTINATION relationship from hi to bf" - # Check bf has 2 DESTINATION relationships (from bfi and indirectly from trset) - bf_rels = reader.get_obj_rels(get_obj_identifier(bf)) - bf_dest_rels = [r for r in bf_rels if r.type_value == EPCRelsRelationshipType.DESTINATION_OBJECT.get_type()] - assert len(bf_dest_rels) >= 1 + # Check bf has 1 SOURCE relationships (from hi and indirectly from trset) + bf_rels = reader.get_obj_rels(bf) + bf_source_rels = [r for r in bf_rels if r.type_value == str(EPCRelsRelationshipType.SOURCE_OBJECT)] + assert len(bf_source_rels) == 1, "Expected 1 SOURCE relationship in bf rels targeting hi" reader.close() @@ -445,6 +689,106 @@ def 
test_independent_objects_no_rels(self, temp_epc_file, sample_objects): reader.close() + def test_external_part_reference_relationships(self, temp_epc_file, sample_objects): + """Test external part reference has EXTERNAL_PART_PROXY_TO_ML to trset20.""" + reader = EpcStreamReader(temp_epc_file, rels_update_mode=RelsUpdateMode.UPDATE_AT_MODIFICATION) + + external_ref = sample_objects["external_ref"] + trset20 = sample_objects["trset20"] + horizon_interp = sample_objects["horizon_interp"] + bf = sample_objects["bf"] + + reader.add_object(bf) + reader.add_object(horizon_interp) + reader.add_object(external_ref) + reader.add_object(trset20) + + # Get external_ref rels + external_ref_rels = reader.get_obj_rels(get_obj_identifier(external_ref)) + + # Check for EXTERNAL_PART_PROXY_TO_ML relationship + proxy_to_ml_rels = [ + r for r in external_ref_rels if r.type_value == str(EPCRelsRelationshipType.EXTERNAL_PART_PROXY_TO_ML) + ] + assert len(proxy_to_ml_rels) >= 1, "Expected EXTERNAL_PART_PROXY_TO_ML relationship from external_ref" + + # Verify target points to trset20 + trset20_path = gen_energyml_object_path(trset20, reader.export_version) + assert any( + trset20_path in r.target for r in proxy_to_ml_rels + ), "Expected relationship target to point to trset20" + + reader.close() + + def test_trset20_has_ml_to_external_part_proxy_relationship(self, temp_epc_file, sample_objects): + """Test that trset20 has ML_TO_EXTERNAL_PART_PROXY relationship to external_ref.""" + reader = EpcStreamReader(temp_epc_file, rels_update_mode=RelsUpdateMode.UPDATE_AT_MODIFICATION) + + external_ref = sample_objects["external_ref"] + trset20 = sample_objects["trset20"] + horizon_interp = sample_objects["horizon_interp"] + bf = sample_objects["bf"] + + reader.add_object(bf) + reader.add_object(horizon_interp) + reader.add_object(external_ref) + reader.add_object(trset20) + + # Get trset20 rels + trset20_rels = reader.get_obj_rels(get_obj_identifier(trset20)) + + # Check for 
ML_TO_EXTERNAL_PART_PROXY relationship + ml_to_proxy_rels = [ + r for r in trset20_rels if r.type_value == str(EPCRelsRelationshipType.ML_TO_EXTERNAL_PART_PROXY) + ] + assert len(ml_to_proxy_rels) >= 1, "Expected ML_TO_EXTERNAL_PART_PROXY relationship from trset20" + + # Verify target points to external_ref + external_ref_path = gen_energyml_object_path(external_ref, reader.export_version) + assert any( + external_ref_path in r.target for r in ml_to_proxy_rels + ), "Expected relationship target to point to external_ref" + + reader.close() + + def test_complete_external_ref_bidirectional_relationships(self, temp_epc_file, sample_objects): + """Test complete bidirectional relationships between trset20 and external_ref.""" + reader = EpcStreamReader(temp_epc_file, rels_update_mode=RelsUpdateMode.UPDATE_AT_MODIFICATION) + + external_ref = sample_objects["external_ref"] + trset20 = sample_objects["trset20"] + horizon_interp = sample_objects["horizon_interp"] + bf = sample_objects["bf"] + + reader.add_object(bf) + reader.add_object(horizon_interp) + reader.add_object(external_ref) + reader.add_object(trset20) + + # Check trset20 -> external_ref (ML_TO_EXTERNAL_PART_PROXY) + trset20_rels = reader.get_obj_rels(get_obj_identifier(trset20)) + external_ref_path = gen_energyml_object_path(external_ref, reader.export_version) + + ml_to_proxy = [ + r + for r in trset20_rels + if r.type_value == str(EPCRelsRelationshipType.ML_TO_EXTERNAL_PART_PROXY) and external_ref_path in r.target + ] + assert len(ml_to_proxy) >= 1, "Expected ML_TO_EXTERNAL_PART_PROXY from trset20 to external_ref" + + # Check external_ref -> trset20 (EXTERNAL_PART_PROXY_TO_ML) + external_ref_rels = reader.get_obj_rels(get_obj_identifier(external_ref)) + trset20_path = gen_energyml_object_path(trset20, reader.export_version) + + proxy_to_ml = [ + r + for r in external_ref_rels + if r.type_value == str(EPCRelsRelationshipType.EXTERNAL_PART_PROXY_TO_ML) and trset20_path in r.target + ] + assert len(proxy_to_ml) >= 
1, "Expected EXTERNAL_PART_PROXY_TO_ML from external_ref to trset20" + + reader.close() + class TestCachingAndPerformance: """Test caching functionality and performance optimizations.""" @@ -482,39 +826,40 @@ def test_metadata_access_without_loading(self, temp_epc_file, sample_objects): reader.close() # Reopen and access metadata - reader2 = EpcStreamReader(temp_epc_file, preload_metadata=True) + reader2 = EpcStreamReader(temp_epc_file) # Check that we can list objects without loading them - metadata_list = reader2.list_object_metadata() + metadata_list = reader2.list_objects() assert len(metadata_list) == 2 assert reader2.stats.loaded_objects == 0, "Expected no objects loaded when accessing metadata" reader2.close() - def test_lazy_loading(self, temp_epc_file, sample_objects): - """Test that objects are loaded on-demand.""" - reader = EpcStreamReader(temp_epc_file) + # ==> no lazy loading for now + # def test_lazy_loading(self, temp_epc_file, sample_objects): + # """Test that objects are loaded on-demand.""" + # reader = EpcStreamReader(temp_epc_file) - bf = sample_objects["bf"] - bfi = sample_objects["bfi"] - trset = sample_objects["trset"] + # bf = sample_objects["bf"] + # hi = sample_objects["horizon_interp"] + # trset = sample_objects["trset"] - reader.add_object(bf) - reader.add_object(bfi) - reader.add_object(trset) + # reader.add_object(bf) + # reader.add_object(hi) + # reader.add_object(trset) - reader.close() + # reader.close() - # Reopen - reader2 = EpcStreamReader(temp_epc_file) - assert len(reader2) == 3 - assert reader2.stats.loaded_objects == 0, "Expected no objects loaded initially" + # # Reopen + # reader2 = EpcStreamReader(temp_epc_file) + # assert len(reader2) == 3 + # assert reader2.stats.loaded_objects == 0, "Expected no objects loaded initially" - # Load one object - reader2.get_object_by_identifier(get_obj_identifier(bf)) - assert reader2.stats.loaded_objects == 1, "Expected exactly 1 object loaded" + # # Load one object + # 
reader2.get_object_by_identifier(get_obj_identifier(bf)) + # assert reader2.stats.loaded_objects == 1, "Expected exactly 1 object loaded" - reader2.close() + # reader2.close() class TestHelperMethods: @@ -528,7 +873,7 @@ def test_gen_rels_path_from_metadata(self, temp_epc_file, sample_objects): identifier = reader.add_object(bf) metadata = reader._metadata[identifier] - rels_path = reader._gen_rels_path_from_metadata(metadata) + rels_path = reader._metadata_mgr.gen_rels_path_from_metadata(metadata) assert rels_path is not None assert "_rels/" in rels_path @@ -543,7 +888,7 @@ def test_gen_rels_path_from_identifier(self, temp_epc_file, sample_objects): bf = sample_objects["bf"] identifier = reader.add_object(bf) - rels_path = reader._gen_rels_path_from_identifier(identifier) + rels_path = reader._metadata_mgr.gen_rels_path_from_identifier(identifier) assert rels_path is not None assert "_rels/" in rels_path @@ -559,10 +904,10 @@ def test_set_rels_update_mode(self, temp_epc_file): """Test changing the relationship update mode.""" reader = EpcStreamReader(temp_epc_file, rels_update_mode=RelsUpdateMode.MANUAL) - assert reader.get_rels_update_mode() == RelsUpdateMode.MANUAL + assert reader.rels_update_mode == RelsUpdateMode.MANUAL - reader.set_rels_update_mode(RelsUpdateMode.UPDATE_AT_MODIFICATION) - assert reader.get_rels_update_mode() == RelsUpdateMode.UPDATE_AT_MODIFICATION + reader.rels_update_mode = RelsUpdateMode.UPDATE_AT_MODIFICATION + assert reader.rels_update_mode == RelsUpdateMode.UPDATE_AT_MODIFICATION reader.close() @@ -571,7 +916,7 @@ def test_invalid_mode_raises_error(self, temp_epc_file): reader = EpcStreamReader(temp_epc_file) with pytest.raises(ValueError): - reader.set_rels_update_mode("invalid_mode") + reader.rels_update_mode = "invalid_mode" reader.close() @@ -588,23 +933,12 @@ def test_remove_nonexistent_object(self, temp_epc_file): reader.close() - def test_update_nonexistent_object(self, temp_epc_file, sample_objects): - """Test updating an object 
that doesn't exist.""" - reader = EpcStreamReader(temp_epc_file) - - bf = sample_objects["bf"] - - with pytest.raises(ValueError): - reader.update_object(bf) - - reader.close() - def test_empty_epc_operations(self, temp_epc_file): """Test operations on empty EPC.""" reader = EpcStreamReader(temp_epc_file) assert len(reader) == 0 - assert len(reader.list_object_metadata()) == 0 + assert len(reader.list_objects()) == 0 reader.close() @@ -706,13 +1040,13 @@ def test_external_resource_preserved_on_object_update(self, temp_epc_file, sampl h5_rel = Relationship( target="data/test_data.h5", - type_value=EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(), + type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), id=f"_external_{identifier}_h5", ) - reader.add_rels_for_object(identifier, [h5_rel], write_immediately=True) + reader.add_rels_for_object(identifier, [h5_rel]) # Verify the HDF5 path is returned - h5_paths_before = reader.get_h5_file_paths(identifier) + h5_paths_before = reader.get_h5_file_paths(identifier, False) assert "data/test_data.h5" in h5_paths_before # Update the object (modify its title) @@ -720,12 +1054,12 @@ def test_external_resource_preserved_on_object_update(self, temp_epc_file, sampl reader.update_object(trset) # Verify EXTERNAL_RESOURCE relationship is still present - h5_paths_after = reader.get_h5_file_paths(identifier) + h5_paths_after = reader.get_h5_file_paths(identifier, False) assert "data/test_data.h5" in h5_paths_after, "EXTERNAL_RESOURCE relationship was lost after update" # Also verify by checking rels directly rels = reader.get_obj_rels(identifier) - external_rels = [r for r in rels if r.type_value == EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type()] + external_rels = [r for r in rels if r.type_value == str(EPCRelsRelationshipType.EXTERNAL_RESOURCE)] assert len(external_rels) > 0, "EXTERNAL_RESOURCE relationship not found in rels" assert any("test_data.h5" in r.target for r in external_rels) @@ -744,13 +1078,13 @@ def 
test_external_resource_preserved_when_referenced_by_other(self, temp_epc_fil h5_rel = Relationship( target="data/boundary_data.h5", - type_value=EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(), + type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), id=f"_external_{bf_id}_h5", ) - reader.add_rels_for_object(bf_id, [h5_rel], write_immediately=True) + reader.add_rels_for_object(bf_id, [h5_rel]) # Verify initial state - h5_paths_initial = reader.get_h5_file_paths(bf_id) + h5_paths_initial = reader.get_h5_file_paths(bf_id, False) assert "data/boundary_data.h5" in h5_paths_initial # Add BoundaryFeatureInterpretation that references the BoundaryFeature @@ -759,12 +1093,12 @@ def test_external_resource_preserved_when_referenced_by_other(self, temp_epc_fil reader.add_object(bfi) # Verify EXTERNAL_RESOURCE is still present after adding referencing object - h5_paths_after = reader.get_h5_file_paths(bf_id) + h5_paths_after = reader.get_h5_file_paths(bf_id, False) assert "data/boundary_data.h5" in h5_paths_after, "EXTERNAL_RESOURCE lost after adding referencing object" # Verify rels directly rels = reader.get_obj_rels(bf_id) - external_rels = [r for r in rels if r.type_value == EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type()] + external_rels = [r for r in rels if r.type_value == str(EPCRelsRelationshipType.EXTERNAL_RESOURCE)] assert len(external_rels) > 0 assert any("boundary_data.h5" in r.target for r in external_rels) @@ -783,10 +1117,10 @@ def test_external_resource_preserved_update_on_close_mode(self, temp_epc_file, s h5_rel = Relationship( target="data/test_data.h5", - type_value=EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(), + type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), id=f"_external_{identifier}_h5", ) - reader.add_rels_for_object(identifier, [h5_rel], write_immediately=True) + reader.add_rels_for_object(identifier, [h5_rel]) # Update object trset.citation.title = "Modified in UPDATE_ON_CLOSE mode" @@ -797,7 +1131,7 @@ def 
test_external_resource_preserved_update_on_close_mode(self, temp_epc_file, s # Reopen and verify reader2 = EpcStreamReader(temp_epc_file) - h5_paths = reader2.get_h5_file_paths(identifier) + h5_paths = reader2.get_h5_file_paths(identifier, False) assert "data/test_data.h5" in h5_paths, "EXTERNAL_RESOURCE lost after close in UPDATE_ON_CLOSE mode" reader2.close() @@ -815,24 +1149,24 @@ def test_multiple_external_resources_preserved(self, temp_epc_file, sample_objec h5_rels = [ Relationship( target="data/geometry.h5", - type_value=EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(), + type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), id=f"_external_{identifier}_geometry", ), Relationship( target="data/properties.h5", - type_value=EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(), + type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), id=f"_external_{identifier}_properties", ), Relationship( target="data/metadata.h5", - type_value=EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(), + type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), id=f"_external_{identifier}_metadata", ), ] - reader.add_rels_for_object(identifier, h5_rels, write_immediately=True) + reader.add_rels_for_object(identifier, h5_rels) # Verify all are present - h5_paths_before = reader.get_h5_file_paths(identifier) + h5_paths_before = reader.get_h5_file_paths(identifier, False) assert "data/geometry.h5" in h5_paths_before assert "data/properties.h5" in h5_paths_before assert "data/metadata.h5" in h5_paths_before @@ -842,7 +1176,7 @@ def test_multiple_external_resources_preserved(self, temp_epc_file, sample_objec reader.update_object(trset) # Verify all EXTERNAL_RESOURCE relationships are still present - h5_paths_after = reader.get_h5_file_paths(identifier) + h5_paths_after = reader.get_h5_file_paths(identifier, False) assert "data/geometry.h5" in h5_paths_after assert "data/properties.h5" in h5_paths_after assert "data/metadata.h5" in h5_paths_after @@ -868,13 +1202,13 @@ def 
test_external_resource_preserved_cascade_updates(self, temp_epc_file, sample h5_rel = Relationship( target="data/bf_data.h5", - type_value=EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(), + type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), id=f"_external_{bf_id}_h5", ) - reader.add_rels_for_object(bf_id, [h5_rel], write_immediately=True) + reader.add_rels_for_object(bf_id, [h5_rel]) # Verify initial state - h5_paths = reader.get_h5_file_paths(bf_id) + h5_paths = reader.get_h5_file_paths(bf_id, False) assert "data/bf_data.h5" in h5_paths # Update intermediate object (bfi) @@ -886,7 +1220,7 @@ def test_external_resource_preserved_cascade_updates(self, temp_epc_file, sample reader.update_object(trset) # Verify EXTERNAL_RESOURCE still present after cascade of updates - h5_paths_final = reader.get_h5_file_paths(bf_id) + h5_paths_final = reader.get_h5_file_paths(bf_id, False) assert "data/bf_data.h5" in h5_paths_final, "EXTERNAL_RESOURCE lost after cascade updates" reader.close() @@ -907,13 +1241,13 @@ def test_external_resource_with_object_removal(self, temp_epc_file, sample_objec h5_rel = Relationship( target="data/bfi_data.h5", - type_value=EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type(), + type_value=str(EPCRelsRelationshipType.EXTERNAL_RESOURCE), id=f"_external_{bfi_id}_h5", ) - reader.add_rels_for_object(bfi_id, [h5_rel], write_immediately=True) + reader.add_rels_for_object(bfi_id, [h5_rel]) # Verify it exists - h5_paths = reader.get_h5_file_paths(bfi_id) + h5_paths = reader.get_h5_file_paths(bfi_id, False) assert "data/bfi_data.h5" in h5_paths # Remove bf (which bfi references) @@ -924,7 +1258,7 @@ def test_external_resource_with_object_removal(self, temp_epc_file, sample_objec reader.update_object(bfi) # Verify EXTERNAL_RESOURCE is still there - h5_paths_after = reader.get_h5_file_paths(bfi_id) + h5_paths_after = reader.get_h5_file_paths(bfi_id, False) assert "data/bfi_data.h5" in h5_paths_after, "EXTERNAL_RESOURCE lost after referenced object 
removal" reader.close() diff --git a/energyml-utils/tests/test_epc_utils.py b/energyml-utils/tests/test_epc_utils.py new file mode 100644 index 0000000..05c88f1 --- /dev/null +++ b/energyml-utils/tests/test_epc_utils.py @@ -0,0 +1,630 @@ +# Copyright (c) 2023-2024 Geosiris. +# SPDX-License-Identifier: Apache-2.0 + +""" +Comprehensive unit tests for epc_utils module. +Excludes EPC structure validation and property kind functions as per requirements. +""" + +import pytest +from pathlib import Path + +from energyml.utils.epc_utils import ( + gen_core_props_rels_path, + gen_core_props_path, + gen_energyml_object_path, + gen_rels_path, + gen_rels_path_from_obj_path, + get_epc_content_type_path, + extract_uuid_and_version_from_obj_path, + create_h5_external_relationship, + create_default_core_properties, + create_default_types, + match_external_proxy_type, + get_rels_dor_type, + as_dor, + create_energyml_object, + create_external_part_reference, + get_reverse_dor_list, + get_file_folder_and_name_from_path, +) +from energyml.utils.constants import EpcExportVersion, EPCRelsRelationshipType, MimeType, gen_uuid +from energyml.opc.opc import Relationship, TargetMode, CoreProperties, Types +from energyml.utils.uri import Uri, parse_uri +from energyml.utils.introspection import get_obj_uuid, get_obj_version +from energyml.eml.v2_3.commonv2 import Citation, DataObjectReference +from energyml.resqml.v2_2.resqmlv2 import ( + TriangulatedSetRepresentation, + TrianglePatch, + PointGeometry, + Point3DExternalArray, +) +from energyml.resqml.v2_0_1.resqmlv2 import ObjTriangulatedSetRepresentation as TriangulatedSetRepresentation201 + + +# ============================================================================= +# TEST FIXTURES - Reusable test data +# ============================================================================= + +TEST_UUID = "12345678-1234-1234-1234-123456789abc" +TEST_UUID_2 = "abcd5678-90ef-1234-5678-abcdef123456" + + +@pytest.fixture +def sample_citation(): + 
"""Create a sample Citation object for testing.""" + return Citation( + title="Test Object", + originator="Test Originator", + creation="2024-01-01T00:00:00Z", + format="energyml-utils test", + ) + + +@pytest.fixture +def sample_triangulated_set_22(sample_citation): + """Create a sample TriangulatedSetRepresentation (RESQML 2.2) for testing.""" + obj = TriangulatedSetRepresentation( + uuid=TEST_UUID, + citation=sample_citation, + schema_version="2.2", + ) + return obj + + +@pytest.fixture +def sample_triangulated_set_201(): + """Create a sample TriangulatedSetRepresentation (RESQML 2.0.1) for testing.""" + citation = Citation( + title="Test Object 201", + originator="Test", + creation="2024-01-01T00:00:00Z", + ) + obj = TriangulatedSetRepresentation201( + uuid=TEST_UUID_2, + citation=citation, + schema_version="2.0.1", + ) + return obj + + +# ============================================================================= +# TEST CLASSES +# ============================================================================= + + +class TestPathGenerationFunctions: + """Test suite for EPC path generation utility functions.""" + + def test_gen_core_props_rels_path(self): + """Test generation of core properties rels file path.""" + result = gen_core_props_rels_path() + assert isinstance(result, str) + assert result == "docProps/_rels/core.xml.rels" + + def test_gen_core_props_path_classic(self): + """Test core properties path generation for classic export.""" + result = gen_core_props_path(EpcExportVersion.CLASSIC) + assert result == "docProps/core.xml" + + def test_gen_core_props_path_expanded(self): + """Test core properties path generation for expanded export.""" + result = gen_core_props_path(EpcExportVersion.EXPANDED) + assert result == "docProps/core.xml" + + def test_gen_core_props_path_default(self): + """Test core properties path generation with default export version.""" + result = gen_core_props_path() + assert result == "docProps/core.xml" + + def 
test_get_epc_content_type_path(self): + """Test content types file path generation.""" + result = get_epc_content_type_path() + assert result == "[Content_Types].xml" + + def test_gen_rels_path_from_obj_path_simple(self): + """Test rels path generation from simple object path.""" + obj_path = "ObjType_12345678-1234-1234-1234-123456789abc.xml" + result = gen_rels_path_from_obj_path(obj_path) + assert result == "_rels/ObjType_12345678-1234-1234-1234-123456789abc.xml.rels" + + def test_gen_rels_path_from_obj_path_with_folder(self): + """Test rels path generation from path with folders.""" + obj_path = "namespace_resqml22/version_1.0/Grid2dRepresentation_abc-123.xml" + result = gen_rels_path_from_obj_path(obj_path) + assert result == "namespace_resqml22/version_1.0/_rels/Grid2dRepresentation_abc-123.xml.rels" + + def test_gen_rels_path_from_obj_path_with_path_object(self): + """Test rels path generation with Path object input.""" + obj_path = Path("folder/subfolder/Object_uuid.xml") + result = gen_rels_path_from_obj_path(obj_path) + assert result == "folder/subfolder/_rels/Object_uuid.xml.rels" + + def test_gen_rels_path_from_obj_path_raises_error_for_rels_folder(self): + """Test that error is raised when object path is in _rels folder.""" + obj_path = "_rels/Object_uuid.xml.rels" + with pytest.raises(ValueError, match="cannot be in the '_rels' folder"): + gen_rels_path_from_obj_path(obj_path) + + def test_extract_uuid_and_version_from_obj_path_simple(self): + """Test UUID and version extraction from simple path.""" + obj_path = "Grid2dRepresentation_12345678-1234-1234-1234-123456789abc.xml" + uuid, version = extract_uuid_and_version_from_obj_path(obj_path) + assert uuid == "12345678-1234-1234-1234-123456789abc" + assert version is None + + def test_extract_uuid_and_version_from_obj_path_with_version(self): + """Test UUID and version extraction from versioned path.""" + obj_path = "namespace_resqml22/version_2.5/Grid_abcd1234-5678-90ab-cdef-123456789012.xml" + uuid, 
version = extract_uuid_and_version_from_obj_path(obj_path) + assert uuid == "abcd1234-5678-90ab-cdef-123456789012" + assert version == "2.5" + + def test_extract_uuid_and_version_from_obj_path_invalid(self): + """Test error when no UUID found in path.""" + obj_path = "invalid_path_without_uuid.xml" + with pytest.raises(ValueError, match="Cannot extract uuid"): + extract_uuid_and_version_from_obj_path(obj_path) + + def test_get_file_folder_and_name_from_path_with_folder(self): + """Test folder and filename extraction from path with folder.""" + path = "folder/subfolder/file.xml" + folder, filename = get_file_folder_and_name_from_path(path) + assert folder == "folder/subfolder/" + assert filename == "file.xml" + + def test_get_file_folder_and_name_from_path_without_folder(self): + """Test folder and filename extraction from path without folder.""" + path = "file.xml" + folder, filename = get_file_folder_and_name_from_path(path) + assert folder == "" + assert filename == "file.xml" + + def test_get_file_folder_and_name_from_path_multiple_levels(self): + """Test folder and filename extraction from deeply nested path.""" + path = "level1/level2/level3/level4/data.xml" + folder, filename = get_file_folder_and_name_from_path(path) + assert folder == "level1/level2/level3/level4/" + assert filename == "data.xml" + + +class TestEnergyMLObjectPathGeneration: + """Test suite for EnergyML object path generation.""" + + def test_gen_energyml_object_path_classic_resqml22(self, sample_triangulated_set_22): + """Test classic EPC path generation for RESQML 2.2 object.""" + result = gen_energyml_object_path(sample_triangulated_set_22, EpcExportVersion.CLASSIC) + assert result == f"TriangulatedSetRepresentation_{TEST_UUID}.xml" + + def test_gen_energyml_object_path_classic_resqml201(self, sample_triangulated_set_201): + """Test classic EPC path generation for RESQML 2.0.1 object.""" + result = gen_energyml_object_path(sample_triangulated_set_201, EpcExportVersion.CLASSIC) + assert 
result == f"obj_TriangulatedSetRepresentation_{TEST_UUID_2}.xml" + + def test_gen_energyml_object_path_expanded_without_version(self, sample_triangulated_set_22): + """Test expanded EPC path generation without object version.""" + result = gen_energyml_object_path(sample_triangulated_set_22, EpcExportVersion.EXPANDED) + assert f"namespace_resqml22/TriangulatedSetRepresentation_{TEST_UUID}.xml" == result + + def test_gen_energyml_object_path_expanded_with_version(self, sample_triangulated_set_22): + """Test expanded EPC path generation with object version.""" + sample_triangulated_set_22.object_version = "3.1" + result = gen_energyml_object_path(sample_triangulated_set_22, EpcExportVersion.EXPANDED) + assert ( + f"namespace_resqml22/version_{sample_triangulated_set_22.object_version}/TriangulatedSetRepresentation_{TEST_UUID}.xml" + == result + ) + + def test_gen_energyml_object_path_from_uri_string(self): + """Test path generation from URI string.""" + uri_str = f"eml:///resqml22.TriangulatedSetRepresentation({TEST_UUID})" + result = gen_energyml_object_path(uri_str, EpcExportVersion.CLASSIC) + assert f"TriangulatedSetRepresentation_{TEST_UUID}.xml" in result + + def test_gen_energyml_object_path_from_uri_object(self): + """Test path generation from Uri object.""" + uri = Uri( + domain="resqml", + domain_version="22", + object_type="TriangulatedSetRepresentation", + uuid=TEST_UUID, + ) + result = gen_energyml_object_path(uri, EpcExportVersion.CLASSIC) + assert f"TriangulatedSetRepresentation_{TEST_UUID}.xml" in result + + def test_gen_energyml_object_path_raises_error_no_uuid(self, sample_citation): + """Test error is raised when object has no UUID.""" + obj = TriangulatedSetRepresentation( + citation=sample_citation, + schema_version="2.2", + ) + # Don't set UUID + with pytest.raises(ValueError, match="must have a valid uuid"): + gen_energyml_object_path(obj, EpcExportVersion.CLASSIC) + + def test_gen_energyml_object_path_types(self): + """Test path generation for 
Types object.""" + types_ = Types() + result = gen_energyml_object_path(types_) + assert result == "[Content_Types].xml" + + def test_gen_rels_path_with_types(self): + """Test rels path generation for Types object.""" + types_ = Types() + result = gen_rels_path(types_) + assert result == "_rels/.rels" + + def test_gen_energyml_object_path_core_properties(self): + """Test path generation for CoreProperties object.""" + core_props = CoreProperties() + result = gen_energyml_object_path(core_props) + assert result == "docProps/core.xml" + + def test_gen_rels_path_with_core_properties(self): + """Test rels path generation for CoreProperties object.""" + core_props = CoreProperties() + result = gen_rels_path(core_props) + assert result == "docProps/_rels/core.xml.rels" + + def test_gen_rels_path_with_energyml_object_classic(self, sample_triangulated_set_22): + """Test rels path generation for EnergyML object in classic mode.""" + result = gen_rels_path(sample_triangulated_set_22, EpcExportVersion.CLASSIC) + assert result == f"_rels/TriangulatedSetRepresentation_{TEST_UUID}.xml.rels" + + def test_gen_rels_path_with_energyml_object_expanded(self, sample_triangulated_set_22): + """Test rels path generation for EnergyML object in expanded mode.""" + result = gen_rels_path(sample_triangulated_set_22, EpcExportVersion.EXPANDED) + assert "_rels/" in result + assert f"TriangulatedSetRepresentation_{TEST_UUID}.xml.rels" in result + + +class TestRelationshipFunctions: + """Test suite for relationship creation and management.""" + + def test_create_h5_external_relationship_default_index(self): + """Test HDF5 external relationship creation with default index.""" + h5_path = "external_data.h5" + result = create_h5_external_relationship(h5_path) + + assert isinstance(result, Relationship) + assert result.target == h5_path + assert result.type_value == EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type() + assert result.id == "Hdf5File" + assert result.target_mode == 
TargetMode.EXTERNAL + + def test_create_h5_external_relationship_custom_index(self): + """Test HDF5 external relationship creation with custom index.""" + h5_path = "data/measurements.h5" + result = create_h5_external_relationship(h5_path, current_idx=2) + + assert result.target == h5_path + assert result.id == "Hdf5File3" + assert result.type_value == EPCRelsRelationshipType.EXTERNAL_RESOURCE.get_type() + + def test_create_h5_external_relationship_zero_index(self): + """Test HDF5 external relationship creation with zero index.""" + h5_path = "test.h5" + result = create_h5_external_relationship(h5_path, current_idx=0) + + assert result.id == "Hdf5File" + + def test_get_reverse_dor_list_empty(self): + """Test reverse DOR list with empty object list.""" + result = get_reverse_dor_list([]) + assert result == {} + + def test_get_reverse_dor_list_with_objects(self, sample_triangulated_set_22, sample_citation): + """Test reverse DOR list with objects containing DORs.""" + # Create object with DOR + dor = as_dor(create_energyml_object("eml23.ProjectedCrs")) + + # Create triangulated set with patches that might have DORs + obj_with_dor = TriangulatedSetRepresentation( + uuid=gen_uuid(), + citation=sample_citation, + schema_version="2.2", + ) + # Adding a patch with contact element that has DOR + patch = TrianglePatch( + geometry=PointGeometry(points=Point3DExternalArray(), local_crs=dor) + ) # Reference to another object + obj_with_dor.triangle_patch = [patch] + + result = get_reverse_dor_list([obj_with_dor]) + + # Verify the result format - should have entries + assert isinstance(result, dict) + + +class TestDefaultObjectCreation: + """Test suite for default object creation functions.""" + + def test_create_default_core_properties_no_creator(self): + """Test default core properties creation without custom creator.""" + result = create_default_core_properties() + + assert isinstance(result, CoreProperties) + assert result.created is not None + assert result.creator is not 
None + assert "energyml-utils" in result.creator.any_element + assert result.identifier is not None + assert result.version == "1.0" + # Verify the identifier is a valid UUID format + assert "urn:uuid:" in result.identifier.any_element + + def test_create_default_core_properties_custom_creator(self): + """Test default core properties creation with custom creator.""" + custom_creator = "TestOrganization" + result = create_default_core_properties(creator=custom_creator) + + assert result.creator.any_element == custom_creator + assert result.version == "1.0" + assert result.created is not None + + def test_create_default_types(self): + """Test default Types object creation.""" + result = create_default_types() + + assert isinstance(result, Types) + assert len(result.default) == 1 + assert result.default[0].extension == "rels" + assert result.default[0].content_type == str(MimeType.RELS) + assert len(result.override) == 1 + assert result.override[0].content_type == str(MimeType.CORE_PROPERTIES) + assert result.override[0].part_name == "docProps/core.xml" + + +class TestExternalProxyMatching: + """Test suite for external proxy type matching.""" + + def test_match_external_proxy_type_with_valid_strings(self): + """Test matching external proxy type with valid strings.""" + assert match_external_proxy_type("EpcExternalPartReference") is True + assert match_external_proxy_type("eml23.EpcExternalPartReference") is True + assert match_external_proxy_type("external_reference") is True + assert match_external_proxy_type("EXTERNAL_REFERENCE") is True + + def test_match_external_proxy_type_with_invalid_strings(self): + """Test matching external proxy type with invalid strings.""" + assert match_external_proxy_type("Grid2dRepresentation") is False + assert match_external_proxy_type("WellboreTrajectory") is False + assert match_external_proxy_type("random_type") is False + assert match_external_proxy_type("TriangulatedSetRepresentation") is False + + def 
test_match_external_proxy_type_case_insensitive(self): + """Test that matching is case-insensitive.""" + assert match_external_proxy_type("External_Reference") is True + assert match_external_proxy_type("EXTERNAL_PART_REFERENCE") is True + + def test_match_external_proxy_type_with_path(self): + """Test matching external proxy type from file path.""" + assert match_external_proxy_type("path/to/obj_EpcExternalPartReference_uuid.xml") is True + + +class TestRelsDorType: + """Test suite for get_rels_dor_type function.""" + + def test_external_proxy_in_owner_rels_file(self): + """Test external proxy reference from the owner's rels file perspective.""" + # When we're in the rels file of an object that references an external proxy + result = get_rels_dor_type("EpcExternalPartReference", in_dor_owner_rels_file=True) + assert result == str(EPCRelsRelationshipType.ML_TO_EXTERNAL_PART_PROXY) + + def test_external_proxy_in_target_rels_file(self): + """Test external proxy reference from the target's rels file perspective.""" + # When we're in the rels file of the external proxy itself + result = get_rels_dor_type("eml23.EpcExternalPartReference", in_dor_owner_rels_file=False) + assert result == str(EPCRelsRelationshipType.EXTERNAL_PART_PROXY_TO_ML) + + def test_regular_object_in_owner_rels_file(self): + """Test regular object reference from the owner's rels file perspective.""" + # When we're in the rels file of an object that references a regular EnergyML object + result = get_rels_dor_type("resqml22.TriangulatedSetRepresentation", in_dor_owner_rels_file=True) + assert result == str(EPCRelsRelationshipType.DESTINATION_OBJECT) + + def test_regular_object_in_target_rels_file(self): + """Test regular object reference from the target's rels file perspective.""" + # When we're in the rels file of the referenced object + result = get_rels_dor_type("resqml22.Grid2dRepresentation", in_dor_owner_rels_file=False) + assert result == str(EPCRelsRelationshipType.SOURCE_OBJECT) + + def 
test_with_uri_object_external_proxy(self): + """Test with Uri object pointing to external proxy.""" + uri = Uri( + domain="eml", + domain_version="23", + object_type="EpcExternalPartReference", + uuid=TEST_UUID, + ) + result = get_rels_dor_type(uri, in_dor_owner_rels_file=True) + assert result == str(EPCRelsRelationshipType.ML_TO_EXTERNAL_PART_PROXY) + + def test_with_uri_object_regular_object(self): + """Test with Uri object pointing to regular EnergyML object.""" + uri = Uri( + domain="resqml", + domain_version="22", + object_type="TriangulatedSetRepresentation", + uuid=TEST_UUID, + ) + result = get_rels_dor_type(uri, in_dor_owner_rels_file=False) + assert result == str(EPCRelsRelationshipType.SOURCE_OBJECT) + + def test_with_energyml_object(self, sample_triangulated_set_22): + """Test with actual EnergyML object.""" + # Regular EnergyML object from owner perspective + result = get_rels_dor_type(sample_triangulated_set_22, in_dor_owner_rels_file=True) + assert result == str(EPCRelsRelationshipType.DESTINATION_OBJECT) + + def test_all_four_scenarios(self): + """Test all four possible combinations of external/regular × owner/target.""" + # Scenario 1: External proxy, owner's perspective + result1 = get_rels_dor_type("external_reference", True) + assert result1 == str(EPCRelsRelationshipType.ML_TO_EXTERNAL_PART_PROXY) + + # Scenario 2: External proxy, target's perspective + result2 = get_rels_dor_type("external_reference", False) + assert result2 == str(EPCRelsRelationshipType.EXTERNAL_PART_PROXY_TO_ML) + + # Scenario 3: Regular object, owner's perspective + result3 = get_rels_dor_type("WellboreTrajectory", True) + assert result3 == str(EPCRelsRelationshipType.DESTINATION_OBJECT) + + # Scenario 4: Regular object, target's perspective + result4 = get_rels_dor_type("WellboreTrajectory", False) + assert result4 == str(EPCRelsRelationshipType.SOURCE_OBJECT) + + +class TestDORCreation: + """Test suite for Data Object Reference (DOR) creation.""" + + def 
test_as_dor_with_none(self): + """Test DOR creation with None input returns None.""" + result = as_dor(None) + assert result is None + + def test_as_dor_from_uri_string(self): + """Test DOR creation from URI string.""" + uri_string = f"eml:///resqml22.TriangulatedSetRepresentation({TEST_UUID})" + result = as_dor(uri_string, dor_qualified_type="eml23.DataObjectReference") + + assert isinstance(result, DataObjectReference) + assert get_obj_uuid(result) == TEST_UUID + + def test_as_dor_from_energyml_object(self, sample_triangulated_set_22): + """Test DOR creation from EnergyML object.""" + result = as_dor(sample_triangulated_set_22, dor_qualified_type="eml23.DataObjectReference") + + assert isinstance(result, DataObjectReference) + assert get_obj_uuid(result) == TEST_UUID + assert result.title == "Test Object" + + def test_as_dor_from_existing_dor(self): + """Test DOR conversion from one DOR type to another.""" + source_dor = DataObjectReference( + uuid=TEST_UUID, + title="Original DOR", + qualified_type="resqml22.TriangulatedSetRepresentation", + ) + + result = as_dor(source_dor, dor_qualified_type="eml23.DataObjectReference") + + assert isinstance(result, DataObjectReference) + assert get_obj_uuid(result) == TEST_UUID + assert result.qualified_type == "resqml22.TriangulatedSetRepresentation" + assert result.title == "Original DOR" + + def test_as_dor_with_version(self, sample_triangulated_set_22): + """Test DOR creation preserves object version.""" + sample_triangulated_set_22.object_version = "2.5" + result = as_dor(sample_triangulated_set_22, dor_qualified_type="eml23.DataObjectReference") + + assert get_obj_version(result) == "2.5" + + def test_as_dor_from_uri_object(self): + """Test DOR creation from Uri object.""" + uri = Uri( + domain="resqml", + domain_version="22", + object_type="TriangulatedSetRepresentation", + uuid=TEST_UUID, + version="1.0", + ) + + result = as_dor(uri, dor_qualified_type="eml23.DataObjectReference") + + assert isinstance(result, 
DataObjectReference) + assert get_obj_uuid(result) == TEST_UUID + assert get_obj_version(result) == "1.0" + + +class TestObjectCreation: + """Test suite for EnergyML object creation functions.""" + + def test_create_energyml_object_with_defaults(self): + """Test EnergyML object creation with default parameters.""" + result = create_energyml_object("resqml22.TriangulatedSetRepresentation") + + assert isinstance(result, TriangulatedSetRepresentation) + assert result.citation is not None + assert result.citation.title == "New_Object" + assert get_obj_uuid(result) is not None + assert result.schema_version == "2.2" + + def test_create_energyml_object_with_custom_citation(self): + """Test EnergyML object creation with custom citation.""" + custom_citation = { + "title": "Custom Test Object", + "originator": "Test Organization", + } + + result = create_energyml_object( + "resqml22.TriangulatedSetRepresentation", + citation=custom_citation, + ) + + assert result.citation.title == "Custom Test Object" + assert result.citation.originator == "Test Organization" + + def test_create_energyml_object_with_custom_uuid(self): + """Test EnergyML object creation with custom UUID.""" + custom_uuid = TEST_UUID + + result = create_energyml_object( + "resqml22.TriangulatedSetRepresentation", + uuid=custom_uuid, + ) + + assert get_obj_uuid(result) == custom_uuid + + def test_create_energyml_object_resqml201(self): + """Test EnergyML object creation for RESQML 2.0.1.""" + result = create_energyml_object("resqml20.obj_TriangulatedSetRepresentation") + + assert isinstance(result, TriangulatedSetRepresentation201) + assert result.schema_version == "2.0" + + def test_create_external_part_reference_22(self): + """Test external part reference creation for EML 2.2.""" + h5_path = "data/external.h5" + result = create_external_part_reference("2.2", h5_path) + + assert result is not None + assert get_obj_uuid(result) is not None + # Note: The actual attributes depend on the EpcExternalPartReference 
schema + + def test_create_external_part_reference_20(self): + """Test external part reference creation for EML 2.0.""" + h5_path = "test.h5" + result = create_external_part_reference("2.0", h5_path) + + assert result is not None + assert get_obj_uuid(result) is not None + + def test_create_external_part_reference_with_custom_params(self): + """Test external part reference creation with custom citation and UUID.""" + custom_citation = {"title": "External Data Reference"} + custom_uuid = TEST_UUID_2 + + result = create_external_part_reference( + "2.1", + "external.h5", + citation=custom_citation, + uuid=custom_uuid, + ) + + assert get_obj_uuid(result) == custom_uuid + + def test_create_external_part_reference_version_formats(self): + """Test external part reference creation with different version formats.""" + # Test with dotted version + result1 = create_external_part_reference("2.2", "test1.h5") + assert result1 is not None + + # Test with underscore version + result2 = create_external_part_reference("2_1", "test2.h5") + assert result2 is not None + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/energyml-utils/tests/test_epc_validator.py b/energyml-utils/tests/test_epc_validator.py new file mode 100644 index 0000000..57347c3 --- /dev/null +++ b/energyml-utils/tests/test_epc_validator.py @@ -0,0 +1,646 @@ +# Copyright (c) 2023-2024 Geosiris. +# SPDX-License-Identifier: Apache-2.0 +""" +Unit tests for EPC validator module. + +Tests comprehensive validation of EPC (Energistics Packaging Conventions) files +according to the EPC v1.0 specification. 
+""" + +import io +import zipfile +from pathlib import Path +from typing import Optional + +import pytest + +from energyml.opc.opc import ( + CoreProperties, + Created, + Creator, + Default, + Identifier, + Override, + Relationship, + Relationships, + TargetMode, + Types, +) +from energyml.utils.epc_validator import ( + EpcParser, + EpcValidator, + ValidationResult, + validate_epc_file, +) +from energyml.utils.exception import ( + ContentTypeValidationError, + CorePropertiesValidationError, + InvalidXmlStructureError, + MissingRequiredFileError, + NamingConventionError, + RelationshipValidationError, + ZipIntegrityError, +) +from energyml.utils.serialization import serialize_xml + + +class TestValidationResult: + """Test ValidationResult class.""" + + def test_validation_result_initialization(self): + """Test ValidationResult initializes correctly.""" + result = ValidationResult() + assert result.is_valid is True + assert len(result.errors) == 0 + assert len(result.warnings) == 0 + assert len(result.info) == 0 + + def test_add_error_marks_invalid(self): + """Test adding error marks validation as invalid.""" + result = ValidationResult() + result.add_error("Test error") + assert result.is_valid is False + assert len(result.errors) == 1 + assert result.errors[0] == "Test error" + + def test_add_warning_keeps_valid(self): + """Test adding warning doesn't affect validity.""" + result = ValidationResult() + result.add_warning("Test warning") + assert result.is_valid is True + assert len(result.warnings) == 1 + + def test_add_info(self): + """Test adding info message.""" + result = ValidationResult() + result.add_info("Test info") + assert len(result.info) == 1 + + def test_str_representation(self): + """Test string representation of ValidationResult.""" + result = ValidationResult() + result.add_error("Error 1") + result.add_warning("Warning 1") + result.add_info("Info 1") + + output = str(result) + assert "FAILED" in output + assert "Error 1" in output + assert 
"Warning 1" in output + assert "Info 1" in output + + +class TestEpcParser: + """Test EPC parser functionality.""" + + @pytest.fixture + def minimal_epc(self) -> io.BytesIO: + """Create minimal valid EPC file in memory.""" + buffer = io.BytesIO() + + # Create content types + content_types = Types( + default=[ + Default(extension="rels", content_type="application/vnd.openxmlformats-package.relationships+xml") + ], + override=[ + Override( + part_name="/docProps/core.xml", + content_type="application/vnd.openxmlformats-package.core-properties+xml", + ) + ], + ) + + # Create core properties + core_props = CoreProperties( + created=Created(any_element="2024-01-01T00:00:00Z"), + creator=Creator(any_element="Test Creator"), + identifier=Identifier(any_element="test-identifier"), + ) + + # Create root relationships + root_rels = Relationships( + relationship=[ + Relationship( + id="CoreProperties", + type_value="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties", + target="docProps/core.xml", + ) + ] + ) + + # Create ZIP file + with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", serialize_xml(content_types)) + zf.writestr("_rels/.rels", serialize_xml(root_rels)) + zf.writestr("docProps/core.xml", serialize_xml(core_props)) + + buffer.seek(0) + return buffer + + def test_parser_context_manager(self, minimal_epc): + """Test parser as context manager.""" + with EpcParser(minimal_epc) as parser: + files = parser.list_files() + assert len(files) > 0 + + def test_parser_open_close(self, minimal_epc): + """Test explicit open/close.""" + parser = EpcParser(minimal_epc) + parser.open() + files = parser.list_files() + assert len(files) > 0 + parser.close() + + def test_parser_list_files(self, minimal_epc): + """Test listing files in archive.""" + with EpcParser(minimal_epc) as parser: + files = parser.list_files() + assert "[Content_Types].xml" in files + assert "_rels/.rels" in files + + def 
test_parser_read_file(self, minimal_epc): + """Test reading file from archive.""" + with EpcParser(minimal_epc) as parser: + content = parser.read_file("[Content_Types].xml") + assert content is not None + assert len(content) > 0 + + def test_parser_read_missing_file(self, minimal_epc): + """Test reading non-existent file raises error.""" + with EpcParser(minimal_epc) as parser: + with pytest.raises(MissingRequiredFileError): + parser.read_file("non_existent.xml") + + def test_parse_content_types(self, minimal_epc): + """Test parsing content types.""" + with EpcParser(minimal_epc) as parser: + content_types = parser.parse_content_types() + assert content_types is not None + assert len(content_types.default) > 0 + + def test_parse_core_properties(self, minimal_epc): + """Test parsing core properties.""" + with EpcParser(minimal_epc) as parser: + core_props = parser.parse_core_properties() + assert core_props is not None + + def test_parse_relationships(self, minimal_epc): + """Test parsing relationships.""" + with EpcParser(minimal_epc) as parser: + rels = parser.parse_relationships("_rels/.rels") + assert rels is not None + + def test_find_all_rels_files(self, minimal_epc): + """Test finding all .rels files.""" + with EpcParser(minimal_epc) as parser: + rels_files = parser.find_all_rels_files() + assert len(rels_files) > 0 + assert "_rels/.rels" in rels_files + + +class TestEpcValidator: + """Test EPC validator functionality.""" + + @pytest.fixture + def valid_epc(self) -> io.BytesIO: + """Create valid EPC file for testing.""" + buffer = io.BytesIO() + + # Create content types + content_types = Types( + default=[ + Default(extension="rels", content_type="application/vnd.openxmlformats-package.relationships+xml") + ], + override=[ + Override( + part_name="/docProps/core.xml", + content_type="application/vnd.openxmlformats-package.core-properties+xml", + ), + Override( + part_name="/resqml/obj_BoundaryFeature_12345.xml", + 
content_type="application/x-resqml+xml;version=2.0;type=obj_BoundaryFeature", + ), + ], + ) + + # Create core properties + core_props = CoreProperties( + created=Created(any_element="2024-01-01T00:00:00Z"), + creator=Creator(any_element="Test Creator"), + identifier=Identifier(any_element="test-identifier"), + ) + + # Create root relationships + root_rels = Relationships( + relationship=[ + Relationship( + id="CoreProperties", + type_value="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties", + target="docProps/core.xml", + ) + ] + ) + + # Create ZIP file + with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", serialize_xml(content_types)) + zf.writestr("_rels/.rels", serialize_xml(root_rels)) + zf.writestr("docProps/core.xml", serialize_xml(core_props)) + zf.writestr("resqml/obj_BoundaryFeature_12345.xml", "<test/>") + + buffer.seek(0) + return buffer + + def test_validate_valid_epc(self, valid_epc): + """Test validation of valid EPC file.""" + result = validate_epc_file(valid_epc) + assert result.is_valid is True + assert len(result.errors) == 0 + + def test_validator_initialization(self, valid_epc): + """Test validator initialization.""" + validator = EpcValidator(valid_epc) + assert validator.epc_path == valid_epc + assert validator.strict is True + assert validator.check_relationships is True + + def test_validate_with_strict_mode(self, valid_epc): + """Test validation in strict mode.""" + validator = EpcValidator(valid_epc, strict=True) + result = validator.validate() + assert result is not None + + def test_validate_with_lenient_mode(self, valid_epc): + """Test validation in lenient mode.""" + validator = EpcValidator(valid_epc, strict=False) + result = validator.validate() + assert result is not None + + def test_validate_missing_content_types(self): + """Test validation fails when [Content_Types].xml is missing.""" + buffer = io.BytesIO() + with zipfile.ZipFile(buffer, "w", 
zipfile.ZIP_DEFLATED) as zf: + zf.writestr("_rels/.rels", "<Relationships/>") + buffer.seek(0) + + result = validate_epc_file(buffer) + assert result.is_valid is False + assert any("Content_Types" in error for error in result.errors) + + def test_validate_missing_root_rels(self): + """Test validation fails when _rels/.rels is missing.""" + buffer = io.BytesIO() + with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf: + content_types = Types( + default=[ + Default(extension="rels", content_type="application/vnd.openxmlformats-package.relationships+xml") + ] + ) + zf.writestr("[Content_Types].xml", serialize_xml(content_types)) + buffer.seek(0) + + result = validate_epc_file(buffer) + assert result.is_valid is False + assert any("_rels/.rels" in error for error in result.errors) + + def test_validate_invalid_zip(self): + """Test validation fails for invalid ZIP file.""" + buffer = io.BytesIO(b"This is not a ZIP file") + result = validate_epc_file(buffer) + assert result.is_valid is False + + def test_validate_relationships_missing_target(self): + """Test validation detects missing relationship targets.""" + buffer = io.BytesIO() + + content_types = Types( + default=[Default(extension="rels", content_type="application/vnd.openxmlformats-package.relationships+xml")] + ) + + # Relationship pointing to non-existent file + root_rels = Relationships( + relationship=[ + Relationship( + id="Missing", + type_value="http://schemas.example.org/test", + target="missing_file.xml", + ) + ] + ) + + with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", serialize_xml(content_types)) + zf.writestr("_rels/.rels", serialize_xml(root_rels)) + + buffer.seek(0) + + result = validate_epc_file(buffer) + assert result.is_valid is False + assert any("not found" in error.lower() for error in result.errors) + + def test_validate_content_type_rels_default(self, valid_epc): + """Test validation checks .rels content type.""" + result = 
validate_epc_file(valid_epc) + # Should have info about .rels content type + assert result is not None + + def test_validate_core_properties_missing_fields(self): + """Test validation warns about missing core properties fields.""" + buffer = io.BytesIO() + + content_types = Types( + default=[ + Default(extension="rels", content_type="application/vnd.openxmlformats-package.relationships+xml") + ], + override=[ + Override( + part_name="/docProps/core.xml", + content_type="application/vnd.openxmlformats-package.core-properties+xml", + ) + ], + ) + + # Core properties with minimal fields + core_props = CoreProperties() + + root_rels = Relationships( + relationship=[ + Relationship( + id="CoreProperties", + type_value="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties", + target="docProps/core.xml", + ) + ] + ) + + with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", serialize_xml(content_types)) + zf.writestr("_rels/.rels", serialize_xml(root_rels)) + zf.writestr("docProps/core.xml", serialize_xml(core_props)) + + buffer.seek(0) + + result = validate_epc_file(buffer, strict=False) + # Should have warnings about missing fields + assert len(result.warnings) > 0 + + def test_validate_naming_invalid_characters(self): + """Test validation detects invalid characters in file names.""" + buffer = io.BytesIO() + + content_types = Types( + default=[Default(extension="rels", content_type="application/vnd.openxmlformats-package.relationships+xml")] + ) + + root_rels = Relationships(relationship=[]) + + with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", serialize_xml(content_types)) + zf.writestr("_rels/.rels", serialize_xml(root_rels)) + # This would be difficult to create in a real ZIP, but we can test the logic + + buffer.seek(0) + + result = validate_epc_file(buffer) + # Basic validation should pass + assert result is not None + + def 
test_validate_without_relationships_check(self, valid_epc): + """Test validation with relationship checking disabled.""" + result = validate_epc_file(valid_epc, check_relationships=False) + assert result is not None + + def test_validate_energyml_content_type_detection(self, valid_epc): + """Test detection of Energyml content types.""" + result = validate_epc_file(valid_epc) + # Should detect the resqml object + assert any("Energyml objects" in info for info in result.info) + + +class TestEpcValidatorWithRealFile: + """Test EPC validator with real EPC files.""" + + def test_validate_sample_epc_if_exists(self): + """Test validation with actual sample EPC file if available.""" + sample_paths = [ + Path("D:/Geosiris/OSDU/manifestTranslation/commons/data/testingPackageCpp.epc"), + Path("rc/epc/test.epc"), + Path("example/result/test.epc"), + ] + + sample_file = None + for path in sample_paths: + if path.exists(): + sample_file = path + break + + if sample_file is None: + pytest.skip("No sample EPC file available for testing") + + result = validate_epc_file(str(sample_file)) + # Real EPC files should generally be valid + print(f"\nValidation result for {sample_file}:") + print(result) + + +class TestEpcValidatorEdgeCases: + """Test edge cases and error handling.""" + + def test_validate_empty_relationships(self): + """Test validation with empty relationships file.""" + buffer = io.BytesIO() + + content_types = Types( + default=[Default(extension="rels", content_type="application/vnd.openxmlformats-package.relationships+xml")] + ) + + # Empty relationships + root_rels = Relationships(relationship=[]) + + with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", serialize_xml(content_types)) + zf.writestr("_rels/.rels", serialize_xml(root_rels)) + + buffer.seek(0) + + result = validate_epc_file(buffer) + print(f"\nErrors: {result.errors}") + print(f"Warnings: {result.warnings}") + # Should warn about empty relationships or pass in 
strict=False + assert any("empty" in warning.lower() for warning in result.warnings) or not result.is_valid + + def test_validate_malformed_xml(self): + """Test validation with malformed XML.""" + buffer = io.BytesIO() + + with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", "This is not valid XML") + zf.writestr("_rels/.rels", "<Relationships/>") + + buffer.seek(0) + + result = validate_epc_file(buffer) + assert result.is_valid is False + + def test_validate_relationship_without_id(self): + """Test validation detects relationships without ID.""" + buffer = io.BytesIO() + + content_types = Types( + default=[ + Default(extension="rels", content_type="application/vnd.openxmlformats-package.relationships+xml") + ], + override=[ + Override( + part_name="/docProps/core.xml", + content_type="application/vnd.openxmlformats-package.core-properties+xml", + ) + ], + ) + + # Core props to avoid that error + core_props = CoreProperties( + created=Created(any_element="2024-01-01T00:00:00Z"), + creator=Creator(any_element="Test"), + ) + + # Relationship without ID (not valid per spec) + root_rels = Relationships( + relationship=[ + Relationship( + id="", # Empty ID + type_value="http://schemas.example.org/test", + target="test.xml", + ) + ] + ) + + with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", serialize_xml(content_types)) + zf.writestr("_rels/.rels", serialize_xml(root_rels)) + zf.writestr("docProps/core.xml", serialize_xml(core_props)) + + buffer.seek(0) + + result = validate_epc_file(buffer) + print(f"\nErrors: {result.errors}") + print(f"Warnings: {result.warnings}") + assert result.is_valid is False + assert any("missing" in error.lower() and "id" in error.lower() for error in result.errors) + + def test_validate_external_relationship(self): + """Test validation handles external relationships correctly.""" + buffer = io.BytesIO() + + content_types = Types( + 
default=[Default(extension="rels", content_type="application/vnd.openxmlformats-package.relationships+xml")] + ) + + # External relationship (should not check if target exists) + root_rels = Relationships( + relationship=[ + Relationship( + id="External", + type_value="http://schemas.example.org/external", + target="http://example.com/resource", + target_mode=TargetMode.EXTERNAL, + ) + ] + ) + + with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", serialize_xml(content_types)) + zf.writestr("_rels/.rels", serialize_xml(root_rels)) + + buffer.seek(0) + + result = validate_epc_file(buffer) + # External relationships should not cause "target not found" errors + assert not any("http://example.com" in error for error in result.errors) + + +class TestValidationIntegration: + """Integration tests for complete validation workflows.""" + + def test_full_validation_workflow(self): + """Test complete validation workflow.""" + # Create a comprehensive EPC file + buffer = io.BytesIO() + + content_types = Types( + default=[ + Default(extension="rels", content_type="application/vnd.openxmlformats-package.relationships+xml") + ], + override=[ + Override( + part_name="/docProps/core.xml", + content_type="application/vnd.openxmlformats-package.core-properties+xml", + ), + Override( + part_name="/resqml/obj_TriangulatedSetRepresentation_uuid1.xml", + content_type="application/x-resqml+xml;version=2.2;type=obj_TriangulatedSetRepresentation", + ), + ], + ) + + core_props = CoreProperties( + created=Created(any_element="2024-01-01T00:00:00Z"), + creator=Creator(any_element="Test Integration"), + identifier=Identifier(any_element="integration-test-id"), + ) + + root_rels = Relationships( + relationship=[ + Relationship( + id="CoreProperties", + type_value="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties", + target="docProps/core.xml", + ), + Relationship( + id="Object1", + 
type_value="http://schemas.energistics.org/package/2012/relationships/destinationObject", + target="resqml/obj_TriangulatedSetRepresentation_uuid1.xml", + ), + ] + ) + + obj_rels = Relationships(relationship=[]) + + with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", serialize_xml(content_types)) + zf.writestr("_rels/.rels", serialize_xml(root_rels)) + zf.writestr("docProps/core.xml", serialize_xml(core_props)) + zf.writestr("resqml/obj_TriangulatedSetRepresentation_uuid1.xml", "<TriangulatedSetRepresentation/>") + zf.writestr("resqml/_rels/obj_TriangulatedSetRepresentation_uuid1.xml.rels", serialize_xml(obj_rels)) + + buffer.seek(0) + + # Test with different validation modes + result_strict = validate_epc_file(buffer, strict=True, check_relationships=True) + assert result_strict.is_valid is True + + buffer.seek(0) + result_lenient = validate_epc_file(buffer, strict=False, check_relationships=False) + assert result_lenient is not None + + def test_validation_result_formatting(self): + """Test validation result provides useful output.""" + buffer = io.BytesIO() + + # Create invalid EPC (missing required files) + with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("test.txt", "Invalid EPC") + + buffer.seek(0) + + result = validate_epc_file(buffer) + output = str(result) + + # Check output contains useful information + assert "FAILED" in output or "PASSED" in output + assert len(output) > 0 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/energyml-utils/tests/test_introspection.py b/energyml-utils/tests/test_introspection.py index 3fcb17f..24baaec 100644 --- a/energyml-utils/tests/test_introspection.py +++ b/energyml-utils/tests/test_introspection.py @@ -1,6 +1,30 @@ # Copyright (c) 2023-2024 Geosiris. 
# SPDX-License-Identifier: Apache-2.0 +""" +Test Suite for energyml.utils.introspection module + +This module contains comprehensive tests for introspection utilities used to +inspect, manipulate, and extract information from Energyml objects. +""" +from dataclasses import dataclass +from energyml.utils.epc_utils import MimeType, as_dor +import pytest +from typing import Any + import energyml.resqml.v2_0_1.resqmlv2 +from energyml.eml.v2_0.commonv2 import Citation as Citation20 +from energyml.eml.v2_3.commonv2 import Citation, ExternalDataArrayPart, DataObjectReference +from energyml.resqml.v2_0_1.resqmlv2 import FaultInterpretation +from energyml.resqml.v2_2.resqmlv2 import ( + TriangulatedSetRepresentation, + TrianglePatch, + ContactElement, + IntegerExternalArray, + ExternalDataArray, + PointGeometry, + Point3DExternalArray, + AbstractPoint3DArray, +) from energyml.opc.opc import Dcmitype1, Contributor from src.energyml.utils.constants import ( @@ -9,19 +33,197 @@ epoch, epoch_to_date, snake_case, + gen_uuid, ) from src.energyml.utils.introspection import ( is_primitive, is_enum, + is_abstract, get_class_from_name, get_class_from_content_type, + get_class_from_qualified_type, get_object_attribute, + get_object_attribute_no_verif, + get_object_attribute_rgx, + get_object_attribute_advanced, set_attribute_from_path, copy_attributes, + get_obj_identifier, + get_obj_uuid, + get_obj_version, + get_obj_title, + get_obj_pkg_pkgv_type_uuid_version, + get_obj_uri, + get_obj_type, + get_obj_qualified_type, + get_obj_content_type, + get_class_methods, + get_class_fields, + get_class_attributes, + get_class_attribute_type, + get_matching_class_attribute_name, + search_attribute_matching_name, + search_attribute_matching_type, + set_attribute_from_json_str, + set_attribute_from_dict, + get_module_name, + class_match_rgx, + is_dor, + get_dor_obj_info, + get_direct_dor_list, ) +# ============================================================================= +# TEST FIXTURES - 
Reusable test data +# ============================================================================= + +# Sample nested dictionary for attribute access tests +SAMPLE_NESTED_DICT = {"a": {"b": ["v_x", {"c": "v_test"}]}} + +# Sample data for copy_attributes tests +SAMPLE_DATA_IN = { + "a": {"b": "v_0", "c": "v_1"}, + "uuid": "215f8219-cabd-4e24-9e4f-e371cabc9622", + "objectVersion": "Resqml 2.0", + "non_existing": 42, +} + +SAMPLE_DATA_OUT_TEMPLATE = { + "a": None, + "Uuid": "8291afd6-ae01-49f5-bc96-267e7b27450d", + "object_version": "Resqml 2.0", +} + + +@pytest.fixture +def citation_v20(): + """Create a Citation v2.0 object for testing.""" + return Citation20( + title="Test Citation v2.0", + originator="Valentin", + creation=epoch_to_date(epoch()), + editor="test", + format="Geosiris", + last_update=epoch_to_date(epoch()), + ) + + +@pytest.fixture +def citation_v23(): + """Create a Citation v2.3 object for testing.""" + return Citation( + title="Test Citation v2.3", + originator="Valentin", + creation=epoch_to_date(epoch()), + editor="test", + format="Geosiris", + last_update=epoch_to_date(epoch()), + ) + + +@pytest.fixture +def fault_interpretation(citation_v20): + """Create a FaultInterpretation (resqml 2.0.1) object for testing.""" + return FaultInterpretation( + citation=citation_v20, + uuid=gen_uuid(), + object_version="0", + ) + + +@pytest.fixture +def triangulated_set_no_version(citation_v23, fault_interpretation): + """Create a TriangulatedSetRepresentation (resqml 2.2) without version.""" + trset_uuid = gen_uuid() + return TriangulatedSetRepresentation( + citation=citation_v23, + uuid=trset_uuid, + represented_object=as_dor(fault_interpretation), + triangle_patch=[ + TrianglePatch( + node_count=3, + triangles=IntegerExternalArray( + values=ExternalDataArray( + external_data_array_part=[ + ExternalDataArrayPart( + count=[6], + path_in_external_file=f"/RESQML/{trset_uuid}/triangles", + uri="samplefile_uri.h5", + mime_type=str(MimeType.HDF5), + ) + ] + ) + ), 
+ geometry=PointGeometry( + points=Point3DExternalArray( + coordinates=ExternalDataArray( + external_data_array_part=[ + ExternalDataArrayPart( + count=[9], + path_in_external_file=f"/RESQML/{trset_uuid}/points", + uri="samplefile_uri.h5", + mime_type=str(MimeType.HDF5), + ) + ] + ) + ), + ), + ) + ], + ) + + +@pytest.fixture +def triangulated_set_versioned(citation_v23, fault_interpretation): + """Create a TriangulatedSetRepresentation (resqml 2.2) with version.""" + trset_uuid = gen_uuid() + return TriangulatedSetRepresentation( + citation=citation_v23, + uuid=trset_uuid, + represented_object=as_dor(fault_interpretation), + object_version="3", + triangle_patch=[ + TrianglePatch( + node_count=3, + triangles=IntegerExternalArray( + values=ExternalDataArray( + external_data_array_part=[ + ExternalDataArrayPart( + count=[6], + path_in_external_file=f"/RESQML/{trset_uuid}/triangles", + uri="samplefile_uri.h5", + mime_type=str(MimeType.HDF5), + ) + ] + ) + ), + geometry=PointGeometry( + points=Point3DExternalArray( + coordinates=ExternalDataArray( + external_data_array_part=[ + ExternalDataArrayPart( + count=[9], + path_in_external_file=f"/RESQML/{trset_uuid}/points", + uri="samplefile_uri.h5", + mime_type=str(MimeType.HDF5), + ) + ] + ) + ), + ), + ) + ], + ) + + +# ============================================================================= +# TYPE CHECKING TESTS +# ============================================================================= + + def test_is_primitive(): + """Test identification of primitive types.""" assert is_primitive(1) assert is_primitive(int) assert is_primitive(float) @@ -33,22 +235,34 @@ def test_is_primitive(): def test_is_enum(): + """Test identification of Enum types.""" assert is_enum(Dcmitype1) assert not is_enum(Contributor) assert not is_enum(int) -def test_get_class_from_name(): - assert get_class_from_name("energyml.opc.opc.Dcmitype1") == Dcmitype1 +def test_is_abstract(): + """Test identification of abstract classes.""" + + assert 
is_abstract(AbstractPoint3DArray) + assert not is_abstract(Point3DExternalArray) + assert not is_abstract(int) + + +# ============================================================================= +# STRING CASE CONVERSION TESTS +# ============================================================================= def test_snake_case(): + """Test conversion to snake_case.""" assert snake_case("ThisIsASnakecase") == "this_is_a_snakecase" assert snake_case("This_IsASnakecase") == "this_is_a_snakecase" assert snake_case("This_isASnakecase") == "this_is_a_snakecase" def test_pascal_case(): + """Test conversion to PascalCase.""" assert pascal_case("ThisIsASnakecase") == "ThisIsASnakecase" assert pascal_case("This_IsASnakecase") == "ThisIsASnakecase" assert pascal_case("This_isASnakecase") == "ThisIsASnakecase" @@ -56,23 +270,206 @@ def test_pascal_case(): def test_epoch(): + """Test epoch time conversion utilities.""" now = epoch() assert date_to_epoch(epoch_to_date(now)) == now +# ============================================================================= +# CLASS RESOLUTION TESTS +# ============================================================================= + + +def test_get_class_from_name(): + """Test class resolution from fully qualified name.""" + assert get_class_from_name("energyml.opc.opc.Dcmitype1") == Dcmitype1 + + def test_get_class_from_content_type(): + """Test class resolution from content type string.""" found_type = get_class_from_content_type("resqml20.obj_Grid2dRepresentation") assert found_type is not None assert found_type == energyml.resqml.v2_0_1.resqmlv2.Grid2DRepresentation +def test_get_class_from_qualified_type(): + """Test class resolution from qualified type string. + + According to the docstring: Return a type object matching with the qualified-type. + This is similar to get_class_from_content_type. 
+ """ + assert get_class_from_qualified_type("resqml22.TriangulatedSetRepresentation") == TriangulatedSetRepresentation + assert get_class_from_qualified_type("resqml20.obj_FaultInterpretation") == FaultInterpretation + + +def test_get_module_name(): + """Test module name generation from domain and version. + + According to the function signature: get_module_name(domain: str, domain_version: str) + """ + assert get_module_name("resqml", "2.0") == "energyml.resqml.v2_0.resqmlv2" + assert get_module_name("eml", "2.3") == "energyml.eml.v2_3.commonv2" + assert get_module_name("eml", "2.0") == "energyml.eml.v2_0.commonv2" + assert get_module_name("witsml", "1.0") == "energyml.witsml.v1_0.witsmlv2" + + +# ============================================================================= +# CLASS INTROSPECTION TESTS +# ============================================================================= + + +def test_get_class_methods(): + """Test retrieval of class methods. + + According to the docstring: Returns the list of the methods names for a specific class. + """ + + class SampleClass: + def method_one(self): + pass + + def method_two(self): + pass + + def __str__(self): + return "SampleClass" + + methods = get_class_methods(SampleClass) + assert isinstance(methods, list) + # Methods should not include dunder methods or types + for method in methods: + assert not method.startswith("__") + + assert len(methods) == 2 + assert "method_one" in methods + assert "method_two" in methods + + +def test_get_class_fields(): + """Test retrieval of class fields. + + According to the docstring: Return all class fields names, mapped to their Field value. + If a dict is given, this function is the identity. 
+ """ + # Test with dict (identity function) + test_dict = {"a": 1, "b": 2} + assert get_class_fields(test_dict) == test_dict + + # Test with actual class + fields = get_class_fields(Citation) + + official_fields = { + "title", + "originator", + "creation", + "format", + "editor", + "last_update", + "description", + "editor_history", + "descriptive_keywords", + } + + assert isinstance(fields, dict) + # Should contain expected fields assert "title" in fields + assert len(fields) == len(official_fields) + assert set(fields.keys()) == official_fields + + +def test_get_class_attributes(): + """Test retrieval of class attributes. + + According to the docstring: returns a list of attributes (not private ones). + """ + + class SampleClass: + class_attr = "value" + _private_attr = "private" + + def __init__(self): + self.additional_attr = "additional" + + def method_one(self): + pass + + attributes = get_class_attributes(SampleClass) + assert isinstance(attributes, list) + + assert len(attributes) == 1 + assert "class_attr" in attributes + + +def test_get_class_attribute_type(): + """Test retrieval of attribute type from class.""" + citation_title_type = get_class_attribute_type(Citation, "title") + assert str(citation_title_type) == "Optional[str]" + + citation_editor_history_type = get_class_attribute_type(Citation, "editor_history") + assert str(citation_editor_history_type) == "List[str]" + + +def test_get_matching_class_attribute_name(citation_v23): + """Test finding correct attribute name from class.""" + # Test with case-insensitive matching + result = get_matching_class_attribute_name(citation_v23, "Title") + assert result == "title" + + result = get_matching_class_attribute_name(citation_v23, "ORIGINATOR") + assert result == "originator" + + +# ============================================================================= +# OBJECT ATTRIBUTE ACCESS TESTS +# ============================================================================= + + def 
test_get_object_attribute(): - data = {"a": {"b": ["v_x", {"c": "v_test"}]}} + """Test attribute access via dot-notation path.""" + data = SAMPLE_NESTED_DICT.copy() assert get_object_attribute(data, "a.b.1.c") == "v_test" +def test_get_object_attribute_no_verif(): + """Test attribute access without verification.""" + data = SAMPLE_NESTED_DICT.copy() + + # Test with dict + assert get_object_attribute_no_verif(data, "a") is not None + + # Test with list indexing + assert get_object_attribute_no_verif(data["a"]["b"], "0") == "v_x" + + # Test that non-existent attribute raises exception (no verification) + with pytest.raises(AttributeError): + get_object_attribute_no_verif(data, "non_existent") + + +def test_get_object_attribute_rgx(triangulated_set_versioned): + """Test attribute access using regex patterns. + + According to the docstring: Search the attribute name using regex for values between dots. + Example: [Cc]itation.[Tt]it\\.* + """ + + assert get_object_attribute_rgx(triangulated_set_versioned, "Citation.Title") == "Test Citation v2.3" + assert get_object_attribute_rgx(triangulated_set_versioned, "[Cc]itation.[Tt]it\\.*") == "Test Citation v2.3" + assert get_object_attribute_rgx(triangulated_set_versioned, "[Cc]itation.[Oo]rigin\\.*") == "Valentin" + + +def test_get_object_attribute_advanced(triangulated_set_versioned): + """Test advanced attribute access with matching.""" + assert get_object_attribute_advanced(triangulated_set_versioned, "citation.title") == "Test Citation v2.3" + assert get_object_attribute_advanced(triangulated_set_versioned, "citation.originator") == "Valentin" + + +# ============================================================================= +# OBJECT ATTRIBUTE MODIFICATION TESTS +# ============================================================================= + + def test_set_attribute_from_path(): - data = {"a": {"b": ["v_x", {"c": "v_test"}]}} + """Test setting attribute value via dot-notation path.""" + data = 
SAMPLE_NESTED_DICT.copy() assert get_object_attribute(data, "a.b.1.c") == "v_test" set_attribute_from_path(data, "a.b.1.c", "v_new") assert get_object_attribute(data, "a.b.1.c") == "v_new" @@ -80,18 +477,40 @@ def test_set_attribute_from_path(): assert get_object_attribute(data, "a") == "v_new" +def test_set_attribute_from_json_str(): + """Test setting attributes from JSON string. + + According to signature: set_attribute_from_json_str(obj: Any, json_input: str) -> None + """ + d_0 = {"a": "v_0", "b": {"c": "v_1"}} + d_1 = '{"a": "coucou"}' + + set_attribute_from_json_str(d_0, d_1) + assert d_0["a"] == "coucou" + + d_3 = '{"b": {"c": "v_2"}}' + set_attribute_from_json_str(d_0, d_3) + assert d_0["b"]["c"] == "v_2" + + +def test_set_attribute_from_dict(): + """Test setting attributes from dictionary.""" + d_0 = {"a": "v_0", "b": {"c": "v_1"}} + d_1 = {"a": "coucou"} + + set_attribute_from_dict(d_0, d_1) + assert d_0["a"] == "coucou" + + d_3 = {"b": {"c": "v_2"}} + set_attribute_from_dict(d_0, d_3) + assert d_0["b"]["c"] == "v_2" + + def test_copy_attributes_existing_ignore_case(): - data_in = { - "a": {"b": "v_0", "c": "v_1"}, - "uuid": "215f8219-cabd-4e24-9e4f-e371cabc9622", - "objectVersion": "Resqml 2.0", - "non_existing": 42, - } - data_out = { - "a": None, - "Uuid": "8291afd6-ae01-49f5-bc96-267e7b27450d", - "object_version": "Resqml 2.0", - } + """Test copying only existing attributes with case-insensitive matching.""" + data_in = SAMPLE_DATA_IN.copy() + data_out = SAMPLE_DATA_OUT_TEMPLATE.copy() + copy_attributes( obj_in=data_in, obj_out=data_out, @@ -105,17 +524,10 @@ def test_copy_attributes_existing_ignore_case(): def test_copy_attributes_ignore_case(): - data_in = { - "a": {"b": "v_0", "c": "v_1"}, - "uuid": "215f8219-cabd-4e24-9e4f-e371cabc9622", - "objectVersion": "Resqml 2.0", - "non_existing": 42, - } - data_out = { - "a": None, - "Uuid": "8291afd6-ae01-49f5-bc96-267e7b27450d", - "object_version": "Resqml 2.0", - } + """Test copying all attributes with 
case-insensitive matching.""" + data_in = SAMPLE_DATA_IN.copy() + data_out = SAMPLE_DATA_OUT_TEMPLATE.copy() + copy_attributes( obj_in=data_in, obj_out=data_out, @@ -129,17 +541,10 @@ def test_copy_attributes_ignore_case(): def test_copy_attributes_case_sensitive(): - data_in = { - "a": {"b": "v_0", "c": "v_1"}, - "uuid": "215f8219-cabd-4e24-9e4f-e371cabc9622", - "objectVersion": "Resqml 2.0", - "non_existing": 42, - } - data_out = { - "a": None, - "Uuid": "8291afd6-ae01-49f5-bc96-267e7b27450d", - "object_version": "Resqml 2.0", - } + """Test copying attributes with case-sensitive matching.""" + data_in = SAMPLE_DATA_IN.copy() + data_out = SAMPLE_DATA_OUT_TEMPLATE.copy() + copy_attributes( obj_in=data_in, obj_out=data_out, @@ -150,3 +555,283 @@ def test_copy_attributes_case_sensitive(): assert data_out["Uuid"] != data_in["uuid"] assert data_out["object_version"] == data_in["objectVersion"] assert data_out["non_existing"] == data_in["non_existing"] + + +# ============================================================================= +# ATTRIBUTE SEARCH TESTS +# ============================================================================= + + +def test_search_attribute_matching_name(triangulated_set_versioned): + """Test searching attributes by name pattern.""" + assert len(search_attribute_matching_name(triangulated_set_versioned, "title", search_in_sub_obj=False)) == 0 + title_deep = search_attribute_matching_name(triangulated_set_versioned, "title", search_in_sub_obj=True) + assert len(title_deep) == 2 + assert triangulated_set_versioned.citation.title in title_deep + assert triangulated_set_versioned.represented_object.title in title_deep + + +def test_search_attribute_matching_type(triangulated_set_versioned): + """Test searching attributes by type pattern.""" + search_results_deep = search_attribute_matching_type( + triangulated_set_versioned, type_rgx="ExternalDataArrayPart", deep_search=True + ) + assert len(search_results_deep) == 2 + + @dataclass + class 
SampleClass: + ci: ContactElement + dor: DataObjectReference + + s = SampleClass( + ci=ContactElement(uuid="007"), + dor=DataObjectReference(uuid="008"), + ) + + search_result_citation = search_attribute_matching_type(s, type_rgx="DataObjectReference", super_class_search=False) + assert len(search_result_citation) == 1 + search_result_citation_deep = search_attribute_matching_type( + s, type_rgx="DataObjectReference", super_class_search=True + ) + assert len(search_result_citation_deep) == 2 + + assert len(search_attribute_matching_type(s, type_rgx="SampleClass", return_self=True)) == 1 + assert len(search_attribute_matching_type(s, type_rgx="SampleClass", return_self=False)) == 0 + + +# ============================================================================= +# OBJECT METADATA EXTRACTION TESTS +# ============================================================================= + + +def test_get_obj_uuid(triangulated_set_no_version, fault_interpretation): + """Test extracting object UUID. + + According to the docstring: Return the object uuid (attribute must match + the following regex: "[Uu]u?id|UUID"). + """ + assert get_obj_uuid(triangulated_set_no_version) == triangulated_set_no_version.uuid + assert get_obj_uuid(fault_interpretation) == fault_interpretation.uuid + + +def test_get_obj_version(triangulated_set_versioned, triangulated_set_no_version, fault_interpretation): + """Test extracting object version. + + According to the docstring: Return the object version (check for "object_version" + or "version_string" attribute). 
+ """ + # Test object with explicit version + assert get_obj_version(triangulated_set_versioned) == "3" + assert get_obj_version(fault_interpretation) == "0" + + # Test object without version + version = get_obj_version(triangulated_set_no_version) + assert version is None or version == "" + + +def test_get_obj_version_edge_cases(): + """Test get_obj_version handles missing attributes gracefully.""" + + # Create object with only some version attributes + class MockObjWithVersionString: + version_string = "1.0" + + class MockObjWithCitationVersion: + class Citation: + version_string = "2.0" + + citation = Citation() + + class MockObjNoVersion: + some_other_attr = "value" + + # Should find version_string + assert get_obj_version(MockObjWithVersionString()) == "1.0" + + # Should find citation.version_string when object_version missing + assert get_obj_version(MockObjWithCitationVersion()) == "2.0" + + # Should return None when no version found + assert get_obj_version(MockObjNoVersion()) is None + + +def test_get_obj_title(triangulated_set_no_version, fault_interpretation): + """Test extracting object title.""" + assert get_obj_title(triangulated_set_no_version) == "Test Citation v2.3" + assert get_obj_title(fault_interpretation) == "Test Citation v2.0" + assert get_obj_title(as_dor(fault_interpretation)) == "Test Citation v2.0" + + class MockObjWithTitle: + name = "Mock Title" + + assert get_obj_title(MockObjWithTitle()) == "Mock Title" + + assert get_obj_title({"Title": "Dict Title"}) == "Dict Title" + assert get_obj_title({"title": "Dict Title Lower"}) == "Dict Title Lower" + assert get_obj_title({"what": 42}) is None + assert get_obj_title({"name": "Dict Title Lower"}) == "Dict Title Lower" + + # priority to citation.title + assert get_obj_title({"name": "Dict Title Lower", "citation": {"title": "Citation Title"}}) == "Citation Title" + + +def test_get_obj_type(triangulated_set_no_version, fault_interpretation): + """Test extracting object type name.""" + assert 
get_obj_type(triangulated_set_no_version) == "TriangulatedSetRepresentation" + assert get_obj_type(fault_interpretation) == "FaultInterpretation" + + # Test with type itself + assert get_obj_type(TriangulatedSetRepresentation) == "TriangulatedSetRepresentation" + + +def test_get_obj_identifier(triangulated_set_no_version, triangulated_set_versioned, fault_interpretation): + """Test object identifier generation (UUID.VERSION format).""" + assert get_obj_identifier(triangulated_set_no_version) == triangulated_set_no_version.uuid + "." + assert get_obj_identifier(fault_interpretation) == fault_interpretation.uuid + ".0" + assert get_obj_identifier(triangulated_set_versioned) == triangulated_set_versioned.uuid + ".3" + + +def test_get_obj_pkg_pkgv_type_uuid_version_obj_201(fault_interpretation): + """Test metadata extraction from resqml20 object.""" + domain, domain_version, object_type, obj_uuid, obj_version = get_obj_pkg_pkgv_type_uuid_version( + fault_interpretation + ) + + assert domain == "resqml" + assert domain_version == "20" + assert object_type == "obj_FaultInterpretation" + assert obj_uuid == fault_interpretation.uuid + assert obj_version == fault_interpretation.object_version + + +def test_get_obj_pkg_pkgv_type_uuid_version_obj_22(triangulated_set_no_version): + """Test metadata extraction from resqml22 object.""" + domain, domain_version, object_type, obj_uuid, obj_version = get_obj_pkg_pkgv_type_uuid_version( + triangulated_set_no_version + ) + + assert domain == "resqml" + assert domain_version == "22" + assert object_type == "TriangulatedSetRepresentation" + assert obj_uuid == triangulated_set_no_version.uuid + assert obj_version == triangulated_set_no_version.object_version + + +def test_get_obj_qualified_type(triangulated_set_no_version, fault_interpretation): + """Test qualified type generation. + + According to the docstring: Generates an object qualified type as: 'PKG.PKG_VERSION.OBJ_TYPE'. 
+ """ + assert "resqml22.TriangulatedSetRepresentation" == get_obj_qualified_type(triangulated_set_no_version) + assert "resqml20.obj_FaultInterpretation" == get_obj_qualified_type(fault_interpretation) + + +def test_get_obj_content_type(triangulated_set_no_version, fault_interpretation): + """Test content type generation from object.""" + expected_content_type = "application/x-resqml+xml;version=2.2;type=TriangulatedSetRepresentation" + assert get_obj_content_type(triangulated_set_no_version) == expected_content_type + + expected_content_type_fi = "application/x-resqml+xml;version=2.0;type=obj_FaultInterpretation" + assert get_obj_content_type(fault_interpretation) == expected_content_type_fi + + +def test_get_obj_uri(triangulated_set_no_version, fault_interpretation): + """Test URI generation for energyml objects.""" + uri_str = str(get_obj_uri(triangulated_set_no_version)) + assert uri_str == f"eml:///resqml22.TriangulatedSetRepresentation({triangulated_set_no_version.uuid})" + assert ( + str(get_obj_uri(as_dor(triangulated_set_no_version))) + == f"eml:///resqml22.TriangulatedSetRepresentation({triangulated_set_no_version.uuid})" + ) + + uri_str_with_dataspace = str(get_obj_uri(triangulated_set_no_version, "/MyDataspace/")) + assert ( + uri_str_with_dataspace + == f"eml:///dataspace('/MyDataspace/')/resqml22.TriangulatedSetRepresentation({triangulated_set_no_version.uuid})" + ) + + uri_str_fi = str(get_obj_uri(fault_interpretation)) + assert ( + uri_str_fi + == f"eml:///resqml20.obj_FaultInterpretation(uuid={fault_interpretation.uuid},version='{fault_interpretation.object_version}')" + ) + + uri_str_fi_dataspace = str(get_obj_uri(fault_interpretation, "/MyDataspace/")) + assert ( + uri_str_fi_dataspace + == f"eml:///dataspace('/MyDataspace/')/resqml20.obj_FaultInterpretation(uuid={fault_interpretation.uuid},version='{fault_interpretation.object_version}')" + ) + + +# ============================================================================= +# DATA OBJECT 
REFERENCE (DOR) TESTS +# ============================================================================= + + +def test_is_dor(triangulated_set_versioned): + """Test identification of Data Object Reference objects. + + According to the docstring: Returns True if the object is a DataObjectReference or + has ContentType/QualifiedType attributes. + """ + assert is_dor(as_dor(triangulated_set_versioned)) + assert not is_dor(triangulated_set_versioned) + assert is_dor( + { + "ContentType": "application/x-resqml+xml;version=2.2;type=RockVolumeFeature", + } + ) + assert is_dor( + { + "QualifiedType": "resqml22.TriangulatedSetRepresentation", + } + ) + assert not is_dor( + { + "what": 42, + } + ) + + +def test_get_dor_obj_info(triangulated_set_versioned): + """Test extracting information from DOR objects. + + According to the docstring: From a DOR object, return a tuple + (uuid, package name, package version, object_type, object_version). + """ + dor = as_dor(triangulated_set_versioned) + uuid, pkg_name, pkg_version, obj_type, obj_version = get_dor_obj_info(dor) + assert uuid == triangulated_set_versioned.uuid + assert pkg_name == "resqml" + assert pkg_version == "2.2" + assert obj_type == type(triangulated_set_versioned) + assert obj_version == triangulated_set_versioned.object_version + + +def test_get_direct_dor_list(triangulated_set_no_version): + """Test finding all DataObjectReference attributes. + + According to the docstring: Search all sub attribute of type "DataObjectReference". + """ + dor_list = get_direct_dor_list(triangulated_set_no_version) + assert isinstance(dor_list, list) + assert len(dor_list) == 1 + + +# ============================================================================= +# PATTERN MATCHING TESTS +# ============================================================================= + + +def test_class_match_rgx(): + """Test class name matching with regex. 
+ + According to signature: class_match_rgx(cls, rgx, super_class_search, re_flags) + Tests if a class name matches a regex pattern. + """ + # Test simple class name matching + assert class_match_rgx(Contributor, "Contributor") + assert class_match_rgx(Contributor, "contrib.*") + + # Test case-insensitive matching (default behavior) + assert class_match_rgx(Contributor, "contributor") diff --git a/energyml-utils/tests/test_mesh_numpy.py b/energyml-utils/tests/test_mesh_numpy.py new file mode 100644 index 0000000..aa14491 --- /dev/null +++ b/energyml-utils/tests/test_mesh_numpy.py @@ -0,0 +1,574 @@ +"""Tests for the zero-copy numpy mesh reader (mesh_numpy.py). + +Covers: +* NumpyMesh dataclass field shapes/dtypes. +* crs_displacement_np — vectorised CRS offset + Z-flip. +* _ViewWorkspace — routing of read_array to read_array_view. +* HDF5ArrayHandler.read_array_view — best-effort zero-copy. +* End-to-end read_numpy_mesh_object for all supported representation types, + using the EPC/HDF5 fixtures already present in ``rc/epc/``. +* numpy_mesh_to_pyvista round-trip (requires pyvista; skipped otherwise). 
+ +Run from the workspace root: + poetry run pytest tests/test_mesh_numpy.py -v +""" +import os +import tempfile +from typing import Optional +from unittest.mock import MagicMock + +import numpy as np +import pytest + +from energyml.utils.data.mesh_numpy import ( + NumpyMesh, + NumpyMultiMesh, + NumpyPointSetMesh, + NumpyPolylineMesh, + NumpySurfaceMesh, + NumpyVolumeMesh, + _ViewWorkspace, + _build_vtk_faces_from_triangles, + _build_vtk_faces_from_quads, + _build_vtk_lines_from_segments, + _ensure_float64_points, + crs_displacement_np, + read_numpy_mesh_object, + numpy_mesh_to_pyvista, + numpy_multi_mesh_to_pyvista, +) + +# --------------------------------------------------------------------------- +# Paths helpers +# --------------------------------------------------------------------------- + +_WORKSPACE_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +_EPC_DIR = os.path.join(_WORKSPACE_ROOT, "rc", "epc") +_EPC22 = os.path.join(_EPC_DIR, "testingPackageCpp22.epc") +_EPC20 = os.path.join(_EPC_DIR, "testingPackageCpp.epc") + + +def _epc22_available() -> bool: + return os.path.isfile(_EPC22) + + +def _epc20_available() -> bool: + return os.path.isfile(_EPC20) + + +# --------------------------------------------------------------------------- +# 1. 
Dataclass shape / dtype invariants +# --------------------------------------------------------------------------- + +class TestNumpyMeshDataclasses: + def test_point_set_defaults(self): + m = NumpyPointSetMesh() + assert m.points.shape == (0, 3) + assert m.points.dtype == np.float64 + + def test_surface_mesh_defaults(self): + m = NumpySurfaceMesh() + assert m.faces.dtype == np.int64 + assert m.faces.ndim == 1 + + def test_polyline_mesh_defaults(self): + m = NumpyPolylineMesh() + assert m.lines.dtype == np.int64 + + def test_volume_mesh_defaults(self): + m = NumpyVolumeMesh() + assert m.cells.dtype == np.int64 + assert m.cell_types.dtype == np.uint8 + + def test_surface_mesh_populated(self): + pts = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0]], dtype=np.float64) + faces = np.array([3, 0, 1, 2], dtype=np.int64) + m = NumpySurfaceMesh(points=pts, faces=faces) + assert m.points.shape == (3, 3) + assert m.faces[0] == 3 # VTK triangle count prefix + + +# --------------------------------------------------------------------------- +# 2. 
_ensure_float64_points +# --------------------------------------------------------------------------- + +class TestEnsureFloat64Points: + def test_flat_list(self): + a = _ensure_float64_points([1, 2, 3, 4, 5, 6]) + assert a.shape == (2, 3) + assert a.dtype == np.float64 + + def test_nested_list(self): + a = _ensure_float64_points([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + assert a.shape == (2, 3) + assert a.dtype == np.float64 + + def test_already_correct_array(self): + arr = np.zeros((5, 3), dtype=np.float64) + result = _ensure_float64_points(arr) + # Should return a view (same data, no copy) + assert result.shape == (5, 3) + assert result.dtype == np.float64 + + def test_wrong_col_count_raises(self): + with pytest.raises(ValueError): + _ensure_float64_points(np.zeros((4, 5))) # 5 cols is never valid + + def test_2d_points_padded_with_zeros(self): + a = _ensure_float64_points(np.array([[1.0, 2.0], [3.0, 4.0]])) + assert a.shape == (2, 3) + np.testing.assert_array_equal(a[:, 2], [0.0, 0.0]) + + +# --------------------------------------------------------------------------- +# 3. 
VTK connectivity builders +# --------------------------------------------------------------------------- + +class TestVTKBuilders: + def test_faces_from_triangles(self): + tri = np.array([[0, 1, 2], [1, 2, 3]], dtype=np.int64) + faces = _build_vtk_faces_from_triangles(tri) + expected = np.array([3, 0, 1, 2, 3, 1, 2, 3], dtype=np.int64) + np.testing.assert_array_equal(faces, expected) + + def test_faces_from_quads(self): + quad = np.array([[0, 1, 2, 3]], dtype=np.int64) + faces = _build_vtk_faces_from_quads(quad) + expected = np.array([4, 0, 1, 2, 3], dtype=np.int64) + np.testing.assert_array_equal(faces, expected) + + def test_lines_from_segments_3pts(self): + lines = _build_vtk_lines_from_segments(3) + # [2, 0, 1, 2, 1, 2] + expected = np.array([2, 0, 1, 2, 1, 2], dtype=np.int64) + np.testing.assert_array_equal(lines, expected) + + def test_lines_from_segments_1pt(self): + lines = _build_vtk_lines_from_segments(1) + assert len(lines) == 0 + + def test_lines_from_segments_0pts(self): + lines = _build_vtk_lines_from_segments(0) + assert len(lines) == 0 + + +# --------------------------------------------------------------------------- +# 4. 
crs_displacement_np +# --------------------------------------------------------------------------- + +class TestCrsDisplacementNp: + def _make_crs(self, x=0.0, y=0.0, z=0.0, z_reversed=False): + """Build a minimal mock CRS object.""" + from unittest.mock import patch + + crs = MagicMock() + + # Patch the helper functions used by crs_displacement_np + return crs, [x, y, z], z_reversed + + def test_offset_only(self): + """Test pure XYZ offset without Z reversal.""" + pts = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float64) + crs = MagicMock() + + # Patch helper functions at the module level + import energyml.utils.data.mesh_numpy as mn + orig_offset = mn.get_crs_origin_offset + orig_zrev = mn.is_z_reversed + try: + mn.get_crs_origin_offset = lambda crs_obj: [10.0, 20.0, 30.0] + mn.is_z_reversed = lambda crs_obj: False + result = crs_displacement_np(pts.copy(), crs) + finally: + mn.get_crs_origin_offset = orig_offset + mn.is_z_reversed = orig_zrev + + np.testing.assert_allclose(result, [[11.0, 22.0, 33.0], [14.0, 25.0, 36.0]]) + + def test_z_reversal(self): + """Test Z-axis inversion.""" + pts = np.array([[0.0, 0.0, 100.0]], dtype=np.float64) + crs = MagicMock() + + import energyml.utils.data.mesh_numpy as mn + orig_offset = mn.get_crs_origin_offset + orig_zrev = mn.is_z_reversed + try: + mn.get_crs_origin_offset = lambda crs_obj: [0.0, 0.0, 0.0] + mn.is_z_reversed = lambda crs_obj: True + result = crs_displacement_np(pts.copy(), crs) + finally: + mn.get_crs_origin_offset = orig_offset + mn.is_z_reversed = orig_zrev + + assert result[0, 2] == pytest.approx(-100.0) + + def test_inplace_false_no_mutation(self): + """inplace=False must not mutate the original array.""" + pts = np.array([[1.0, 2.0, 3.0]], dtype=np.float64) + original = pts.copy() + crs = MagicMock() + + import energyml.utils.data.mesh_numpy as mn + orig_offset = mn.get_crs_origin_offset + orig_zrev = mn.is_z_reversed + try: + mn.get_crs_origin_offset = lambda crs_obj: [1.0, 1.0, 1.0] + 
mn.is_z_reversed = lambda crs_obj: False + result = crs_displacement_np(pts, crs, inplace=False) + finally: + mn.get_crs_origin_offset = orig_offset + mn.is_z_reversed = orig_zrev + + np.testing.assert_array_equal(pts, original, err_msg="Source array was mutated despite inplace=False") + np.testing.assert_allclose(result, [[2.0, 3.0, 4.0]]) + + def test_none_crs_returns_unchanged(self): + pts = np.array([[1.0, 2.0, 3.0]], dtype=np.float64) + result = crs_displacement_np(pts, None) + np.testing.assert_array_equal(result, pts) + + +# --------------------------------------------------------------------------- +# 5. _ViewWorkspace +# --------------------------------------------------------------------------- + +class TestViewWorkspace: + def test_read_array_redirects_to_view(self): + """read_array on _ViewWorkspace should call read_array_view on the wrapped ws.""" + ws = MagicMock() + ws.read_array_view.return_value = np.array([1, 2, 3]) + ws.some_other_attr = "hello" + + view_ws = _ViewWorkspace(ws) + # read_array calls must be redirected + result = view_ws.read_array("proxy", "path/in/h5", None, None, None) + ws.read_array_view.assert_called_once_with("proxy", "path/in/h5", None, None, None) + np.testing.assert_array_equal(result, [1, 2, 3]) + + def test_other_attrs_forwarded(self): + ws = MagicMock() + ws.some_method.return_value = 42 + view_ws = _ViewWorkspace(ws) + assert view_ws.some_method() == 42 + + +# --------------------------------------------------------------------------- +# 6. 
HDF5ArrayHandler.read_array_view +# --------------------------------------------------------------------------- + +class TestHDF5ArrayHandlerReadArrayView: + """Verify zero-copy semantics of read_array_view vs read_array.""" + + @pytest.fixture + def h5_with_contiguous_dataset(self, tmp_path): + """Create a small HDF5 file with a contiguous (non-chunked) dataset.""" + h5py = pytest.importorskip("h5py") + fpath = str(tmp_path / "test_view.h5") + arr = np.arange(12, dtype=np.float64).reshape(4, 3) + with h5py.File(fpath, "w") as f: + # contiguous layout — default when no chunks specified + f.create_dataset("/pts", data=arr, chunks=None) + return fpath, arr + + def test_read_array_view_returns_correct_data(self, h5_with_contiguous_dataset): + from energyml.utils.data.datasets_io import HDF5ArrayHandler + fpath, expected = h5_with_contiguous_dataset + handler = HDF5ArrayHandler() + result = handler.read_array_view(fpath, "/pts") + handler.file_cache.close_all() + assert result is not None + np.testing.assert_allclose(result, expected) + + def test_read_array_view_is_ndarray(self, h5_with_contiguous_dataset): + from energyml.utils.data.datasets_io import HDF5ArrayHandler + fpath, _ = h5_with_contiguous_dataset + handler = HDF5ArrayHandler() + result = handler.read_array_view(fpath, "/pts") + handler.file_cache.close_all() + assert isinstance(result, np.ndarray) + + def test_subselection_correct(self, h5_with_contiguous_dataset): + from energyml.utils.data.datasets_io import HDF5ArrayHandler + fpath, expected = h5_with_contiguous_dataset + handler = HDF5ArrayHandler() + # Select rows 1 and 2 (start=1, count=2 along axis-0) + result = handler.read_array_view(fpath, "/pts", start_indices=[1, 0], counts=[2, 3]) + handler.file_cache.close_all() + np.testing.assert_allclose(result, expected[1:3]) + + def test_storage_interface_default_fallback(self): + """EnergymlStorageInterface.read_array_view must call read_array by default.""" + from energyml.utils.storage_interface 
import EnergymlStorageInterface + + class _Concrete(EnergymlStorageInterface): + """Minimal concrete subclass that does NOT override read_array_view.""" + def get_object(self, identifier): return None + def get_object_by_uuid(self, uuid): return [] + def put_object(self, obj, dataspace=None): return None + def delete_object(self, identifier): return False + def read_array(self, proxy, path, start=None, counts=None, uri=None): + return np.array([99.0]) + def write_array(self, *a, **kw): return False + def get_array_metadata(self, *a, **kw): return None + def list_objects(self, *a, **kw): return [] + def get_obj_rels(self, obj): return [] + def close(self): pass + + ws = _Concrete() + result = ws.read_array_view("p", "path") + np.testing.assert_array_equal(result, [99.0]) + + +# --------------------------------------------------------------------------- +# 7. End-to-end representation readers (require EPC fixtures) +# --------------------------------------------------------------------------- + +@pytest.mark.skipif(not _epc22_available(), reason="testingPackageCpp22.epc not found in rc/epc/") +class TestReadNumpyMeshObjectEPC22: + """Integration tests against testingPackageCpp22.epc.""" + + @pytest.fixture(scope="class") + def epc22(self): + from energyml.utils.epc import Epc + return Epc.read_file(_EPC22, read_rels_from_files=False, recompute_rels=False) + + # --- TriangulatedSetRepresentation --- + def test_triangulated_set_returns_surface_mesh(self, epc22): + obj = epc22.get_object_by_uuid("6e678338-3b53-49b6-8801-faee493e0c42") + if not obj: + pytest.skip("TriangulatedSet UUID not found in fixture EPC") + multi = read_numpy_mesh_object(obj[0], workspace=epc22) + assert isinstance(multi, NumpyMultiMesh) + patches = multi.flat_patches() + assert patches, "Expected at least one patch" + for m in patches: + assert isinstance(m, NumpySurfaceMesh) + + def test_triangulated_set_points_shape_dtype(self, epc22): + obj = 
epc22.get_object_by_uuid("6e678338-3b53-49b6-8801-faee493e0c42") + if not obj: + pytest.skip("TriangulatedSet UUID not found in fixture EPC") + multi = read_numpy_mesh_object(obj[0], workspace=epc22) + for m in multi.flat_patches(): + assert m.points.ndim == 2 + assert m.points.shape[1] == 3 + assert m.points.dtype == np.float64 + + def test_triangulated_set_faces_dtype_and_format(self, epc22): + obj = epc22.get_object_by_uuid("6e678338-3b53-49b6-8801-faee493e0c42") + if not obj: + pytest.skip("TriangulatedSet UUID not found in fixture EPC") + multi = read_numpy_mesh_object(obj[0], workspace=epc22) + for m in multi.flat_patches(): + assert isinstance(m, NumpySurfaceMesh) + assert m.faces.dtype == np.int64 + assert m.faces.ndim == 1 + # First element must be 3 (triangle) + assert m.faces[0] == 3, "VTK face array must start with face vertex count (3 for triangles)" + + def test_triangulated_set_no_lists(self, epc22): + """Guarantee no Python lists survive into the mesh dataclass.""" + obj = epc22.get_object_by_uuid("6e678338-3b53-49b6-8801-faee493e0c42") + if not obj: + pytest.skip("TriangulatedSet UUID not found in fixture EPC") + multi = read_numpy_mesh_object(obj[0], workspace=epc22) + for m in multi.flat_patches(): + assert isinstance(m.points, np.ndarray), "points must be ndarray" + assert isinstance(m.faces, np.ndarray), "faces must be ndarray" + + # --- PointSetRepresentation --- + def test_pointset_returns_pointset_mesh(self, epc22): + obj = epc22.get_object_by_uuid("fbc5466c-94cd-46ab-8b48-2ae2162b372f") + if not obj: + pytest.skip("PointSet UUID not found in fixture EPC") + multi = read_numpy_mesh_object(obj[0], workspace=epc22) + assert isinstance(multi, NumpyMultiMesh) + patches = multi.flat_patches() + assert patches + for m in patches: + assert isinstance(m, NumpyPointSetMesh) + assert m.points.ndim == 2 + assert m.points.shape[1] == 3 + assert m.points.dtype == np.float64 + + # --- PolylineRepresentation --- + def 
test_polyline_returns_polyline_mesh(self, epc22): + obj = epc22.get_object_by_uuid("a54b8399-d3ba-4d4b-b215-8d4f8f537e66") + if not obj: + pytest.skip("Polyline UUID not found in fixture EPC") + multi = read_numpy_mesh_object(obj[0], workspace=epc22) + assert isinstance(multi, NumpyMultiMesh) + patches = multi.flat_patches() + assert patches + for m in patches: + assert isinstance(m, NumpyPolylineMesh) + assert m.points.dtype == np.float64 + assert m.lines.dtype == np.int64 + + # --- WellboreFrameRepresentation --- + def test_wellbore_frame_returns_polyline(self, epc22): + obj = epc22.get_object_by_uuid("d873e243-d893-41ab-9a3e-d20b851c099f") + if not obj: + pytest.skip("WellboreFrame UUID not found in fixture EPC") + multi = read_numpy_mesh_object(obj[0], workspace=epc22) + assert isinstance(multi, NumpyMultiMesh) + patches = multi.flat_patches() + assert patches + for m in patches: + assert isinstance(m, NumpyPolylineMesh) + assert m.points.ndim == 2 + assert m.points.shape[1] == 3 + + def test_wellbore_frame_lines_vtk_format(self, epc22): + obj = epc22.get_object_by_uuid("d873e243-d893-41ab-9a3e-d20b851c099f") + if not obj: + pytest.skip("WellboreFrame UUID not found in fixture EPC") + multi = read_numpy_mesh_object(obj[0], workspace=epc22) + for m in multi.flat_patches(): + assert isinstance(m, NumpyPolylineMesh) + if len(m.lines) > 0: + # First element is count (number of points in first line segment) + assert m.lines[0] == 2, "VTK segment should start with count=2" + + # --- Grid2dRepresentation --- + def test_grid2d_returns_surface_mesh(self, epc22): + # Try to find a Grid2dRepresentation in the EPC + all_objs = epc22.list_objects() + grid2d_uuids = [ + r.uuid for r in all_objs + if "Grid2d" in (r.object_type or "") + ] + if not grid2d_uuids: + pytest.skip("No Grid2dRepresentation found in testingPackageCpp22.epc") + obj = epc22.get_object_by_uuid(grid2d_uuids[0]) + if not obj: + pytest.skip("Grid2d object not found") + meshes = 
read_numpy_mesh_object(obj[0], workspace=epc22) + for m in meshes: + assert isinstance(m, NumpySurfaceMesh) + assert m.points.dtype == np.float64 + if len(m.faces) > 0: + assert m.faces[0] == 4, "Grid2d quads: first VTK face entry must be 4" + + # --- RepresentationSet --- + def test_representation_set_returns_mixed_mesh_list(self, epc22): + obj = epc22.get_object_by_uuid("6b992199-5b47-4624-a62c-b70857133cda") + if not obj: + pytest.skip("RepresentationSet UUID not found in fixture EPC") + multi = read_numpy_mesh_object(obj[0], workspace=epc22) + assert isinstance(multi, NumpyMultiMesh) + for m in multi.flat_patches(): + assert isinstance(m, NumpyMesh) + + # --- IjkGrid + UnstructuredGrid: return empty when geometry is missing --- + + def test_ijk_grid_returns_empty_when_no_ni_nj_nk(self, epc22): + """Reader returns an empty NumpyMultiMesh when ni/nj/nk are absent.""" + from energyml.utils.data.mesh_numpy import read_numpy_ijk_grid_representation + mock_obj = MagicMock() + mock_obj.ni = None + mock_obj.nj = None + mock_obj.nk = None + result = read_numpy_ijk_grid_representation(mock_obj, epc22) + assert isinstance(result, NumpyMultiMesh) + assert result.patch_count() == 0 + + def test_ijk_grid_parametric_raises_not_supported(self): + """Reader raises NotSupportedError for Point3DParametricArray geometry.""" + from energyml.utils.exception import NotSupportedError + from energyml.utils.data.mesh_numpy import read_numpy_ijk_grid_representation + mock_obj = MagicMock() + mock_obj.ni = 2 + mock_obj.nj = 2 + mock_obj.nk = 1 + mock_obj.kgaps = None + # Create a real instance of a class named "Point3DParametricArray" so that + # type(pts_obj).__name__ == "Point3DParametricArray" (contains "Parametric"). + # Using MagicMock().__class__ = ... does NOT affect type(), only __class__. 
+ mock_pts = type("Point3DParametricArray", (), {})() + mock_geom = MagicMock() + mock_geom.column_layer_split_coordinate_lines = None + # search_attribute_matching_name_with_path will find a parametric Points obj + from unittest.mock import patch as mock_patch + with mock_patch( + "energyml.utils.data.mesh_numpy.search_attribute_matching_name_with_path", + return_value=[("Points", mock_pts)], + ), mock_patch("energyml.utils.data.mesh_numpy.get_obj_uri", return_value="mock-uri"): + mock_obj.geometry = mock_geom + with pytest.raises(NotSupportedError): + read_numpy_ijk_grid_representation(mock_obj) + + def test_unstructured_grid_returns_empty_when_no_geometry(self, epc22): + """Reader returns an empty NumpyMultiMesh when geometry is absent.""" + from energyml.utils.data.mesh_numpy import read_numpy_unstructured_grid_representation + mock_obj = MagicMock() + mock_obj.geometry = None + result = read_numpy_unstructured_grid_representation(mock_obj, epc22) + assert isinstance(result, NumpyMultiMesh) + assert result.patch_count() == 0 + + +# --------------------------------------------------------------------------- +# 8. 
numpy_mesh_to_pyvista round-trip +# --------------------------------------------------------------------------- + +try: + import pyvista as _pyvista + _PYVISTA_AVAILABLE = True +except ImportError: + _PYVISTA_AVAILABLE = False + + +@pytest.mark.skipif(not _PYVISTA_AVAILABLE, reason="pyvista not installed") +class TestNumpyMeshToPyvista: + def test_surface_mesh(self): + pts = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0]], dtype=np.float64) + faces = np.array([3, 0, 1, 2], dtype=np.int64) + m = NumpySurfaceMesh(points=pts, faces=faces) + pv_mesh = numpy_mesh_to_pyvista(m) + import pyvista + assert isinstance(pv_mesh, pyvista.PolyData) + assert pv_mesh.n_points == 3 + assert pv_mesh.n_cells == 1 + + def test_polyline_mesh(self): + pts = np.array([[0, 0, 0], [1, 0, 0], [2, 0, 0]], dtype=np.float64) + lines = _build_vtk_lines_from_segments(3) + m = NumpyPolylineMesh(points=pts, lines=lines) + pv_mesh = numpy_mesh_to_pyvista(m) + import pyvista + assert isinstance(pv_mesh, pyvista.PolyData) + assert pv_mesh.n_points == 3 + + def test_point_set_mesh(self): + pts = np.random.rand(10, 3).astype(np.float64) + m = NumpyPointSetMesh(points=pts) + pv_mesh = numpy_mesh_to_pyvista(m) + import pyvista + assert isinstance(pv_mesh, pyvista.PolyData) + assert pv_mesh.n_points == 10 + + def test_pyvista_missing_raises_import_error(self, monkeypatch): + """When pyvista is not importable, numpy_mesh_to_pyvista raises ImportError.""" + import builtins + real_import = builtins.__import__ + + def _mock_import(name, *args, **kwargs): + if name == "pyvista": + raise ImportError("mocked missing pyvista") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", _mock_import) + m = NumpyPointSetMesh() + with pytest.raises(ImportError, match="pyvista"): + numpy_mesh_to_pyvista(m) + + def test_to_pyvista_method(self): + """NumpyMesh.to_pyvista() convenience method.""" + pts = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0]], dtype=np.float64) + faces = np.array([3, 0, 1, 
2], dtype=np.int64) + m = NumpySurfaceMesh(points=pts, faces=faces) + pv_mesh = m.to_pyvista() + import pyvista + assert isinstance(pv_mesh, pyvista.PolyData) diff --git a/energyml-utils/tests/test_uri.py b/energyml-utils/tests/test_uri.py index 5dda5a3..4ecf994 100644 --- a/energyml-utils/tests/test_uri.py +++ b/energyml-utils/tests/test_uri.py @@ -1,7 +1,8 @@ # Copyright (c) 2023-2024 Geosiris. # SPDX-License-Identifier: Apache-2.0 -from energyml.utils.uri import Uri, parse_uri +from energyml.utils.exception import NotUriError +from energyml.utils.uri import Uri, parse_uri, parse_uri_raise_if_failed from energyml.utils.introspection import get_obj_uri from energyml.resqml.v2_0_1.resqmlv2 import TriangulatedSetRepresentation, ObjTriangulatedSetRepresentation @@ -46,8 +47,17 @@ def test_uri_eq(): def test_uri_error(): - assert parse_uri("eml//") is None - assert parse_uri("a random text") is None + try: + parse_uri_raise_if_failed("eml//") + raise AssertionError("Expected NotUriError to be raised") + except NotUriError: + pass + + try: + parse_uri_raise_if_failed("a random text") + raise AssertionError("Expected NotUriError to be raised") + except NotUriError: + pass def test_uri_default_dataspace(): @@ -111,6 +121,18 @@ def test_uri_full(): assert uri == str(parse_uri(uri)) +def test_uri_content_type(): + uri = parse_uri( + "eml:///witsml20.Well(uuid=ec8c3f16-1454-4f36-ae10-27d2a2680cf2,version='1.0')/witsml20.Wellbore?query" + ) + assert uri.get_content_type() == "application/x-witsml+xml;version=2.0;type=Well" + + uri = parse_uri( + "eml:///resqml20.obj_HorizonInterpretation(uuid=421a7a05-033a-450d-bcef-051352023578,version='2.0')" + ) + assert uri.get_content_type() == "application/x-resqml+xml;version=2.0;type=obj_HorizonInterpretation" + + def test_uuid(): uri = parse_uri( "eml:///witsml20.Well(uuid=ec8c3f16-1454-4f36-ae10-27d2a2680cf2,version='1.0')/witsml20.Wellbore?query" diff --git a/energyml-utils/tests/test_xml.py b/energyml-utils/tests/test_xml.py 
index bfd3309..769fc50 100644 --- a/energyml-utils/tests/test_xml.py +++ b/energyml-utils/tests/test_xml.py @@ -4,7 +4,7 @@ import logging from energyml.utils.constants import parse_qualified_type -from src.energyml.utils.xml import * +from src.energyml.utils.xml_utils import * CT_20 = "application/x-resqml+xml;version=2.0;type=obj_TriangulatedSetRepresentation" CT_22 = "application/x-resqml+xml;version=2.2;type=TriangulatedSetRepresentation"