From bb1845ab0e5fae6a37da7cf8f509c2297418b2b9 Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Tue, 7 Apr 2026 11:37:45 +0200 Subject: [PATCH 01/12] new properties (license, version) from citation.cff. Fixes #935 --- src/somef/process_files.py | 30 +++++++----- src/somef/test/test_JSON_export.py | 49 ++++++++++++++++++- src/somef/test/test_codemeta_export.py | 1 + .../repositories/somef_repo/CITATION.cff | 43 ++++++++++++++++ src/somef/utils/constants.py | 2 + 5 files changed, 111 insertions(+), 14 deletions(-) create mode 100644 src/somef/test/test_data/repositories/somef_repo/CITATION.cff diff --git a/src/somef/process_files.py b/src/somef/process_files.py index 9edc6baf..f4f459f4 100644 --- a/src/somef/process_files.py +++ b/src/somef/process_files.py @@ -267,6 +267,7 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner filename.lower() == "pyproject.toml" or filename.lower() == "setup.py" or filename.endswith(".gemspec") or \ filename.lower() == "requirements.txt" or filename.lower() == "bower.json" or filename == "DESCRIPTION" or \ (filename.lower() == "environment.yml" or filename.lower() == "environment.yaml") or \ + (filename.lower() == ".zenodo.json") or \ (filename.lower() == "cargo.toml" and repo_relative_path == ".") or (filename.lower() == "composer.json" and repo_relative_path == ".") or \ (filename == "Project.toml" or (filename.lower()== "publiccode.yml" or filename.lower()== "publiccode.yaml") and repo_relative_path == "."): if filename.lower() in parsed_build_files and repo_relative_path != ".": @@ -309,9 +310,9 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner if filename.lower() == "publiccode.yml" or filename.lower() == "publiccode.yaml": metadata_result = parse_publiccode_file(os.path.join(dir_path, filename), metadata_result, build_file_url) if filename.lower() == "environment.yml" or filename.lower() == "environment.yaml": - print("Processing conda environment file...") metadata_result = parse_conda_environment_file(os.path.join(dir_path, filename), metadata_result, build_file_url) - + # if filename.lower() == ".zenodo": + # metadata_result = parse_zenodo_file(os.path.join(dir_path, filename), metadata_result, build_file_url) parsed_build_files.add(filename.lower()) # if repo_type == constants.RepositoryType.GITLAB: @@ -567,6 +568,8 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul yaml_content = yaml.safe_load(file_text) preferred_citation = yaml_content.get("preferred-citation", {}) doi = yaml_content.get("doi") or preferred_citation.get("doi") + license_citation = yaml_content.get(constants.PROP_LICENSE) + version_citation = yaml_content.get(constants.PROP_VERSION) identifiers = yaml_content.get("identifiers", []) url_citation = preferred_citation.get("url") or yaml_content.get("url") @@ -576,7 +579,7 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul identifier_url = next((id["value"] for id in identifiers if id["type"] == "url"), None) identifier_doi = next((id["value"] for id in identifiers if id["type"] == "doi"), None) - title = yaml_content.get("title") or preferred_citation.get("title", None) + title = yaml_content.get(constants.PROP_TITLE) or preferred_citation.get(constants.PROP_TITLE, None) authors = yaml_content.get("authors", []) if identifier_doi: @@ -594,15 +597,15 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul for author in authors: family_name = author.get("family-names") given_name = author.get("given-names") - orcid = author.get("orcid") - name = author.get("name") + orcid = author.get("orcid") + name = author.get(constants.PROP_NAME) if family_name and given_name: author_entry = { - "type": "Agent", - "name": f"{given_name} {family_name}", - "family_name": family_name, - "given_name": given_name + constants.PROP_TYPE: "Agent", + constants.PROP_NAME: f"{given_name} {family_name}", + constants.PROP_FAMILY_NAME: family_name, + constants.PROP_GIVEN_NAME: given_name } if orcid: if not orcid.startswith("http"): # check if is a url @@ -613,8 +616,8 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul # it could be not enough acurate author_entry = { - "type": "Agent", - "name": name + constants.PROP_TYPE: "Agent", + constants.PROP_NAME: name } author_list.append({k: v for k, v in author_entry.items() if v is not None}) @@ -627,7 +630,10 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul result[constants.PROP_URL] = final_url if doi: result[constants.PROP_DOI] = doi - + if license_citation: + result[constants.PROP_LICENSE] = license_citation + if version_citation: + result[constants.PROP_VERSION] = version_citation if format_result != "": result[constants.PROP_FORMAT] = format_result diff --git a/src/somef/test/test_JSON_export.py b/src/somef/test/test_JSON_export.py index afaf5b14..cc5b1864 100644 --- a/src/somef/test/test_JSON_export.py +++ b/src/somef/test/test_JSON_export.py @@ -156,7 +156,7 @@ def test_issue_629(self): "doi" in entry.get("result", {}) and "title" in entry.get("result", {}) for entry in citation - ), "Citation.cff must have doi and title" + ), "Citation.cff must have doi and title in the result" # os.remove(test_data_path + "test_issue_629.json") @@ -528,7 +528,7 @@ def test_unify_json(self): json_content = json.load(f) requirements = json_content.get(constants.CAT_REQUIREMENTS, []) - print(json.dumps(requirements, indent=2)) + # print(json.dumps(requirements, indent=2)) unified_reqs = [ r for r in requirements if "You will need Java 1.8" in r["result"].get("value", "") ] assert unified_reqs, "There should be at least one unified Java requirement entry" @@ -587,6 +587,51 @@ def test_unify_json_2(self): os.remove(test_data_path + "test_somef_unify.json") + + def test_new_properties_citation_issue_935(self): + """ + Checks that duplicated requirement entries extracted by different techniques + are unified into a single item, preserving all complementary information + (techniques, sources, and result fields). + """ + + output_path = test_data_path + 'test_new_properties_citation_issue_935.json' + + somef_cli.run_cli( threshold=0.8, + local_repo=test_data_repositories + "somef_repo", + doc_src=None, + in_file=None, + output=output_path, + graph_out=None, + graph_format="turtle", + codemeta_out=None, + pretty=True, + missing=False, + readme_only=False) + + + with open(output_path, "r") as f: + json_content = json.load(f) + + citation = json_content.get(constants.CAT_CITATION, []) + + cff_entry = next( + (entry for entry in citation if entry["result"].get("format") == "cff"), + None + ) + + assert cff_entry is not None + + result = cff_entry["result"] + + assert "doi" in result + assert "title" in result + assert result["license"] == "Apache-2.0" + assert result["version"] == "0.1.0" + + os.remove(test_data_path + "test_new_properties_citation_issue_935.json") + + @unittest.skipIf(os.getenv("CI") == "true", "Skipped in CI because it is already verified locally") def test_issue_gitlab_enrich_authors(self): """Tests if a gitlab repository with codeowners file gets enriched with the information of the users in the codeowners file. diff --git a/src/somef/test/test_codemeta_export.py b/src/somef/test/test_codemeta_export.py index 84d1a7f6..b4d91d7a 100644 --- a/src/somef/test/test_codemeta_export.py +++ b/src/somef/test/test_codemeta_export.py @@ -611,6 +611,7 @@ def test_issue_886_apache_code(self): json_content = json.loads(data) copyright_holder = json_content[constants.CAT_CODEMETA_COPYRIGHTHOLDER] + print(copyright_holder) copyright_year = json_content[constants.CAT_CODEMETA_COPYRIGHTYEAR] assert copyright_holder == "Daniel Garijo, Information Sciences Institute, USC." diff --git a/src/somef/test/test_data/repositories/somef_repo/CITATION.cff b/src/somef/test/test_data/repositories/somef_repo/CITATION.cff new file mode 100644 index 00000000..ee3ba814 --- /dev/null +++ b/src/somef/test/test_data/repositories/somef_repo/CITATION.cff @@ -0,0 +1,43 @@ +# This CITATION.cff file was generated with cffinit. +# Visit https://bit.ly/cffinit to generate yours today! + +cff-version: 1.2.0 +title: 'SOMEF: Software metadata extraction framework' +message: >- + If you use this software, please cite both the article + from preferred-citation and the software itself. +type: software +authors: + - family-names: Garijo + given-names: Daniel + orcid: 'https://orcid.org/0000-0003-0454-7145' + - family-names: Mao + given-names: Allen + - family-names: Dharmala + given-names: Haripriya + - family-names: Diwanji + given-names: Cedant + - family-names: Wang + given-names: Jiajing + - family-names: Kelley + given-names: Aidan + - family-names: García + given-names: Miguel Angel + - family-names: Ciuciu-Kiss + given-names: Jenifer + - family-names: Mendoza + given-names: Juanje +license: Apache-2.0 +version: 0.1.0 +preferred-citation: + authors: + - family-names: Kelley + given-names: Aidan + - family-names: Garijo + given-names: Daniel + title: A Framework for Creating Knowledge Graphs of Scientific Software Metadata + type: article + journal: Quantitative Science Studies + pages: 1-37 + year: 2021 + doi: 10.1162/qss_a_00167 \ No newline at end of file diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index 4d2c476e..7c3a208f 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -230,9 +230,11 @@ PROP_DEPENDENCY_RESOLVER = "dependency_resolver" PROP_EMAIL = "email" PROP_GIVEN_NAME = "given_name" +PROP_FAMILY_NAME = "family_name" PROP_HTML_URL = "html_url" PROP_IDENTIFIER = "identifier" PROP_LAST_NAME = "last_name" +PROP_LICENSE = "license" PROP_NAME = "name" PROP_ORIGINAL_HEADER = "original_header" PROP_PARENT_HEADER = "parent_header" From f76adad42efcbb506070bf338d07d59486867b8c Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Fri, 10 Apr 2026 09:19:56 +0200 Subject: [PATCH 02/12] Improve extraction and structure of citation and reference publications --- README.md | 16 +- docs/index.md | 18 +- docs/output.md | 6 +- src/somef/export/json_export.py | 234 +++++++++++----------- src/somef/process_files.py | 216 +++++++++++++------- src/somef/regular_expressions.py | 23 ++- src/somef/test/test_JSON_export.py | 31 ++- src/somef/test/test_process_repository.py | 6 +- src/somef/utils/constants.py | 3 + 9 files changed, 345 insertions(+), 208 deletions(-) diff --git a/README.md b/README.md index 73f323ff..01bedd13 100644 --- a/README.md +++ b/README.md @@ -24,12 +24,16 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo - Email: email of author - URL: website or ORCID associated with the author - **Build file**: Build file(s) of the project. For example, files used to create a Docker image for the target software, package files, etc. -- **Citation**: Preferred citation as the authors have stated in their readme file. SOMEF recognizes Bibtex, Citation File Format files and other means by which authors cite their papers (e.g., by in-text citation). We aim to recognize the following properties: +- **Citation**: Preferred citation as the authors have stated in their readme file. SOMEF recognizes Bibtex, Citation File Format files and other means by which authors cite their papers (e.g., by in-text citation). +SOMEF now generates two separate entries: one for the software and another for the preferred citation which corresponds to the reference_publication category. This ensures metadata like DOI or version is correctly assigned to each entity. +We aim to recognize the following properties: - Title: Title of the publication - Author: list of author names in the publication - URL: URL of the publication - DOI: Digital object identifier of the publication - Date published + - License: Software license (if applicable) + - Version: Software version (if applicable) - **Code of conduct**: Link to the code of conduct of the project - **Code repository**: Link to the GitHub/GitLab repository used for the extraction - **Contact**: Contact person responsible for maintaining a software component @@ -63,6 +67,16 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo - **Package distribution**: Links to package sites like pypi in case the repository has a package available. - **Package files**: Links to package files used to wrap the project in a package. - **Programming languages**: Languages used in the repository +- **Reference publication**: Scholarly works (e.g., articles, books) associated with the software that should be cited, either instead of or in addition to the software itself. These references can be extracted from multiple sources, such as CITATION.cff files (e.g., `preferred-citation`), BibTeX entries, or citation text in documentation (e.g., README files). +We aim to recognize the following properties across these categories: + - Title: Title of the publication + - Author: list of author names in the publication + - URL: URL of the publication + - DOI: Digital object identifier of the publication + - Date published + - Journal: Journal name where the paper was published + - Year: Year of publication + - Pages: Page range in the journal - **Related papers**: URL to possible related papers within the repository stated within the readme file (from Arxiv) - **Releases** (GitHub only): Pointer to the available versions of a software component. For each release, somef will track the following properties: - Description: Release notes diff --git a/docs/index.md b/docs/index.md index ac13d86e..5bf07268 100644 --- a/docs/index.md +++ b/docs/index.md @@ -31,12 +31,16 @@ Given a readme file (or a GitHub repository) SOMEF will extract the following ca - URL: website or ORCID associated with the author - Affiliation: name of organization or affiliation - **Build file**: Build file(s) of the project. For example, files used to create a Docker image for the target software, package files, etc. -- **Citation**: Preferred citation as the authors have stated in their readme file. SOMEF recognizes Bibtex, Citation File Format files and other means by which authors cite their papers (e.g., by in-text citation). We aim to recognize the following properties: +- **Citation**: Preferred citation as the authors have stated in their readme file. SOMEF recognizes Bibtex, Citation File Format files and other means by which authors cite their papers (e.g., by in-text citation). +SOMEF now generates two separate entries: one for the software and another for the preferred citation which corresponds to the reference_publication category. This ensures metadata like DOI or version is correctly assigned to each entity. +We aim to recognize the following properties: - Title: Title of the publication - Author: list of author names in the publication - URL: URL of the publication - DOI: Digital object identifier of the publication - - Date published: + - Date published + - License: Software license (if applicable) + - Version: Software version (if applicable) - **Code of conduct**: Link to the code of conduct of the project - **Code repository**: Link to the GitHub/GitLab repository used for the extraction - **Contact**: Contact person responsible for maintaining a software component @@ -70,6 +74,16 @@ Given a readme file (or a GitHub repository) SOMEF will extract the following ca - **Package distribution**: Links to package sites like pypi in case the repository has a package available. - **Package files**: Links to package files used to wrap the project in a package. - **Programming languages**: Languages used in the repository +- **Reference publication**: Scholarly works (e.g., articles, books) associated with the software that should be cited, either instead of or in addition to the software itself. These references can be extracted from multiple sources, such as CITATION.cff files (e.g., `preferred-citation`), BibTeX entries, or citation text in documentation (e.g., README files). +We aim to recognize the following properties across these categories: + - Title: Title of the publication + - Author: list of author names in the publication + - URL: URL of the publication + - DOI: Digital object identifier of the publication + - Date published + - Journal: Journal name where the paper was published + - Year: Year of publication + - Pages: Page range in the journal - **Related papers**: URL to possible related papers within the repository stated within the readme file (from Arxiv) - **Releases** (GitHub and Gitlab): Pointer to the available versions of a software component. For each release, somef will track the following properties: - Assets: files attached to the release diff --git a/docs/output.md b/docs/output.md index 30831499..ded5f641 100644 --- a/docs/output.md +++ b/docs/output.md @@ -119,7 +119,7 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `usage`: Usage examples and considerations of a code repository. - `workflows`: URL and path to the computational workflow files present in the repository. - `homepage`: URL to the homepage of the software or organization. -- `reference_publication`: URL to the paper associated with the code repository. +- `reference_publication`: Scholarly publications associated with the repository (e.g., articles, books). Each entry may include structured metadata such as title, authors, DOI, URL, journal, publication date, and pagination. - `package_id`: Identifier extracted from packages. (e.g., `packages.json`) - `funding`: Funding code for the related project. - `has_package_file`: Specifies what package file is present in the code repository. @@ -308,6 +308,10 @@ A SCHOLARLY_ARTICLE has the following properties: | Property | Expected value | Definition | |---|---|---| | **title** | String | Title of reference or citation | +| **authors** | List | List of authors with structured information (name, given_name, family_name) | +| **journal** | String | Journal where the publication appeared | +| **year** | Number | Year of publication | +| **pages** | String | Page range of the publication | | **value** | String | Title of reference or citation | | **url** | String | Link to reference or citation | | **date_published** | String | date of publication reference or citation | diff --git a/src/somef/export/json_export.py b/src/somef/export/json_export.py index be1d66b1..6bfbea67 100644 --- a/src/somef/export/json_export.py +++ b/src/somef/export/json_export.py @@ -391,124 +391,129 @@ def format_date(date_string): # codemeta_output[constants.CAT_CODEMETA_AUTHOR].append(author_l) - if constants.CAT_CITATION in repo_data: - # url_cit = [] + if constants.CAT_REFERENCE_PUBLICATION in repo_data or constants.CAT_CITATION in repo_data: codemeta_output[constants.CAT_CODEMETA_REFERENCEPUBLICATION] = [] - all_reference_publications = [] - # scholarlyArticles = {} author_orcids = {} + all_reference_publications = [] - for cit in repo_data[constants.CAT_CITATION]: - scholarlyArticle = {"@type": "ScholarlyArticle"} - - doi = None - title = None - is_bibtex = False - - if constants.PROP_FORMAT in cit[constants.PROP_RESULT] and cit[constants.PROP_RESULT][constants.PROP_FORMAT] == "cff": - yaml_content = yaml.safe_load(cit[constants.PROP_RESULT]["value"]) - preferred_citation = yaml_content.get("preferred-citation", {}) - doi = yaml_content.get("doi") or preferred_citation.get("doi") - identifiers = yaml_content.get("identifiers", []) - url_citation = preferred_citation.get("url") or yaml_content.get("url") - identifier_url = next((id["value"] for id in identifiers if id["type"] == "url"), None) - identifier_doi = next((id["value"] for id in identifiers if id["type"] == "doi"), None) - - authors = yaml_content.get("authors", []) - - title = normalize_title(preferred_citation.get("title") or yaml_content.get("title")) - - if identifier_doi: - final_url = f"https://doi.org/{identifier_doi}" - elif doi: - final_url = f"https://doi.org/{doi}" - elif identifier_url: - final_url = identifier_url - elif url_citation: - final_url = url_citation - else: - final_url = '' - - scholarlyArticle[constants.PROP_NAME] = title - scholarlyArticle[constants.CAT_IDENTIFIER] = doi - scholarlyArticle[constants.PROP_URL] = final_url - - author_list = [] - for author in authors: - family_name = author.get("family-names") - given_name = author.get("given-names") - orcid = author.get("orcid") - name = author.get("name") - - if family_name and given_name: - author_entry = { - "@type": "Person", - "familyName": family_name, - "givenName": given_name - } - if orcid: - if not orcid.startswith("http"): # check if orcid is a url - orcid = f"https://orcid.org/{orcid}" - author_entry["@id"] = orcid - elif name: - # If there is only a name, we assume this to be an Organization. - # it could be not enough acurate - - author_entry = { - "@type": "Organization", - "name": name - } - - if family_name and given_name and orcid: - key = (family_name.lower(), given_name.lower()) - author_orcids[key] = orcid - - author_list.append({k: v for k, v in author_entry.items() if v is not None}) - - if author_list: - scholarlyArticle[constants.PROP_AUTHOR] = author_list - else: - if constants.PROP_DOI in cit[constants.PROP_RESULT].keys(): - doi = cit[constants.PROP_RESULT][constants.PROP_DOI] - scholarlyArticle[constants.CAT_IDENTIFIER] = cit[constants.PROP_RESULT][constants.PROP_DOI] - - if constants.PROP_URL in cit[constants.PROP_RESULT].keys(): - scholarlyArticle[constants.PROP_URL] = cit[constants.PROP_RESULT][constants.PROP_URL] - - if constants.PROP_TITLE in cit[constants.PROP_RESULT].keys(): - title = normalize_title(cit[constants.PROP_RESULT][constants.PROP_TITLE]) - scholarlyArticle[constants.PROP_NAME] = cit[constants.PROP_RESULT][constants.PROP_TITLE] - - if constants.PROP_ORIGINAL_HEADER in cit[constants.PROP_RESULT].keys(): - if cit[constants.PROP_RESULT][constants.PROP_ORIGINAL_HEADER] == "Citation": - if constants.PROP_SOURCE in cit.keys(): - scholarlyArticle[constants.PROP_URL] = cit[constants.PROP_SOURCE] - - is_bibtex = True - - if len(scholarlyArticle) > 1: - # look por information in values as pagination, issn and others - if re.search(r'@\w+\{', cit[constants.PROP_RESULT][constants.PROP_VALUE]): - scholarlyArticle = extract_scholarly_article_properties(cit[constants.PROP_RESULT][constants.PROP_VALUE], scholarlyArticle, 'CODEMETA') + if constants.CAT_REFERENCE_PUBLICATION in repo_data: + publications_source = repo_data[constants.CAT_REFERENCE_PUBLICATION] + elif constants.CAT_CITATION in repo_data: + publications_source = repo_data[constants.CAT_CITATION] + else: + publications_source = [] + + if publications_source: + # for cit in repo_data[constants.CAT_REFERENCE_PUBLICATION]: + for cit in publications_source: + scholarlyArticle = {"@type": "ScholarlyArticle"} + + doi = None + title = None + is_bibtex = False + + if constants.PROP_FORMAT in cit[constants.PROP_RESULT] and cit[constants.PROP_RESULT][constants.PROP_FORMAT] == "cff": + yaml_content = yaml.safe_load(cit[constants.PROP_RESULT]["value"]) + preferred_citation = yaml_content.get("preferred-citation", {}) + doi = yaml_content.get("doi") or preferred_citation.get("doi") + identifiers = yaml_content.get("identifiers", []) + url_citation = preferred_citation.get("url") or yaml_content.get("url") + identifier_url = next((id["value"] for id in identifiers if id["type"] == "url"), None) + identifier_doi = next((id["value"] for id in identifiers if id["type"] == "doi"), None) + authors = yaml_content.get("authors", []) or preferred_citation.get("authors", []) + title = normalize_title(preferred_citation.get("title") or yaml_content.get("title")) + + if identifier_doi: + final_url = f"https://doi.org/{identifier_doi}" + elif doi: + final_url = f"https://doi.org/{doi}" + elif identifier_url: + final_url = identifier_url + elif url_citation: + final_url = url_citation + else: + final_url = '' + + scholarlyArticle[constants.PROP_NAME] = title + scholarlyArticle[constants.CAT_IDENTIFIER] = doi + scholarlyArticle[constants.PROP_URL] = final_url + + author_list = [] + for author in authors: + family_name = author.get("family-names") + given_name = author.get("given-names") + orcid = author.get("orcid") + name = author.get("name") + + if family_name and given_name: + author_entry = { + "@type": "Person", + "familyName": family_name, + "givenName": given_name + } + if orcid: + if not orcid.startswith("http"): # check if orcid is a url + orcid = f"https://orcid.org/{orcid}" + author_entry["@id"] = orcid + elif name: + # If there is only a name, we assume this to be an Organization. + # it could be not enough acurate + + author_entry = { + "@type": "Organization", + "name": name + } + + if family_name and given_name and orcid: + key = (family_name.lower(), given_name.lower()) + author_orcids[key] = orcid + + author_list.append({k: v for k, v in author_entry.items() if v is not None}) + + if author_list: + scholarlyArticle[constants.PROP_AUTHOR] = author_list else: - scholarlyArticle = extract_scholarly_article_natural(cit[constants.PROP_RESULT][constants.PROP_VALUE], scholarlyArticle, 'CODEMETA') - - all_reference_publications.append({ - **scholarlyArticle, - "_source_format": "cff" if not is_bibtex else "bibtex" - }) - - for article in all_reference_publications: - if "author" in article: - for author in article["author"]: - family_name = author.get("familyName", "").strip() - given_name = author.get("givenName", "").strip() - key = (family_name.lower(), given_name.lower()) if given_name else None - - if key and key in author_orcids: - author["@id"] = author_orcids[key] - - codemeta_output[constants.CAT_CODEMETA_REFERENCEPUBLICATION] = deduplicate_publications(all_reference_publications) + if constants.PROP_DOI in cit[constants.PROP_RESULT].keys(): + doi = cit[constants.PROP_RESULT][constants.PROP_DOI] + scholarlyArticle[constants.CAT_IDENTIFIER] = cit[constants.PROP_RESULT][constants.PROP_DOI] + + if constants.PROP_URL in cit[constants.PROP_RESULT].keys(): + scholarlyArticle[constants.PROP_URL] = cit[constants.PROP_RESULT][constants.PROP_URL] + + if constants.PROP_TITLE in cit[constants.PROP_RESULT].keys(): + title = normalize_title(cit[constants.PROP_RESULT][constants.PROP_TITLE]) + scholarlyArticle[constants.PROP_NAME] = cit[constants.PROP_RESULT][constants.PROP_TITLE] + + if constants.PROP_ORIGINAL_HEADER in cit[constants.PROP_RESULT].keys(): + if cit[constants.PROP_RESULT][constants.PROP_ORIGINAL_HEADER] == "Citation": + if constants.PROP_SOURCE in cit.keys(): + scholarlyArticle[constants.PROP_URL] = cit[constants.PROP_SOURCE] + + is_bibtex = True + + if len(scholarlyArticle) > 1: + # look por information in values as pagination, issn and others + if re.search(r'@\w+\{', cit[constants.PROP_RESULT][constants.PROP_VALUE]): + scholarlyArticle = extract_scholarly_article_properties(cit[constants.PROP_RESULT][constants.PROP_VALUE], scholarlyArticle, 'CODEMETA') + else: + scholarlyArticle = extract_scholarly_article_natural(cit[constants.PROP_RESULT][constants.PROP_VALUE], scholarlyArticle, 'CODEMETA') + + all_reference_publications.append({ + **scholarlyArticle, + "_source_format": "cff" if not is_bibtex else "bibtex" + }) + + for article in all_reference_publications: + if "author" in article: + for author in article["author"]: + family_name = author.get("familyName", "").strip() + given_name = author.get("givenName", "").strip() + key = (family_name.lower(), given_name.lower()) if given_name else None + + if key and key in author_orcids: + author["@id"] = author_orcids[key] + + codemeta_output[constants.CAT_CODEMETA_REFERENCEPUBLICATION] = deduplicate_publications(all_reference_publications) if constants.CAT_STATUS in repo_data: url_status = repo_data[constants.CAT_STATUS][0]['result'].get('value', '') @@ -840,7 +845,6 @@ def unify_results(repo_data: dict) -> dict: This function canonicalizes simple values, detects equivalent items and merges them into a single unified entry while preserving all available information. """ - print("Unifying results...") unified_data = {} for category, items in repo_data.items(): diff --git a/src/somef/process_files.py b/src/somef/process_files.py index f4f459f4..62c08b99 100644 --- a/src/somef/process_files.py +++ b/src/somef/process_files.py @@ -371,8 +371,10 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner if 'citation' in metadata_result.results: for cit in metadata_result.results['citation']: - scholarly_article = {} result = cit.get(constants.PROP_RESULT, {}) + + scholarly_article = {} + # result = cit.get(constants.PROP_RESULT, {}) value = result.get(constants.PROP_VALUE, '') if re.search(r'@\w+\{', value): scholarly_article = extract_scholarly_article_properties(value, scholarly_article, 'JSON') @@ -565,75 +567,39 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul ) # Properties extraction from cff if format_result == 'cff': - yaml_content = yaml.safe_load(file_text) - preferred_citation = yaml_content.get("preferred-citation", {}) - doi = yaml_content.get("doi") or preferred_citation.get("doi") - license_citation = yaml_content.get(constants.PROP_LICENSE) - version_citation = yaml_content.get(constants.PROP_VERSION) - identifiers = yaml_content.get("identifiers", []) - url_citation = preferred_citation.get("url") or yaml_content.get("url") - - if identifiers: - result[constants.CAT_IDENTIFIER] = identifiers - - identifier_url = next((id["value"] for id in identifiers if id["type"] == "url"), None) - identifier_doi = next((id["value"] for id in identifiers if id["type"] == "doi"), None) - - title = yaml_content.get(constants.PROP_TITLE) or preferred_citation.get(constants.PROP_TITLE, None) - authors = yaml_content.get("authors", []) - - if identifier_doi: - final_url = f"https://doi.org/{identifier_doi}" - elif doi: - final_url = f"https://doi.org/{doi}" - elif identifier_url: - final_url = identifier_url - elif url_citation: - final_url = url_citation - else: - final_url = '' - - author_list = [] - for author in authors: - family_name = author.get("family-names") - given_name = author.get("given-names") - orcid = author.get("orcid") - name = author.get(constants.PROP_NAME) - - if family_name and given_name: - author_entry = { - constants.PROP_TYPE: "Agent", - constants.PROP_NAME: f"{given_name} {family_name}", - constants.PROP_FAMILY_NAME: family_name, - constants.PROP_GIVEN_NAME: given_name - } - if orcid: - if not orcid.startswith("http"): # check if is a url - orcid = f"https://orcid.org/{orcid}" - author_entry["url"] = orcid - elif name: - # If there is only a name, we assume this to be an Organization. - # it could be not enough acurate - - author_entry = { - constants.PROP_TYPE: "Agent", - constants.PROP_NAME: name - } - - author_list.append({k: v for k, v in author_entry.items() if v is not None}) - - if author_list: - result[constants.PROP_AUTHOR] = author_list - if title: - result[constants.PROP_TITLE] = title - if final_url: - result[constants.PROP_URL] = final_url - if doi: - result[constants.PROP_DOI] = doi - if license_citation: - result[constants.PROP_LICENSE] = license_citation - if version_citation: - result[constants.PROP_VERSION] = version_citation + try: + yaml_content = yaml.safe_load(file_text) + except Exception: + yaml_content = None + + if yaml_content: + license_value = yaml_content.get("license") + logging.info(f"Extracted license value from CFF: {license_value}") + if license_value: + if isinstance(license_value, list): + license_value = license_value[0] + parse_license_cff(license_value, metadata_result, url) + + root_result = parse_cff_root(yaml_content, metadata_result,url) + root_result[constants.PROP_VALUE] = file_text + # root_result[constants.PROP_TYPE] = constants.FILE_DUMP + metadata_result.add_result( + category, root_result, 1, + constants.TECHNIQUE_FILE_EXPLORATION, url + ) + + pref = yaml_content.get("preferred-citation") + if pref: + pref_result = parse_cff_preferred(pref) + pref_result[constants.PROP_VALUE] = yaml.dump({"preferred-citation": pref}, default_flow_style=False) + # pref_result[constants.PROP_TYPE] = constants.FILE_DUMP + metadata_result.add_result( + constants.CAT_REFERENCE_PUBLICATION, pref_result, 1, + constants.TECHNIQUE_FILE_EXPLORATION, url + ) + + return metadata_result + if format_result != "": result[constants.PROP_FORMAT] = format_result @@ -641,7 +607,8 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul metadata_result.edit_hierarchical_result(category, result, 1, constants.TECHNIQUE_FILE_EXPLORATION, url) else: metadata_result.add_result(category, result, 1, constants.TECHNIQUE_FILE_EXPLORATION, url) - except: + except Exception as e: + logging.error(f"Error occurred while processing file {url}: {e}") if replace: metadata_result.edit_hierarchical_result(category, { @@ -715,3 +682,110 @@ def clean_text(text): cleaned_lines.append(line) return "\n".join(cleaned_lines) + +def parse_authors_citation(author_list): + authors = [] + for author in author_list: + family = author.get("family-names") + given = author.get("given-names") + orcid = author.get("orcid") + name = author.get(constants.PROP_NAME) + + if family and given: + entry = { + constants.PROP_TYPE: "Agent", + constants.PROP_NAME: f"{given} {family}", + constants.PROP_FAMILY_NAME: family, + constants.PROP_GIVEN_NAME: given + } + if orcid: + if not orcid.startswith("http"): + orcid = f"https://orcid.org/{orcid}" + entry[constants.PROP_URL] = orcid + elif name: + entry = { + constants.PROP_TYPE: "Agent", + constants.PROP_NAME: name + } + else: + continue + + authors.append(entry) + + return authors + + +def parse_cff_root(yaml_content, metadata_result, url): + result = {} + + result[constants.PROP_TITLE] = yaml_content.get("title") + result["authors"] = parse_authors_citation(yaml_content.get("authors", [])) + result[constants.PROP_VERSION] = yaml_content.get("version") + result[constants.PROP_DOI] = yaml_content.get("doi") + result[constants.PROP_URL] = yaml_content.get("url") + result[constants.PROP_TYPE] = constants.SOFTWARE_APPLICATION + # cff_type = yaml_content.get("type") + # result[constants.PROP_TYPE] = cff_type if cff_type else constants.FILE_DUMP + + identifiers = yaml_content.get("identifiers", []) + if identifiers: + result[constants.PROP_IDENTIFIER] = identifiers + + # result[constants.PROP_PREFERRED] = "False" + result[constants.PROP_FORMAT] = "cff" + + return clean_nulls(result) + +def parse_cff_preferred(pref): + result = {} + + result[constants.PROP_TITLE] = pref.get("title") + result["authors"] = parse_authors_citation(pref.get("authors", [])) + result[constants.PROP_DOI] = pref.get("doi") + result[constants.PROP_URL] = pref.get("url") + result[constants.PROP_JOURNAL] = pref.get("journal") + result[constants.PROP_YEAR] = pref.get("year") + result[constants.PROP_PAGES] = pref.get("pages") + result[constants.PROP_TYPE] = constants.SCHOLARLY_ARTICLE + # cff_type = pref.get("type") + # result[constants.PROP_TYPE] = cff_type if cff_type else constants.FILE_DUMP + + # result[constants.PROP_PREFERRED] = "True" + result[constants.PROP_FORMAT] = "cff" + + return clean_nulls(result) + +def clean_nulls(d: dict) -> dict: + return {k: v for k, v in d.items() if v not in (None, "")} + +def parse_license_cff(license_value, metadata_result, url): + + try: + license_info = detect_license_spdx(license_value, 'JSON') + + license_result = { + constants.PROP_VALUE: license_value, + constants.PROP_TYPE: constants.FILE_DUMP + } + + if license_info: + license_result[constants.PROP_NAME] = license_info['name'] + license_result[constants.PROP_SPDX_ID] = license_info['spdx_id'] + + license_result[constants.PROP_URL] = license_info.get("@id") + else: + license_result[constants.PROP_NAME] = license_value + + + metadata_result.add_result( + constants.CAT_LICENSE, + license_result, + 1, + constants.TECHNIQUE_FILE_EXPLORATION, + url + ) + except Exception as e: + logging.error(f"Error parsing license from CFF: {str(e)}") + + + diff --git a/src/somef/regular_expressions.py b/src/somef/regular_expressions.py index 5d5e6538..286705ff 100644 --- a/src/somef/regular_expressions.py +++ b/src/somef/regular_expressions.py @@ -1062,7 +1062,8 @@ def extract_scholarly_article_properties(bibtex_entry, scholarlyArticle, type): year_match = re.search(constants.REGEXP_YEAR, bibtex_entry) month_match = re.search(constants.REGEXP_MONTH, bibtex_entry) pages_match = re.search(constants.REGEXP_PAGES, bibtex_entry) - author_match = re.search(r'author\s*=\s*\{([^}]+)\}', bibtex_entry) + # author_match = re.search(r'author\s*=\s*\{([^}]+)\}', bibtex_entry) + author_match = re.search(r'author\s*=\s*\{(.+?)\}\s*,', bibtex_entry) orcid_match = re.search(r'orcid\s*=\s*\{([^}]+)\}', bibtex_entry) # Look for ORCID explícit note_orcid_match = re.search(r'ORCID[:\s]*([\d-]+X?)', bibtex_entry) # Look in notes @@ -1087,12 +1088,24 @@ def extract_scholarly_article_properties(bibtex_entry, scholarlyArticle, type): authors = author_match.group(1).split(" and ") # Split several authors for author in authors: - parts = author.split(", ") - if len(parts) == 2: + # parts = author.split(", ") + # if len(parts) == 2: + # family_name, given_name = parts + # else: + # family_name = author + # given_name = None + match_author = re.match(r'(.+?)\s*\{(.+?)\}', author) + + if match_author: + given_name = match_author.group(1).strip() + family_name = match_author.group(2).strip() + elif "," in author: + parts = [p.strip() for p in author.split(",", 1)] family_name, given_name = parts else: - family_name = author - given_name = None + parts = author.split() + family_name = parts[-1] + given_name = " ".join(parts[:-1]) if len(parts) > 1 else None if type == 'JSON': author_entry = { diff --git a/src/somef/test/test_JSON_export.py b/src/somef/test/test_JSON_export.py index cc5b1864..d7dcf464 100644 --- a/src/somef/test/test_JSON_export.py +++ b/src/somef/test/test_JSON_export.py @@ -613,21 +613,30 @@ def test_new_properties_citation_issue_935(self): with open(output_path, "r") as f: json_content = json.load(f) - citation = json_content.get(constants.CAT_CITATION, []) + citations = json_content.get(constants.CAT_CITATION, []) + referencePublications = json_content.get(constants.CAT_REFERENCE_PUBLICATION, []) - cff_entry = next( - (entry for entry in citation if entry["result"].get("format") == "cff"), + software_entry = next( + (cit for cit in citations if cit["result"]["title"] == 'SOMEF: Software metadata extraction framework'), + None + ) + preferred_entry = next( + (ref for ref in referencePublications if ref["result"]["title"] == "A Framework for Creating Knowledge Graphs of Scientific Software Metadata"), None ) - assert cff_entry is not None - - result = cff_entry["result"] - - assert "doi" in result - assert "title" in result - assert result["license"] == "Apache-2.0" - assert result["version"] == "0.1.0" + assert software_entry is not None, "Software citation (root) not found" + sw_result = software_entry["result"] + assert sw_result["title"] == 'SOMEF: Software metadata extraction framework' + assert sw_result["version"] == "0.1.0" + assert "doi" not in sw_result or sw_result.get("doi") is None # it is in preferred (referencePublication) but not in the root + + assert preferred_entry is not None, "Preferred citation (article) not found" + pref_result = preferred_entry["result"] + assert pref_result["title"] == "A Framework for Creating Knowledge Graphs of Scientific Software Metadata" + assert pref_result["doi"] == "10.1162/qss_a_00167" + assert pref_result["journal"] == "Quantitative Science Studies" + assert "version" not in pref_result # it is in the root in citation but not in the preferred (referencePublication) os.remove(test_data_path + "test_new_properties_citation_issue_935.json") diff --git a/src/somef/test/test_process_repository.py b/src/somef/test/test_process_repository.py index e6b552da..4ee8fafa 100644 --- a/src/somef/test/test_process_repository.py +++ b/src/somef/test/test_process_repository.py @@ -193,6 +193,7 @@ def test_issue_526(self): # after solving issue refernce_publication it must be 2 citations in results citation. # assert len(github_data.results[constants.CAT_CITATION]) == 1 assert len(github_data.results[constants.CAT_CITATION]) == 2 + assert len(github_data.results[constants.CAT_REFERENCE_PUBLICATION]) == 1 def test_issue_530(self): """ @@ -204,8 +205,10 @@ def test_issue_530(self): constants.RepositoryType.LOCAL) licenses = github_data.results[constants.CAT_LICENSE] citation = github_data.results[constants.CAT_CITATION] + # there are two licenses because the codemeta parser obtains one - assert len(licenses) == 2 and "LICENSE" or "codemeta" in licenses[0]["source"] and \ + # after extracting the license from citation.cff now we should have 3 + assert len(licenses) == 3 and "LICENSE" or "codemeta" in licenses[0]["source"] and \ len(citation) == 1 and "example_onto" not in citation[0]["source"] def test_issue_611(self): @@ -333,7 +336,6 @@ def test_issue_905_tag(self): assert os.path.exists(test_data_path + "test_905_tag.json") version = json_content.get(constants.CAT_VERSION, []) - print(version) source = version[0].get("source", "") assert "Widoco/v1.4.25" in source, f"The downloaded tag does not match the requested one. Source: {source}" diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index 7c3a208f..a048aee2 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -158,6 +158,7 @@ """ CAT_PROGRAMMING_LANGUAGES = "programming_languages" CAT_README_URL = "readme_url" +CAT_REFERENCE_PUBLICATION = "reference_publication" CAT_RELATED_DOCUMENTATION = "related_documentation" CAT_RELATED_PAPERS = "related_papers" CAT_RELEASES = "releases" @@ -233,10 +234,12 @@ PROP_FAMILY_NAME = "family_name" PROP_HTML_URL = "html_url" PROP_IDENTIFIER = "identifier" +PROP_JOURNAL = "journal" PROP_LAST_NAME = "last_name" PROP_LICENSE = "license" PROP_NAME = "name" PROP_ORIGINAL_HEADER = "original_header" +PROP_PAGES = "pages" PROP_PARENT_HEADER = "parent_header" PROP_RELEASE_ID = "release_id" PROP_ROLE = "role" From da0c5186d7c0ca5521a754b1455df865b3e856ad Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Fri, 10 Apr 2026 16:05:38 +0200 Subject: [PATCH 03/12] Revert reference_publication category to keep all citations in CAT_CITATION. But using is_preferred_citation. Improve test and referencePublication/authors in parser codemeta. Fixes #957 --- README.md | 18 ++-- docs/index.md | 18 ++-- docs/output.md | 2 +- src/somef/export/json_export.py | 11 +-- src/somef/parser/codemeta_parser.py | 100 ++++++++++++++-------- src/somef/process_files.py | 6 +- src/somef/test/test_JSON_export.py | 5 +- src/somef/test/test_codemeta_parser.py | 25 ++++++ src/somef/test/test_process_repository.py | 6 +- src/somef/utils/constants.py | 3 +- 10 files changed, 113 insertions(+), 81 deletions(-) diff --git a/README.md b/README.md index 01bedd13..9625b85c 100644 --- a/README.md +++ b/README.md @@ -25,15 +25,17 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo - URL: website or ORCID associated with the author - **Build file**: Build file(s) of the project. For example, files used to create a Docker image for the target software, package files, etc. - **Citation**: Preferred citation as the authors have stated in their readme file. SOMEF recognizes Bibtex, Citation File Format files and other means by which authors cite their papers (e.g., by in-text citation). -SOMEF now generates two separate entries: one for the software and another for the preferred citation which corresponds to the reference_publication category. This ensures metadata like DOI or version is correctly assigned to each entity. +For CITATION.cff files, SOMEF now generates two separate entries: one for the software (is_preferred_citation: False) and another for the preferred citation (is_preferred_citation: True). This ensures metadata like DOI or version is correctly assigned to each entity. We aim to recognize the following properties: - Title: Title of the publication - Author: list of author names in the publication - URL: URL of the publication - DOI: Digital object identifier of the publication - Date published - - License: Software license (if applicable) - Version: Software version (if applicable) + - Journal: Journal name where the paper was published + - Year: Year of publication + - Pages: Page range in the journal - **Code of conduct**: Link to the code of conduct of the project - **Code repository**: Link to the GitHub/GitLab repository used for the extraction - **Contact**: Contact person responsible for maintaining a software component @@ -58,7 +60,7 @@ We aim to recognize the following properties: - **Invocation**: Execution command(s) needed to run a scientific software component - **Issue tracker**: Link where to open issues for the target repository - **Keywords**: set of terms used to commonly identify a software component -- **License**: License and usage terms of a software component +- **License**: License and usage terms of a software component. Now we also extract license from citation.cff. - **Logo**: Main logo used to represent the target software component - **Maintainer**: Individuals or teams responsible for maintaining the software component, extracted from the CODEOWNERS file - **Name**: Name identifying a software component @@ -67,16 +69,6 @@ We aim to recognize the following properties: - **Package distribution**: Links to package sites like pypi in case the repository has a package available. - **Package files**: Links to package files used to wrap the project in a package. - **Programming languages**: Languages used in the repository -- **Reference publication**: Scholarly works (e.g., articles, books) associated with the software that should be cited, either instead of or in addition to the software itself. These references can be extracted from multiple sources, such as CITATION.cff files (e.g., `preferred-citation`), BibTeX entries, or citation text in documentation (e.g., README files). -We aim to recognize the following properties across these categories: - - Title: Title of the publication - - Author: list of author names in the publication - - URL: URL of the publication - - DOI: Digital object identifier of the publication - - Date published - - Journal: Journal name where the paper was published - - Year: Year of publication - - Pages: Page range in the journal - **Related papers**: URL to possible related papers within the repository stated within the readme file (from Arxiv) - **Releases** (GitHub only): Pointer to the available versions of a software component. For each release, somef will track the following properties: - Description: Release notes diff --git a/docs/index.md b/docs/index.md index 5bf07268..a01fa373 100644 --- a/docs/index.md +++ b/docs/index.md @@ -32,15 +32,17 @@ Given a readme file (or a GitHub repository) SOMEF will extract the following ca - Affiliation: name of organization or affiliation - **Build file**: Build file(s) of the project. For example, files used to create a Docker image for the target software, package files, etc. - **Citation**: Preferred citation as the authors have stated in their readme file. SOMEF recognizes Bibtex, Citation File Format files and other means by which authors cite their papers (e.g., by in-text citation). -SOMEF now generates two separate entries: one for the software and another for the preferred citation which corresponds to the reference_publication category. This ensures metadata like DOI or version is correctly assigned to each entity. +For CITATION.cff files, SOMEF now generates two separate entries: one for the software (is_preferred_citation: False) and another for the preferred citation (is_preferred_citation: True). This ensures metadata like DOI or version is correctly assigned to each entity. We aim to recognize the following properties: - Title: Title of the publication - Author: list of author names in the publication - URL: URL of the publication - DOI: Digital object identifier of the publication - Date published - - License: Software license (if applicable) - Version: Software version (if applicable) + - Journal: Journal name where the paper was published + - Year: Year of publication + - Pages: Page range in the journal - **Code of conduct**: Link to the code of conduct of the project - **Code repository**: Link to the GitHub/GitLab repository used for the extraction - **Contact**: Contact person responsible for maintaining a software component @@ -66,7 +68,7 @@ We aim to recognize the following properties: - **Invocation**: Execution command(s) needed to run a scientific software component - **Issue tracker**: Link where to open issues for the target repository - **Keywords**: set of terms used to commonly identify a software component -- **License**: License and usage terms of a software component +- **License**: License and usage terms of a software component. Now we also extract license from citation.cff. - **Logo**: Main logo used to represent the target software component - **Name**: Name identifying a software component - **Ontologies**: URL and path to the ontology files present in the repository @@ -74,16 +76,6 @@ We aim to recognize the following properties: - **Package distribution**: Links to package sites like pypi in case the repository has a package available. - **Package files**: Links to package files used to wrap the project in a package. - **Programming languages**: Languages used in the repository -- **Reference publication**: Scholarly works (e.g., articles, books) associated with the software that should be cited, either instead of or in addition to the software itself. These references can be extracted from multiple sources, such as CITATION.cff files (e.g., `preferred-citation`), BibTeX entries, or citation text in documentation (e.g., README files). -We aim to recognize the following properties across these categories: - - Title: Title of the publication - - Author: list of author names in the publication - - URL: URL of the publication - - DOI: Digital object identifier of the publication - - Date published - - Journal: Journal name where the paper was published - - Year: Year of publication - - Pages: Page range in the journal - **Related papers**: URL to possible related papers within the repository stated within the readme file (from Arxiv) - **Releases** (GitHub and Gitlab): Pointer to the available versions of a software component. For each release, somef will track the following properties: - Assets: files attached to the release diff --git a/docs/output.md b/docs/output.md index ded5f641..844634eb 100644 --- a/docs/output.md +++ b/docs/output.md @@ -119,7 +119,7 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `usage`: Usage examples and considerations of a code repository. - `workflows`: URL and path to the computational workflow files present in the repository. - `homepage`: URL to the homepage of the software or organization. -- `reference_publication`: Scholarly publications associated with the repository (e.g., articles, books). Each entry may include structured metadata such as title, authors, DOI, URL, journal, publication date, and pagination. +- `reference_publication`: URL to the paper associated with the code repository. - `package_id`: Identifier extracted from packages. (e.g., `packages.json`) - `funding`: Funding code for the related project. - `has_package_file`: Specifies what package file is present in the code repository. diff --git a/src/somef/export/json_export.py b/src/somef/export/json_export.py index 6bfbea67..22444425 100644 --- a/src/somef/export/json_export.py +++ b/src/somef/export/json_export.py @@ -391,20 +391,17 @@ def format_date(date_string): # codemeta_output[constants.CAT_CODEMETA_AUTHOR].append(author_l) - if constants.CAT_REFERENCE_PUBLICATION in repo_data or constants.CAT_CITATION in repo_data: + if constants.CAT_CITATION in repo_data: codemeta_output[constants.CAT_CODEMETA_REFERENCEPUBLICATION] = [] author_orcids = {} all_reference_publications = [] - if constants.CAT_REFERENCE_PUBLICATION in repo_data: - publications_source = repo_data[constants.CAT_REFERENCE_PUBLICATION] - elif constants.CAT_CITATION in repo_data: + if constants.CAT_CITATION in repo_data: publications_source = repo_data[constants.CAT_CITATION] else: publications_source = [] if publications_source: - # for cit in repo_data[constants.CAT_REFERENCE_PUBLICATION]: for cit in publications_source: scholarlyArticle = {"@type": "ScholarlyArticle"} @@ -511,8 +508,8 @@ def format_date(date_string): key = (family_name.lower(), given_name.lower()) if given_name else None if key and key in author_orcids: - author["@id"] = author_orcids[key] - + author["@id"] = author_orcids[key] + codemeta_output[constants.CAT_CODEMETA_REFERENCEPUBLICATION] = deduplicate_publications(all_reference_publications) if constants.CAT_STATUS in repo_data: diff --git a/src/somef/parser/codemeta_parser.py b/src/somef/parser/codemeta_parser.py index a3f6028e..889de956 100644 --- a/src/somef/parser/codemeta_parser.py +++ b/src/somef/parser/codemeta_parser.py @@ -486,25 +486,9 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source): for pub in ref_publications: pub_data = parse_referenced_publication(pub) - if pub_data: - - result_dict = { - "value": pub_data.get("title", ""), - "title": pub_data.get("title", ""), - "type": constants.SCHOLARLY_ARTICLE - } - - if pub_data.get("url"): - result_dict["url"] = pub_data.get("url") - - if pub_data.get("date_published"): - result_dict["date_published"] = pub_data.get("date_published") - - if pub_data.get("identifier"): - result_dict["doi"] = pub_data.get("identifier") - + result_dict = map_reference_publication(pub_data) + if result_dict: metadata_result.add_result( - # constants.CAT_REF_PUBLICATION, constants.CAT_CITATION, result_dict, 1, @@ -514,24 +498,9 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source): elif isinstance(ref_publications, dict): pub_data = parse_referenced_publication(ref_publications) - if pub_data: - result_dict = { - "value": pub_data.get("title", ""), - "title": pub_data.get("title", ""), - "type": constants.SCHOLARLY_ARTICLE - } - - if pub_data.get("url"): - result_dict["url"] = pub_data.get("url") - - if pub_data.get("date_published"): - result_dict["date_published"] = pub_data.get("date_published") - - if pub_data.get("identifier"): - result_dict["doi"] = pub_data.get("identifier") - + result_dict = map_reference_publication(pub_data) + if result_dict: metadata_result.add_result( - # constants.CAT_REF_PUBLICATION, constants.CAT_CITATION, result_dict, 1, @@ -540,7 +509,6 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source): ) else: metadata_result.add_result( - # constants.CAT_REF_PUBLICATION, constants.CAT_CITATION, { "value": data["referencePublication"], @@ -780,3 +748,63 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source): return metadata_result +def map_codemeta_author(author): + given = author.get("givenName") + family = author.get("familyName") + name = author.get("name") + + if not name and (given or family): + name = f"{given or ''} {family or ''}".strip() + + mapped = { + constants.PROP_TYPE: constants.AGENT, + constants.PROP_NAME: name, + constants.PROP_GIVEN_NAME: given, + constants.PROP_FAMILY_NAME: family + } + + identifier = author.get("identifier") or author.get("@id") + if isinstance(identifier, str) and "orcid.org" in identifier: + mapped[constants.PROP_URL] = identifier + + return {k: v for k, v in mapped.items() if v is not None} + +def map_reference_publication(pub_data): + if not pub_data: + return None + + result = { + constants.PROP_VALUE: pub_data.get("title", ""), + constants.PROP_TITLE: pub_data.get("title", ""), + constants.PROP_TYPE: constants.SCHOLARLY_ARTICLE + } + + if pub_data.get("url"): + result[constants.PROP_URL] = pub_data.get("url") + + if pub_data.get("date_published"): + result[constants.PROP_DATE_PUBLISHED] = pub_data.get("date_published") + + if pub_data.get("identifier"): + result[constants.PROP_DOI] = pub_data.get("identifier") + + authors_raw = pub_data.get("author") + + if authors_raw: + if isinstance(authors_raw, dict): + authors_iter = [authors_raw] + elif isinstance(authors_raw, list): + authors_iter = authors_raw + else: + authors_iter = [] + + mapped_authors = [ + map_codemeta_author(a) + for a in authors_iter + if isinstance(a, dict) + ] + + result["authors"] = mapped_authors + + return result + diff --git a/src/somef/process_files.py b/src/somef/process_files.py index 62c08b99..a493e123 100644 --- a/src/somef/process_files.py +++ b/src/somef/process_files.py @@ -594,7 +594,7 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul pref_result[constants.PROP_VALUE] = yaml.dump({"preferred-citation": pref}, default_flow_style=False) # pref_result[constants.PROP_TYPE] = constants.FILE_DUMP metadata_result.add_result( - constants.CAT_REFERENCE_PUBLICATION, pref_result, 1, + constants.CAT_CITATION, pref_result, 1, constants.TECHNIQUE_FILE_EXPLORATION, url ) @@ -731,7 +731,7 @@ def parse_cff_root(yaml_content, metadata_result, url): if identifiers: result[constants.PROP_IDENTIFIER] = identifiers - # result[constants.PROP_PREFERRED] = "False" + result[constants.PROP_PREFERRED_CITATION] = "False" result[constants.PROP_FORMAT] = "cff" return clean_nulls(result) @@ -750,7 +750,7 @@ def parse_cff_preferred(pref): # cff_type = pref.get("type") # result[constants.PROP_TYPE] = cff_type if cff_type else constants.FILE_DUMP - # result[constants.PROP_PREFERRED] = "True" + result[constants.PROP_PREFERRED_CITATION] = "True" result[constants.PROP_FORMAT] = "cff" return clean_nulls(result) diff --git a/src/somef/test/test_JSON_export.py b/src/somef/test/test_JSON_export.py index d7dcf464..d140bee1 100644 --- a/src/somef/test/test_JSON_export.py +++ b/src/somef/test/test_JSON_export.py @@ -614,14 +614,13 @@ def test_new_properties_citation_issue_935(self): json_content = json.load(f) citations = json_content.get(constants.CAT_CITATION, []) - referencePublications = json_content.get(constants.CAT_REFERENCE_PUBLICATION, []) software_entry = next( - (cit for cit in citations if cit["result"]["title"] == 'SOMEF: Software metadata extraction framework'), + (cit for cit in citations if str(cit["result"].get("is_preferred_citation")) == "False"), None ) preferred_entry = next( - (ref for ref in referencePublications if ref["result"]["title"] == "A Framework for Creating Knowledge Graphs of Scientific Software Metadata"), + (cit for cit in citations if str(cit["result"].get("is_preferred_citation")) == "True"), None ) diff --git a/src/somef/test/test_codemeta_parser.py b/src/somef/test/test_codemeta_parser.py index de36c783..ce97ac5c 100644 --- a/src/somef/test/test_codemeta_parser.py +++ b/src/somef/test/test_codemeta_parser.py @@ -46,6 +46,8 @@ def test_parse_multiple_codemeta_files(self): for cat_name, expected_val in expected.items(): cat_const = getattr(constants, cat_name) actual_list = metadata_result.results.get(cat_const, []) + # print(f"Actual list for {cat_name}: {actual_list}") + self.assertTrue( actual_list, f"[{repo_folder}] No results for {cat_name}" @@ -93,5 +95,28 @@ def test_parse_contributors(self): )) + def test_parse_reference_publications_authors_issue_957(self): + """ + Test to ensure that authors in the citation category correctly use the 'given_name' and + 'family_name' properties instead of the old camelCase convention. + """ + codemeta_path = REPOS_DIR / "widoco" / "codemeta.json" + result = Result() + + metadata_result = parse_codemeta_json_file(codemeta_path, result, "https://example.org/codemeta.json") + + self.assertIn(constants.CAT_CITATION, metadata_result.results) + citations = result.results[constants.CAT_CITATION] + found = False + + for cit in citations: + authors = cit["result"].get("authors", []) + if any(a.get("name") == "Daniel Garijo" and a.get("family_name") == "Garijo" and a.get("given_name") == "Daniel" for a in authors): + found = True + break + + self.assertTrue(found, "Author 'Daniel Garijo' with 'given_name' not found in citation authors") + + if __name__ == "__main__": unittest.main() diff --git a/src/somef/test/test_process_repository.py b/src/somef/test/test_process_repository.py index 4ee8fafa..2efa9581 100644 --- a/src/somef/test/test_process_repository.py +++ b/src/somef/test/test_process_repository.py @@ -190,10 +190,8 @@ def test_issue_526(self): github_data = Result() text, github_data = process_files.process_repository_files(test_data_repositories + "Widoco", github_data, constants.RepositoryType.LOCAL) - # after solving issue refernce_publication it must be 2 citations in results citation. - # assert len(github_data.results[constants.CAT_CITATION]) == 1 - assert len(github_data.results[constants.CAT_CITATION]) == 2 - assert len(github_data.results[constants.CAT_REFERENCE_PUBLICATION]) == 1 + # after solving issue refernce_publication it must be 3 citation. 1 should the preferred one from the cff file, + assert len(github_data.results[constants.CAT_CITATION]) == 3 def test_issue_530(self): """ diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index a048aee2..20429a5b 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -158,7 +158,7 @@ """ CAT_PROGRAMMING_LANGUAGES = "programming_languages" CAT_README_URL = "readme_url" -CAT_REFERENCE_PUBLICATION = "reference_publication" +# CAT_REFERENCE_PUBLICATION = "reference_publication" CAT_RELATED_DOCUMENTATION = "related_documentation" CAT_RELATED_PAPERS = "related_papers" CAT_RELEASES = "releases" @@ -241,6 +241,7 @@ PROP_ORIGINAL_HEADER = "original_header" PROP_PAGES = "pages" PROP_PARENT_HEADER = "parent_header" +PROP_PREFERRED_CITATION = "is_preferred_citation" PROP_RELEASE_ID = "release_id" PROP_ROLE = "role" PROP_SIZE = "size" From d23cae9788d82d3e218c9180bd74975e7095305a Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Fri, 10 Apr 2026 16:12:32 +0200 Subject: [PATCH 04/12] upperCase typo error writting "Widoco" in a test --- src/somef/test/test_codemeta_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/somef/test/test_codemeta_parser.py b/src/somef/test/test_codemeta_parser.py index ce97ac5c..c9c9f769 100644 --- a/src/somef/test/test_codemeta_parser.py +++ b/src/somef/test/test_codemeta_parser.py @@ -100,7 +100,7 @@ def test_parse_reference_publications_authors_issue_957(self): Test to ensure that authors in the citation category correctly use the 'given_name' and 'family_name' properties instead of the old camelCase convention. """ - codemeta_path = REPOS_DIR / "widoco" / "codemeta.json" + codemeta_path = REPOS_DIR / "Widoco" / "codemeta.json" result = Result() metadata_result = parse_codemeta_json_file(codemeta_path, result, "https://example.org/codemeta.json") From 78055b19647fae77e42247ffa0b6475d5267679f Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Mon, 13 Apr 2026 16:04:43 +0200 Subject: [PATCH 05/12] Standardize license fields and consolidate results. Fixes #955 --- src/somef/export/json_export.py | 27 ++++++++++++----- src/somef/parser/codemeta_parser.py | 28 ++++++++++++++--- src/somef/process_files.py | 4 ++- src/somef/process_repository.py | 16 +++++++--- src/somef/regular_expressions.py | 15 +++++++-- src/somef/test/test_JSON_export.py | 47 +++++++++++++++++++++++++++-- src/somef/utils/constants.py | 3 +- 7 files changed, 116 insertions(+), 24 deletions(-) diff --git a/src/somef/export/json_export.py b/src/somef/export/json_export.py index 22444425..12206823 100644 --- a/src/somef/export/json_export.py +++ b/src/somef/export/json_export.py @@ -859,16 +859,29 @@ def unify_results(repo_data: dict) -> dict: value = result.get(constants.PROP_VALUE) value_type = result.get(constants.PROP_TYPE) - canonical = canonicalize_value(value, value_type) + # --- SPECIAL LOGIC FOR LICENSES --- + if category == constants.CAT_LICENSE and result.get(constants.PROP_SPDX_ID): + # If we have SPDX, that is our unification key + key = f"LICENSE-{result[constants.PROP_SPDX_ID]}" + else: + # Normal behavior for the rest of the categories + canonical = canonicalize_value(value, value_type) + key = str(canonical) + # -------------------------------------------------- + # canonical = canonicalize_value(value, value_type) - key = str(canonical) + # key = str(canonical) if key in seen: existing = seen[key] - - # If types match, merge normally - existing[constants.PROP_RESULT][constants.PROP_VALUE] = choose_more_general( - existing[constants.PROP_RESULT][constants.PROP_VALUE], value - ) + if category == constants.CAT_LICENSE: + # prefer SPDX ID if available for licenses + if result.get(constants.PROP_SPDX_ID): + existing[constants.PROP_RESULT][constants.PROP_VALUE] = result[constants.PROP_SPDX_ID] + else: + # If types match, merge normally + existing[constants.PROP_RESULT][constants.PROP_VALUE] = choose_more_general( + existing[constants.PROP_RESULT][constants.PROP_VALUE], value + ) # merge other result fields because different techniques might have extracted different information # (e.g., email in authors extracted by file exploration or code parser. diff --git a/src/somef/parser/codemeta_parser.py b/src/somef/parser/codemeta_parser.py index 889de956..11d33189 100644 --- a/src/somef/parser/codemeta_parser.py +++ b/src/somef/parser/codemeta_parser.py @@ -63,9 +63,21 @@ def parse_license(license_data): spdx_id = identifier.split("spdx.org/licenses/")[-1].split("/")[0] license_info["spdx_id"] = spdx_id elif isinstance(license_data, str): - license_info["name"] = license_data - license_info["identifier"] = f"https://spdx.org/licenses/{license_data}" - license_info["spdx_id"] = license_data + # license_info["name"] = license_data + # license_info["identifier"] = f"https://spdx.org/licenses/{license_data}" + # license_info["spdx_id"] = license_data + license_str = license_data.strip() + + if "spdx.org/licenses/" in license_str: + # Already a full SPDX URL + license_info["identifier"] = license_str + license_info["name"] = license_str.split("/")[-1] + license_info["spdx_id"] = license_info["name"] + else: + # we assume it's an spdx id like "MIT" + license_info["name"] = license_str + license_info["identifier"] = f"https://spdx.org/licenses/{license_str}" + license_info["spdx_id"] = license_str else: return None return license_info @@ -680,10 +692,16 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source): ) if "license" in data: - license_info = parse_license(data["license"]) + license_raw = data["license"] + license_info = parse_license(license_raw) if license_info: + if isinstance(license_raw, str): + val_lic = license_raw + else: + val_lic = license_info.get("name", "") + result_dict = { - "value": license_info.get("name", ""), + "value": val_lic, "type": constants.LICENSE } diff --git a/src/somef/process_files.py b/src/somef/process_files.py index a493e123..e467f016 100644 --- a/src/somef/process_files.py +++ b/src/somef/process_files.py @@ -506,7 +506,9 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul if license_info: result[constants.PROP_NAME] = license_info['name'] result[constants.PROP_SPDX_ID] = license_info['spdx_id'] - + if '@id' in license_info: + result[constants.PROP_URL] = license_info['@id'] + result[constants.PROP_IDENTIFIER] = license_info['@id'] # Extraction copyright holder from license text matches_copyright = re.findall(constants.REGEXP_COPYRIGHT, license_text, flags=re.IGNORECASE) diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index 79932aae..7b4235e8 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -330,10 +330,11 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url): license_result[constants.PROP_NAME] = general_resp["license"]["name"] if "url" in general_resp['license']: license_result[constants.PROP_VALUE] = general_resp["license"]["url"] - - # for k in ('name', 'url'): - # if k in general_resp['license']: - # license_info[k] = general_resp['license'][k] + temp_info_lic = detect_license_spdx(general_resp["license"]["name"], 'JSON') + if temp_info_lic: + license_result[constants.PROP_SPDX_ID] = temp_info_lic['spdx_id'] + license_result[constants.PROP_URL] = temp_info_lic['url'] + license_result[constants.PROP_IDENTIFIER] = temp_info_lic['identifier'] # If we didn't find it, look for the license if constants.PROP_VALUE not in license_result or license_result[constants.PROP_VALUE] is None: @@ -347,6 +348,7 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url): if license_info: license_result[constants.PROP_NAME] = license_info['name'] license_result[constants.PROP_SPDX_ID] = license_info['spdx_id'] + license_result[constants.PROP_IDENTIFIER] = license_info['identifier'] if constants.PROP_VALUE in license_result: repo_metadata.add_result(constants.CAT_LICENSE, license_result, 1, constants.TECHNIQUE_GITLAB_API) @@ -646,7 +648,11 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url, constants.PROP_URL: value["url"] } if "spdx_id" in value.keys(): - result[constants.PROP_SPDX_ID] = value["spdx_id"] + spdx_id = value["spdx_id"] + spdx_url = f"https://spdx.org/licenses/{spdx_id}" + result[constants.PROP_SPDX_ID] = spdx_id + result[constants.PROP_URL] = spdx_url + result[constants.PROP_IDENTIFIER] = spdx_url elif category == constants.CAT_OWNER: result = { constants.PROP_VALUE: value, diff --git a/src/somef/regular_expressions.py b/src/somef/regular_expressions.py index 286705ff..f11a34ff 100644 --- a/src/somef/regular_expressions.py +++ b/src/somef/regular_expressions.py @@ -1013,24 +1013,33 @@ def detect_license_spdx(license_text, type): for license_name, license_info in constants.LICENSES_DICT.items(): if re.search(license_info["regex"], license_text, re.IGNORECASE): + spdx_id = license_info['spdx_id'] + spdx_url = f"https://spdx.org/licenses/{spdx_id}" if type == 'JSON': return { "name": license_name, "spdx_id": f"{license_info['spdx_id']}", - "@id": f"https://spdx.org/licenses/{license_info['spdx_id']}" + "@id": spdx_url, + "url": spdx_url, + "identifier": spdx_url } else: return { "name": license_name, - "identifier": f"https://spdx.org/licenses/{license_info['spdx_id']}" + "identifier": spdx_url, + "spdx_id": spdx_id, + "url": spdx_url } for license_name, license_info in constants.LICENSES_DICT.items(): spdx_id = license_info["spdx_id"] if re.search(rf'\b{re.escape(spdx_id)}\b', license_text, re.IGNORECASE): + spdx_url = f"https://spdx.org/licenses/{spdx_id}" return { "name": license_name, "spdx_id": spdx_id, - "@id": f"https://spdx.org/licenses/{spdx_id}" + "@id": spdx_url, + "identifier": spdx_url, + "url": spdx_url } return None diff --git a/src/somef/test/test_JSON_export.py b/src/somef/test/test_JSON_export.py index d140bee1..b379a1cf 100644 --- a/src/somef/test/test_JSON_export.py +++ b/src/somef/test/test_JSON_export.py @@ -767,9 +767,52 @@ def test_issue_886_apache(self): data = text_file.read() text_file.close() json_content = json.loads(data) - copyright_entries = json_content[constants.CAT_COPYRIGHT] copy = copyright_entries[0]["result"] assert copy["value"] == "Daniel Garijo, Information Sciences Institute, USC." assert copy["year"] == "2016" - os.remove(test_data_path + "test_issue_886_apache.json") \ No newline at end of file + os.remove(test_data_path + "test_issue_886_apache.json") + + + def test_issue_955_license_consolidation(self): + """Checks whether licenses are correctly consolidated and enriched with SPDX metadata""" + output_path = test_data_path + "test_issue_955_license_consolidation.json" + + somef_cli.run_cli(threshold=0.8, + ignore_classifiers=False, + repo_url=None, + local_repo=test_data_repositories + "Widoco", + doc_src=None, + in_file=None, + output=output_path, + graph_out=None, + graph_format="turtle", + codemeta_out=None, + pretty=True, + missing=False, + readme_only=False) + + with open(output_path, "r") as text_file: + json_content = json.loads(text_file.read()) + + assert constants.CAT_LICENSE in json_content + license_entries = json_content[constants.CAT_LICENSE] + + assert len(license_entries) == 1 + + license_res = license_entries[0]["result"] + + assert license_res["value"] == "Apache-2.0" + assert license_res["spdx_id"] == "Apache-2.0" + assert license_res["name"] == "Apache License 2.0" + assert license_res["url"] == "https://spdx.org/licenses/Apache-2.0" + assert license_res["identifier"] == "https://spdx.org/licenses/Apache-2.0" + + assert isinstance(license_entries[0]["technique"], list) + assert "file_exploration" in license_entries[0]["technique"] + assert "code_parser" in license_entries[0]["technique"] + + assert isinstance(license_entries[0]["source"], list) + assert len(license_entries[0]["source"]) >= 2 + + os.remove(output_path) \ No newline at end of file diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index 20429a5b..5566b717 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -64,7 +64,8 @@ # REGEXP_APACHE = r'(?i)apache\s+license\s*,?\s*version\s*2\.0' REGEXP_APACHE = r'(?i)apache(?:\s+license)?\s*(?:,?\s*version\s*)?2\.0' REGEXP_GPL3 = r'(?i)gnu\s+general\s+public\s+license\s*,?\s*version\s*3\.0' -REGEXP_MIT = r'(?i)mit\s+license' +# REGEXP_MIT = r'(?i)mit\s+license' +REGEXP_MIT = r'(?i)(mit\s+license|permission\s+is\s+hereby\s+granted|THE\s+SOFTWARE\s+IS\s+PROVIDED\s+"AS\s+IS")' REGEXP_BSD2 = r'(?i)(bsd\s*-?\s*2-?clause(?:\s*license)?|redistribution\s+and\s+use\s+in\s+source\s+and\s+binary\s+forms)' REGEXP_BSD3 = r'(?i)bsd\s+3-clause\s+license' REGEXP_BOOST = r'(?i)boost\s+software\s+license\s*,?\s*version\s*1\.0' From 3d84e8f21d10aec6938e0e4b22619b5e87824d94 Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Tue, 14 Apr 2026 09:36:07 +0200 Subject: [PATCH 06/12] fix documentation consistency and order tables. Fixes #954 --- docs/output.md | 70 ++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/docs/output.md b/docs/output.md index 844634eb..e6f70fb5 100644 --- a/docs/output.md +++ b/docs/output.md @@ -68,7 +68,7 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `acknowledgement`: Any text that the authors have prepared to acknnowledge the contribution from others, or project funding. - `application_domain`: The application domain of the repository. This may be related to the research area of a software component (e.g., Astrophysics) or the general domain/functionality of the tool (i.e., machine learning projects). See all current recognized application domains [here](https://somef.readthedocs.io/en/latest/#myfootnote1). - `authors`: Person or organization responsible of the project. This property is also used to indicate the responsible entities of a publication associated with the code repository. -- `citation`: Software citation (usually in `.bib` form) as the authors have stated in their readme file, or through a `CFF` file. +- `citation`: Software citation (usually in .bib or .cff format). SOMEF extracts and structures the metadata from these files (including authors, titles, and DOIs) instead of just returning a raw string. - `code_of_conduct`: Link to the code of conduct file of the project - `code_repository`: Link to the source code (typically the repository where the readme can be found) - `contact`: Contact person responsible for maintaining a software component. @@ -88,7 +88,9 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `forks_url`: Links to forks made of the project (GitHub only) - `full_name`: Name + owner (owner/name) (if available) - `full_title`: If the repository has a short name, we will attempt to extract the longer version of the repository name. For example, a repository may be called "Widoco", but the longer title is "Wizard for documenting ontologies". +- `funding`: Funding code for the related project. - `has_build_file`: Build file to create a Docker image for the target software +- `has_package_file`: Specifies what package file is present in the code repository. - `has_script_file`: Snippets of code contained in the repository. - `homepage`: URL of the item. - `identifier`: Identifiers detected within a repository (e.g., Digital Object Identifier). @@ -105,8 +107,10 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `owner`: Name of the user or organization in charge of the repository - `package_distribution`: Link to official package repositories where the software can be downloaded from (e.g., `pypi`). - `package_file`: Link to a package file used in the repository (e.g., `pyproject.toml`, `setup.py`). +- `package_id`: Identifier extracted from packages. (e.g., `packages.json`) - `programming_languages`: Languages used in the repository. - `readme_url`: URL to the main README file in the repository. +- `reference_publication`: URL to the paper associated with the code repository. - `related_papers`: URL to possible related papers within the repository stated within the readme file. - `releases`: Pointer to the available versions of a software component. - `repository_status`: Repository status as it is described in [repostatus.org](https://www.repostatus.org/). @@ -118,11 +122,7 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `type`: Software type: Commandline Application, Notebook Application, Ontology, Scientific Workflow. Non-Software types: Static Website, Uncategorized - `usage`: Usage examples and considerations of a code repository. - `workflows`: URL and path to the computational workflow files present in the repository. -- `homepage`: URL to the homepage of the software or organization. -- `reference_publication`: URL to the paper associated with the code repository. -- `package_id`: Identifier extracted from packages. (e.g., `packages.json`) -- `funding`: Funding code for the related project. -- `has_package_file`: Specifies what package file is present in the code repository. + The following table summarized the properties used to describe a `category`: @@ -211,22 +211,24 @@ The table below summarizes all types and their corresponding properties: | **zipball_url** | Release | Url | URL to the zip file where to download a software release | --> -The tables below summarizes all types and their corresponding properties- +The tables below summarizes all types and their corresponding properties. +The following object types are currently supported (aligned with Schema.org and CodeMeta vocabularies) -An AGENT has the following properties: +An Agent has the following properties: | Property | Expected value | Definition | |---|---|---| +| **affiliation** | String | name of organization or affiliation | | **email** | String | Email of an author | | **family_name** | String | Last name of an author | | **given_name** | String | First name of an author | +| **identifier** | String | id of an agent | | **name** | String | Name used to designate the person or organization| +| **role** | String | The role of the agent in the development or maintenance of this software component | | **url** | Url | Uniform resource locator of the resource | -| **affiliation** | String | name of organization or affiliation | -| **identifier** | String | id of an agent | -| **role** | String | role of agent | -An ASSET has the following properties: + +An Asset has the following properties: | Property | Expected value | Definition | |---|---|---| @@ -239,17 +241,17 @@ An ASSET has the following properties: | **url** | Url | Uniform resource locator of the resource | - -A LICENSE has the following properties: +A License has the following properties: | Property | Expected value | Definition | |---|---|---| +| **identifier** | String | id of licence | | **name** | String | Title or name of the license | | **spdx_id** | String | Spdx id corresponding to this license | | **url** | Url | Uniform resource locator of the license | -| **identifier** | String | id of licence | -A PROGRAMMING_LANGUAGE has the following properties: + +A Programming_language has the following properties: | Property | Expected value | Definition | |---|---|---| @@ -257,7 +259,7 @@ A PROGRAMMING_LANGUAGE has the following properties: | **size** | Integer | File size content (bytes) of a code repository using a given programming language | -A PUBLICATION has the following properties: +A Publication has the following properties: | Property | Expected value | Definition | |---|---|---| @@ -267,7 +269,7 @@ A PUBLICATION has the following properties: | **url** | Url | Uniform resource locator of the resource | -A RELEASE has the following properties: +A Release has the following properties: | Property | Expected value | Definition | |---|---|---| @@ -285,49 +287,51 @@ A RELEASE has the following properties: | **zipball_url** | Url | URL to the zip file where to download a software release | - A REQUIREMENT has the following properties: + A Requirement has the following properties: | Property | Expected value | Definition | |---|---|---| -| **name** | String | Name of the requeriment | -| **version** | String | named version of a requeriment | | **dependency_type** | String | type: dev, runtime... Indicates whether the dependency is required at runtime or only for development/testing | | **dependency_resolver** | String | Identifies the ecosystem or package manager that resolves the dependency (e.g., npm, bower, pip, python, poetry, pdm, cargo, julia, maven, publicode).| +| **name** | String | Name of the requeriment | +| **version** | String | named version of a requeriment | + -A RUNTIME_PLATFORM has the following properties: +A Runtime_platform has the following properties: | Property | Expected value | Definition | |---|---|---| | **name** | String | Name of the runtime platform (e.g., Java) | - **version** | String | version of the runtime platform | | **value** | String | name and version of the runtime platform | +| **version** | String | version of the runtime platform | -A SCHOLARLY_ARTICLE has the following properties: +A Scholarly_article has the following properties: | Property | Expected value | Definition | |---|---|---| -| **title** | String | Title of reference or citation | -| **authors** | List | List of authors with structured information (name, given_name, family_name) | +| **authors** | List of Agent| List of authors responsible for the publication, providing structured metadata for each | +| **date_published** | String | Date when the article or citation was officially published. | +| **doi** | String | Digital Object Identifier (DOI) of the reference, usually returned as a full URL.| | **journal** | String | Journal where the publication appeared | -| **year** | Number | Year of publication | | **pages** | String | Page range of the publication | -| **value** | String | Title of reference or citation | +| **title** | String | Title of reference or citation | | **url** | String | Link to reference or citation | -| **date_published** | String | date of publication reference or citation | -| **doi** | String | Identifier of reference| +| **value** | String | Title of reference or citation | +| **year** | Number | Year of publication | -A SOFTWARE_APPLICATION has the following properties: +A Software_application has the following properties: | Property | Expected value | Definition | |---|---|---| +| **development_type** | String | runtime or dev | | **name** | String | Name of the software | | **value** | String | Name and version of the software | | **version** | String | version of software | -| **development_type** | String | runtime or dev | -A TEXT_EXCERPT has the following properties: + +A Text_excerpt has the following properties: | Property | Expected value | Definition | |---|---|---| From 4ddc0725a52deb5a38f5d988708841184f7eba04 Mon Sep 17 00:00:00 2001 From: Daniel Garijo Date: Tue, 14 Apr 2026 11:01:50 +0200 Subject: [PATCH 07/12] Apply suggestion from @dgarijo --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index a01fa373..301149da 100644 --- a/docs/index.md +++ b/docs/index.md @@ -31,7 +31,7 @@ Given a readme file (or a GitHub repository) SOMEF will extract the following ca - URL: website or ORCID associated with the author - Affiliation: name of organization or affiliation - **Build file**: Build file(s) of the project. For example, files used to create a Docker image for the target software, package files, etc. -- **Citation**: Preferred citation as the authors have stated in their readme file. SOMEF recognizes Bibtex, Citation File Format files and other means by which authors cite their papers (e.g., by in-text citation). +- **Citation**: Preferred citation(s) as the authors have stated in their readme file. SOMEF recognizes Bibtex, Citation File Format files and other means by which authors cite their papers (e.g., by in-text citation). For CITATION.cff files, SOMEF now generates two separate entries: one for the software (is_preferred_citation: False) and another for the preferred citation (is_preferred_citation: True). This ensures metadata like DOI or version is correctly assigned to each entity. We aim to recognize the following properties: - Title: Title of the publication From c9e0564c059cdbf41ba08d1ee558cbee3630b1c3 Mon Sep 17 00:00:00 2001 From: juanjemdIos <116972173+juanjemdIos@users.noreply.github.com> Date: Tue, 14 Apr 2026 11:18:14 +0200 Subject: [PATCH 08/12] Update docs/index.md Co-authored-by: Daniel Garijo --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 301149da..b2c10f63 100644 --- a/docs/index.md +++ b/docs/index.md @@ -33,7 +33,7 @@ Given a readme file (or a GitHub repository) SOMEF will extract the following ca - **Build file**: Build file(s) of the project. For example, files used to create a Docker image for the target software, package files, etc. - **Citation**: Preferred citation(s) as the authors have stated in their readme file. SOMEF recognizes Bibtex, Citation File Format files and other means by which authors cite their papers (e.g., by in-text citation). For CITATION.cff files, SOMEF now generates two separate entries: one for the software (is_preferred_citation: False) and another for the preferred citation (is_preferred_citation: True). This ensures metadata like DOI or version is correctly assigned to each entity. -We aim to recognize the following properties: +We recognize the following properties: - Title: Title of the publication - Author: list of author names in the publication - URL: URL of the publication From 5686a9eee863f9106dc3f12f509d5e900a2d1397 Mon Sep 17 00:00:00 2001 From: juanjemdIos <116972173+juanjemdIos@users.noreply.github.com> Date: Tue, 14 Apr 2026 11:18:31 +0200 Subject: [PATCH 09/12] Update docs/index.md Co-authored-by: Daniel Garijo --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index b2c10f63..e7590148 100644 --- a/docs/index.md +++ b/docs/index.md @@ -39,7 +39,7 @@ We recognize the following properties: - URL: URL of the publication - DOI: Digital object identifier of the publication - Date published - - Version: Software version (if applicable) + - Version: Software version (if applicable, i.e., the main citation is a software deposit) - Journal: Journal name where the paper was published - Year: Year of publication - Pages: Page range in the journal From cce43bfa08a8d9888238bed85aa264de4da900df Mon Sep 17 00:00:00 2001 From: juanjemdIos <116972173+juanjemdIos@users.noreply.github.com> Date: Tue, 14 Apr 2026 11:25:15 +0200 Subject: [PATCH 10/12] Update docs/index.md Co-authored-by: Daniel Garijo --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index e7590148..40a49a7c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -68,7 +68,7 @@ We recognize the following properties: - **Invocation**: Execution command(s) needed to run a scientific software component - **Issue tracker**: Link where to open issues for the target repository - **Keywords**: set of terms used to commonly identify a software component -- **License**: License and usage terms of a software component. Now we also extract license from citation.cff. +- **License**: License and usage terms of a software component. - **Logo**: Main logo used to represent the target software component - **Name**: Name identifying a software component - **Ontologies**: URL and path to the ontology files present in the repository From c5a74c50c3b3adc9013d708b245549f91551e4d3 Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Wed, 15 Apr 2026 08:32:13 +0200 Subject: [PATCH 11/12] resolve category inconsistencies and standardize properties and types naming. Fixes #951 --- README.md | 6 +- docs/condaenvironment.md | 4 +- docs/gemspec.md | 2 +- docs/index.md | 6 +- docs/output.md | 81 ++----- docs/packagejson.md | 2 +- docs/pom.md | 2 +- docs/publiccode.md | 4 +- docs/supported_metadata_files.md | 6 +- poetry.lock | 198 +++++++++--------- pyproject.toml | 2 +- src/somef/extract_software_type.py | 10 +- src/somef/parser/bower_parser.py | 4 +- src/somef/parser/cabal_parser.py | 2 +- src/somef/parser/codemeta_parser.py | 2 +- src/somef/parser/composer_parser.py | 2 +- src/somef/parser/conda_environment_parser.py | 4 +- src/somef/parser/description_parser.py | 2 +- src/somef/parser/gemspec_parser.py | 6 +- src/somef/parser/package_json_parser.py | 2 +- src/somef/parser/pom_xml_parser.py | 2 +- src/somef/parser/publiccode_parser.py | 2 +- src/somef/parser/python_parser.py | 2 +- src/somef/parser/toml_parser.py | 17 +- src/somef/process_files.py | 1 - src/somef/test/test_JSON_export.py | 5 +- src/somef/test/test_cli.py | 4 +- src/somef/test/test_codemeta_export.py | 2 - src/somef/test/test_data/expected/Widoco.yaml | 2 +- .../test/test_data/expected/gammapy.yaml | 2 +- src/somef/test/test_toml_parser.py | 2 +- src/somef/utils/constants.py | 11 +- 32 files changed, 175 insertions(+), 224 deletions(-) diff --git a/README.md b/README.md index 9625b85c..cbe23ebc 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,10 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo - Family name: Last name of an author - Email: email of author - URL: website or ORCID associated with the author +- **Application type**: type of software (command line application, notebook, ontology, scientific workflow, etc.) - **Build file**: Build file(s) of the project. For example, files used to create a Docker image for the target software, package files, etc. - **Citation**: Preferred citation as the authors have stated in their readme file. SOMEF recognizes Bibtex, Citation File Format files and other means by which authors cite their papers (e.g., by in-text citation). -For CITATION.cff files, SOMEF now generates two separate entries: one for the software (is_preferred_citation: False) and another for the preferred citation (is_preferred_citation: True). This ensures metadata like DOI or version is correctly assigned to each entity. +For CITATION.cff files, SOMEF now generates two separate entries: one for the software and another for the preferred citation (is_preferred_citation: True). This ensures metadata like DOI or version is correctly assigned to each entity. We aim to recognize the following properties: - Title: Title of the publication - Author: list of author names in the publication @@ -83,12 +84,11 @@ We aim to recognize the following properties: - **Repository status**: Repository status as it is described in [repostatus.org](https://www.repostatus.org/). - **Requirements**: Pre-requisites and dependencies needed to execute a software component - **Run**: Running instructions of a software component. It may be wider than the `invocation` category, as it may include several steps and explanations. -- **Runtime platform**: specifies runtime platform or script interpreter dependencies required to run the project.. +- **Runtime platform**: specifies the runtime environment or script interpreter dependencies (e.g., Python, Java). - **Script files**: Bash script files contained in the repository - **Stargazers count**: Total number of stargazers of the project - **Support**: Guidelines and links of where to obtain support for a software component - **Support channels**: Help channels one can use to get support about the target software component -- **Type**: type of software (command line application, notebook, ontology, scientific workflow, etc.) - **Usage examples**: Assumptions and considerations recorded by the authors when executing a software component, or examples on how to use it - **Workflows**: URL and path to the computational workflow files present in the repository diff --git a/docs/condaenvironment.md b/docs/condaenvironment.md index c5f638e2..e08f96b1 100644 --- a/docs/condaenvironment.md +++ b/docs/condaenvironment.md @@ -35,7 +35,7 @@ dependencies: "value": "python=3.8.5", "name": "python", "version": "3.8.5", - "type": "Software_application", + "type": "SoftwareDependency", "dependency_type": "runtime", "dependency_resolver": "conda" }, @@ -43,7 +43,7 @@ dependencies: "value": "albumentations==0.4.3", "name": "albumentations", "version": "0.4.3", - "type": "Software_application", + "type": "SoftwareDependency", "dependency_type": "runtime", "dependency_resolver": "pip" }, diff --git a/docs/gemspec.md b/docs/gemspec.md index 041145d1..b2a23023 100644 --- a/docs/gemspec.md +++ b/docs/gemspec.md @@ -79,7 +79,7 @@ spec.requirements = [ Result: add_depency -> type runtime; add_development_dependencyd -> type dev ``` - [{'result': {'value': 'railties: >= 3.0', 'name': 'railties', 'version': '>= 3.0', 'type': 'Software_application', 'dependency_type': 'runtime', 'dependency_resolver': 'bundler'}, 'confidence': 1, 'technique': 'code_parser', 'source': 'https://example.org/bootstrap-datepicker-rails.gemspec'}, {'result': {'value': 'bundler: >= 1.0', 'name': 'bundler', 'version': '>= 1.0', 'type': 'Software_application', 'dependency_type': 'dev','dependency_resolver': 'bundler'}, 'confidence': 1, 'technique': 'code_parser', 'source': 'https://example.org/bootstrap-datepicker-rails.gemspec'}] + [{'result': {'value': 'railties: >= 3.0', 'name': 'railties', 'version': '>= 3.0', 'type': 'SoftwareDependency', 'dependency_type': 'runtime', 'dependency_resolver': 'bundler'}, 'confidence': 1, 'technique': 'code_parser', 'source': 'https://example.org/bootstrap-datepicker-rails.gemspec'}, {'result': {'value': 'bundler: >= 1.0', 'name': 'bundler', 'version': '>= 1.0', 'type': 'SoftwareDependency', 'dependency_type': 'dev','dependency_resolver': 'bundler'}, 'confidence': 1, 'technique': 'code_parser', 'source': 'https://example.org/bootstrap-datepicker-rails.gemspec'}] ``` diff --git a/docs/index.md b/docs/index.md index a01fa373..17966e0b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -15,6 +15,7 @@ Given a readme file (or a GitHub repository) SOMEF will extract the following ca - **Acknowledgement**: Text acknowledging funding sources or contributors - **Application domain**: The application domain of the repository. This may be related to the research area of a software component (e.g., Astrophysics) or the general domain/functionality of the tool (i.e., machine learning projects)[1](#myfootnote1) +- **Application type**: type of software (command line application, notebook, ontology, scientific workflow, etc.) - **Assets**: files attached to the release - url: URL of the publication of the file - name: name of the file @@ -32,7 +33,7 @@ Given a readme file (or a GitHub repository) SOMEF will extract the following ca - Affiliation: name of organization or affiliation - **Build file**: Build file(s) of the project. For example, files used to create a Docker image for the target software, package files, etc. - **Citation**: Preferred citation as the authors have stated in their readme file. SOMEF recognizes Bibtex, Citation File Format files and other means by which authors cite their papers (e.g., by in-text citation). -For CITATION.cff files, SOMEF now generates two separate entries: one for the software (is_preferred_citation: False) and another for the preferred citation (is_preferred_citation: True). This ensures metadata like DOI or version is correctly assigned to each entity. +For CITATION.cff files, SOMEF now generates two separate entries: one for the software and another for the preferred citation (is_preferred_citation: True). This ensures metadata like DOI or version is correctly assigned to each entity. We aim to recognize the following properties: - Title: Title of the publication - Author: list of author names in the publication @@ -91,12 +92,11 @@ We aim to recognize the following properties: - **Repository status**: Repository status as it is described in [repostatus.org](https://www.repostatus.org/). - **Requirements**: Pre-requisites and dependencies needed to execute a software component - **Run**: Running instructions of a software component. It may be wider than the `invocation` category, as it may include several steps and explanations. -- **Runtime platform**: specifies runtime platform or script interpreter dependencies required to run the project. +- **Runtime platform**: specifies the runtime environment or script interpreter dependencies (e.g., Python, Java). - **Script files**: Bash script files contained in the repository - **Stargazers count**: Total number of stargazers of the project - **Support**: Guidelines and links of where to obtain support for a software component - **Support channels**: Help channels one can use to get support about the target software component -- **Type**: type of software (command line application, notebook, ontology, scientific workflow, etc.) - **Usage examples**: Assumptions and considerations recorded by the authors when executing a software component, or examples on how to use it - **Workflows**: URL and path to the computational workflow files present in the repository diff --git a/docs/output.md b/docs/output.md index e6f70fb5..42d055b4 100644 --- a/docs/output.md +++ b/docs/output.md @@ -67,6 +67,7 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `acknowledgement`: Any text that the authors have prepared to acknnowledge the contribution from others, or project funding. - `application_domain`: The application domain of the repository. This may be related to the research area of a software component (e.g., Astrophysics) or the general domain/functionality of the tool (i.e., machine learning projects). See all current recognized application domains [here](https://somef.readthedocs.io/en/latest/#myfootnote1). +- `application_type`: Software type: Commandline Application, Notebook Application, Ontology, Scientific Workflow. Non-Software types: Static Website, Uncategorized - `authors`: Person or organization responsible of the project. This property is also used to indicate the responsible entities of a publication associated with the code repository. - `citation`: Software citation (usually in .bib or .cff format). SOMEF extracts and structures the metadata from these files (including authors, titles, and DOIs) instead of just returning a raw string. - `code_of_conduct`: Link to the code of conduct file of the project @@ -116,10 +117,10 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `repository_status`: Repository status as it is described in [repostatus.org](https://www.repostatus.org/). - `requirements`: Pre-requisites and dependencies needed to execute a software component. - `run`: Running instructions of a software component. It may be wider than the `invocation` category, as it may include several steps and explanations. +- `runtime_platform`: Specifies the runtime environment or script interpreter dependencies required to run the project (e.g., Python, Java, Julia). - `stargazers_count`: Total number of stargazers of the project. - `support`: Guidelines and links of where to obtain support for a software component. - `support_channels`: Help channels one can use to get support about the target software component. -- `type`: Software type: Commandline Application, Notebook Application, Ontology, Scientific Workflow. Non-Software types: Static Website, Uncategorized - `usage`: Usage examples and considerations of a code repository. - `workflows`: URL and path to the computational workflow files present in the repository. @@ -171,9 +172,10 @@ The following object `types` are currently supported: - `Programming_language`: Programming language used in the repository. - `License`: object representing all the metadata SOMEF extracts from a license. - `Agent`: user (typically, a person) or organization responsible for authoring a software release or a paper. -- `Publication`: Scientific paper associated with the code repository. -- `SoftwareApplication`: Class to represent software dependencies between projects. -- `Runtime_platform`: specifies runtime platform or script interpreter dependencies required to run the project.. +- `ScholarlyArticle`: Scientific paper or article associated with the code repository. +- `SoftwareApplication`: Class to represent the main software component metadata. +- `SoftwareDependency`: Class to represent software dependencies and runtime platforms required to run the project. + The following literal types are currently supported: - `Number`: A numerical value. We do not distinguish between integer, long or float. @@ -184,33 +186,6 @@ The following literal types are currently supported: - `Url`: uniform resource locator of a file. - - - The tables below summarizes all types and their corresponding properties. The following object types are currently supported (aligned with Schema.org and CodeMeta vocabularies) @@ -259,16 +234,6 @@ A Programming_language has the following properties: | **size** | Integer | File size content (bytes) of a code repository using a given programming language | -A Publication has the following properties: - -| Property | Expected value | Definition | -|---|---|---| -| **author** | Agent, Organization | Person or organization responsible for creating an article or a software release. | -| **doi** | Url | When a publication is detected, but the format is in bibtek or CFF, SOMEF will add a `doi` field with the detected DOI value. The result includes a full URL. | -| **title** | String | Title of the publication | -| **url** | Url | Uniform resource locator of the resource | - - A Release has the following properties: | Property | Expected value | Definition | @@ -287,26 +252,7 @@ A Release has the following properties: | **zipball_url** | Url | URL to the zip file where to download a software release | - A Requirement has the following properties: - -| Property | Expected value | Definition | -|---|---|---| -| **dependency_type** | String | type: dev, runtime... Indicates whether the dependency is required at runtime or only for development/testing | -| **dependency_resolver** | String | Identifies the ecosystem or package manager that resolves the dependency (e.g., npm, bower, pip, python, poetry, pdm, cargo, julia, maven, publicode).| -| **name** | String | Name of the requeriment | -| **version** | String | named version of a requeriment | - - -A Runtime_platform has the following properties: - -| Property | Expected value | Definition | -|---|---|---| -| **name** | String | Name of the runtime platform (e.g., Java) | -| **value** | String | name and version of the runtime platform | -| **version** | String | version of the runtime platform | - - -A Scholarly_article has the following properties: +A ScholarlyArticle has the following properties: | Property | Expected value | Definition | |---|---|---| @@ -321,14 +267,17 @@ A Scholarly_article has the following properties: | **year** | Number | Year of publication | -A Software_application has the following properties: +A SoftwareApplication or SoftwareDependency has the following properties: | Property | Expected value | Definition | |---|---|---| -| **development_type** | String | runtime or dev | -| **name** | String | Name of the software | -| **value** | String | Name and version of the software | -| **version** | String | version of software | +| **dependency_type** | String | Indicates whether the dependency is required at runtime or only for development/testing (e.g., `dev`, `runtime`, `os`). | +| **dependency_resolver** | String | Identifies the ecosystem or package manager that resolves the dependency (e.g., `npm`, `pip`, `julia`, `conda`).| +| **is_preferred_citation** | Boolean | Set to `True` if the authors explicitly state this is the preferred citation. Omitted otherwise. | +| **name** | String | Name of the software, dependency, or runtime platform (e.g., "pandas", "python"). | +| **type** | String | The object type: `SoftwareApplication` (for the main repository) or `SoftwareDependency` (for requirements and platforms). | +| **value** | String | A string representation typically combining name and version. | +| **version** | String | The version or version range of the software/dependency. | A Text_excerpt has the following properties: diff --git a/docs/packagejson.md b/docs/packagejson.md index da5fcc10..42b868f6 100644 --- a/docs/packagejson.md +++ b/docs/packagejson.md @@ -80,7 +80,7 @@ or ... ``` Resutl: -```{'result': {'value': 'foo@1.0.0 - 2.9999.9999', 'name': 'foo', 'version': '1.0.0 - 2.9999.9999', 'type': 'Software_application'}, 'confidence': 1, 'technique': 'code_parser', 'source': 'http://example.com/package_neors.json'}``` +```{'result': {'value': 'foo@1.0.0 - 2.9999.9999', 'name': 'foo', 'version': '1.0.0 - 2.9999.9999', 'type': 'SoftwareDependency'}, 'confidence': 1, 'technique': 'code_parser', 'source': 'http://example.com/package_neors.json'}``` *(5)* - Example: diff --git a/docs/pom.md b/docs/pom.md index b03dcda7..832f8210 100644 --- a/docs/pom.md +++ b/docs/pom.md @@ -80,7 +80,7 @@ package_distribution': [{'result': {'value': 'http://127.0.0.1/websvn/my-project {'value': 'org.apache.maven.maven-model', 'name': 'maven-model', 'version': '3.9.0', - 'type': 'Software_application'}, + 'type': 'SoftwareDependency'}, ``` diff --git a/docs/publiccode.md b/docs/publiccode.md index 3dccbc65..f938a699 100644 --- a/docs/publiccode.md +++ b/docs/publiccode.md @@ -127,7 +127,7 @@ dependsOn: "value": "PostgreSQL>=14.0", "name": "PostgreSQL", "version": ">=14.0", - "type": "Software_application" + "type": "SoftwareDependency" }, ``` @@ -136,7 +136,7 @@ dependsOn: "value": "PostgreSQL>=14.0", "name": "PostgreSQL", "version": ">=14.0", - "type": "Software_application", + "type": "SoftwareDependency", "dependency_type": "runtime", "dependency_resolver": "pucliccode" }, diff --git a/docs/supported_metadata_files.md b/docs/supported_metadata_files.md index 3d74bc73..a035b2ae 100644 --- a/docs/supported_metadata_files.md +++ b/docs/supported_metadata_files.md @@ -43,7 +43,7 @@ SOMEF can extract metadata from a wide range of files commonly found in software | Keywords | keywords | | License | license | | Release | version | -| Software_application | requirements | +| SoftwareDependency | requirements | | String | description | | String | name | | String | package_id | @@ -86,7 +86,7 @@ The following Python code snippet show the logic used by the SOMEF parser to tra "value": f'{dependency.get("groupId", "")}.{dependency.get("artifactId", "")}'.strip("."), "name": dependency.get("artifactId", ""), "version": dependency.get("version", ""), - "type": constants.SOFTWARE_APPLICATION + "type": constants.SOFTWARE_DEPENDENCY }, 1, constants.TECHNIQUE_CODE_CONFIG_PARSER, @@ -105,7 +105,7 @@ After applying the mapping logic, the metadata for the dependency is stored unde "value": "org.apache.maven.maven-model", "name": "maven-model", "version": "3.9.0", - "type": "Software_application" + "type": "SoftwareDependency" }, "confidence": 1, "technique": "code_parser", diff --git a/poetry.lock b/poetry.lock index be43bff3..1f462f70 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1427,103 +1427,103 @@ xml = ["lxml (>=4.9.2)"] [[package]] name = "pillow" -version = "12.1.1" +version = "12.2.0" description = "Python Imaging Library (fork)" optional = false python-versions = ">=3.10" groups = ["main"] files = [ - {file = "pillow-12.1.1-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1f1625b72740fdda5d77b4def688eb8fd6490975d06b909fd19f13f391e077e0"}, - {file = "pillow-12.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:178aa072084bd88ec759052feca8e56cbb14a60b39322b99a049e58090479713"}, - {file = "pillow-12.1.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b66e95d05ba806247aaa1561f080abc7975daf715c30780ff92a20e4ec546e1b"}, - {file = "pillow-12.1.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:89c7e895002bbe49cdc5426150377cbbc04767d7547ed145473f496dfa40408b"}, - {file = "pillow-12.1.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a5cbdcddad0af3da87cb16b60d23648bc3b51967eb07223e9fed77a82b457c4"}, - {file = "pillow-12.1.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9f51079765661884a486727f0729d29054242f74b46186026582b4e4769918e4"}, - {file = "pillow-12.1.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:99c1506ea77c11531d75e3a412832a13a71c7ebc8192ab9e4b2e355555920e3e"}, - {file = "pillow-12.1.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:36341d06738a9f66c8287cf8b876d24b18db9bd8740fa0672c74e259ad408cff"}, - {file = "pillow-12.1.1-cp310-cp310-win32.whl", hash = "sha256:6c52f062424c523d6c4db85518774cc3d50f5539dd6eed32b8f6229b26f24d40"}, - {file = "pillow-12.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:c6008de247150668a705a6338156efb92334113421ceecf7438a12c9a12dab23"}, - {file = "pillow-12.1.1-cp310-cp310-win_arm64.whl", hash = "sha256:1a9b0ee305220b392e1124a764ee4265bd063e54a751a6b62eff69992f457fa9"}, - {file = "pillow-12.1.1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:e879bb6cd5c73848ef3b2b48b8af9ff08c5b71ecda8048b7dd22d8a33f60be32"}, - {file = "pillow-12.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:365b10bb9417dd4498c0e3b128018c4a624dc11c7b97d8cc54effe3b096f4c38"}, - {file = "pillow-12.1.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d4ce8e329c93845720cd2014659ca67eac35f6433fd3050393d85f3ecef0dad5"}, - {file = "pillow-12.1.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc354a04072b765eccf2204f588a7a532c9511e8b9c7f900e1b64e3e33487090"}, - {file = "pillow-12.1.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7e7976bf1910a8116b523b9f9f58bf410f3e8aa330cd9a2bb2953f9266ab49af"}, - {file = "pillow-12.1.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:597bd9c8419bc7c6af5604e55847789b69123bbe25d65cc6ad3012b4f3c98d8b"}, - {file = "pillow-12.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2c1fc0f2ca5f96a3c8407e41cca26a16e46b21060fe6d5b099d2cb01412222f5"}, - {file = "pillow-12.1.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:578510d88c6229d735855e1f278aa305270438d36a05031dfaae5067cc8eb04d"}, - {file = "pillow-12.1.1-cp311-cp311-win32.whl", hash = "sha256:7311c0a0dcadb89b36b7025dfd8326ecfa36964e29913074d47382706e516a7c"}, - {file = "pillow-12.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:fbfa2a7c10cc2623f412753cddf391c7f971c52ca40a3f65dc5039b2939e8563"}, - {file = "pillow-12.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:b81b5e3511211631b3f672a595e3221252c90af017e399056d0faabb9538aa80"}, - {file = "pillow-12.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ab323b787d6e18b3d91a72fc99b1a2c28651e4358749842b8f8dfacd28ef2052"}, - {file = "pillow-12.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:adebb5bee0f0af4909c30db0d890c773d1a92ffe83da908e2e9e720f8edf3984"}, - {file = "pillow-12.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bb66b7cc26f50977108790e2456b7921e773f23db5630261102233eb355a3b79"}, - {file = "pillow-12.1.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:aee2810642b2898bb187ced9b349e95d2a7272930796e022efaf12e99dccd293"}, - {file = "pillow-12.1.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a0b1cd6232e2b618adcc54d9882e4e662a089d5768cd188f7c245b4c8c44a397"}, - {file = "pillow-12.1.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7aac39bcf8d4770d089588a2e1dd111cbaa42df5a94be3114222057d68336bd0"}, - {file = "pillow-12.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ab174cd7d29a62dd139c44bf74b698039328f45cb03b4596c43473a46656b2f3"}, - {file = "pillow-12.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:339ffdcb7cbeaa08221cd401d517d4b1fe7a9ed5d400e4a8039719238620ca35"}, - {file = "pillow-12.1.1-cp312-cp312-win32.whl", hash = "sha256:5d1f9575a12bed9e9eedd9a4972834b08c97a352bd17955ccdebfeca5913fa0a"}, - {file = "pillow-12.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:21329ec8c96c6e979cd0dfd29406c40c1d52521a90544463057d2aaa937d66a6"}, - {file = "pillow-12.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:af9a332e572978f0218686636610555ae3defd1633597be015ed50289a03c523"}, - {file = "pillow-12.1.1-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:d242e8ac078781f1de88bf823d70c1a9b3c7950a44cdf4b7c012e22ccbcd8e4e"}, - {file = "pillow-12.1.1-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:02f84dfad02693676692746df05b89cf25597560db2857363a208e393429f5e9"}, - {file = "pillow-12.1.1-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:e65498daf4b583091ccbb2556c7000abf0f3349fcd57ef7adc9a84a394ed29f6"}, - {file = "pillow-12.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6c6db3b84c87d48d0088943bf33440e0c42370b99b1c2a7989216f7b42eede60"}, - {file = "pillow-12.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8b7e5304e34942bf62e15184219a7b5ad4ff7f3bb5cca4d984f37df1a0e1aee2"}, - {file = "pillow-12.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:18e5bddd742a44b7e6b1e773ab5db102bd7a94c32555ba656e76d319d19c3850"}, - {file = "pillow-12.1.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc44ef1f3de4f45b50ccf9136999d71abb99dca7706bc75d222ed350b9fd2289"}, - {file = "pillow-12.1.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a8eb7ed8d4198bccbd07058416eeec51686b498e784eda166395a23eb99138e"}, - {file = "pillow-12.1.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:47b94983da0c642de92ced1702c5b6c292a84bd3a8e1d1702ff923f183594717"}, - {file = "pillow-12.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:518a48c2aab7ce596d3bf79d0e275661b846e86e4d0e7dec34712c30fe07f02a"}, - {file = "pillow-12.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a550ae29b95c6dc13cf69e2c9dc5747f814c54eeb2e32d683e5e93af56caa029"}, - {file = "pillow-12.1.1-cp313-cp313-win32.whl", hash = "sha256:a003d7422449f6d1e3a34e3dd4110c22148336918ddbfc6a32581cd54b2e0b2b"}, - {file = "pillow-12.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:344cf1e3dab3be4b1fa08e449323d98a2a3f819ad20f4b22e77a0ede31f0faa1"}, - {file = "pillow-12.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:5c0dd1636633e7e6a0afe7bf6a51a14992b7f8e60de5789018ebbdfae55b040a"}, - {file = "pillow-12.1.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0330d233c1a0ead844fc097a7d16c0abff4c12e856c0b325f231820fee1f39da"}, - {file = "pillow-12.1.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5dae5f21afb91322f2ff791895ddd8889e5e947ff59f71b46041c8ce6db790bc"}, - {file = "pillow-12.1.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2e0c664be47252947d870ac0d327fea7e63985a08794758aa8af5b6cb6ec0c9c"}, - {file = "pillow-12.1.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:691ab2ac363b8217f7d31b3497108fb1f50faab2f75dfb03284ec2f217e87bf8"}, - {file = "pillow-12.1.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e9e8064fb1cc019296958595f6db671fba95209e3ceb0c4734c9baf97de04b20"}, - {file = "pillow-12.1.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:472a8d7ded663e6162dafdf20015c486a7009483ca671cece7a9279b512fcb13"}, - {file = "pillow-12.1.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:89b54027a766529136a06cfebeecb3a04900397a3590fd252160b888479517bf"}, - {file = "pillow-12.1.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:86172b0831b82ce4f7877f280055892b31179e1576aa00d0df3bb1bbf8c3e524"}, - {file = "pillow-12.1.1-cp313-cp313t-win32.whl", hash = "sha256:44ce27545b6efcf0fdbdceb31c9a5bdea9333e664cda58a7e674bb74608b3986"}, - {file = "pillow-12.1.1-cp313-cp313t-win_amd64.whl", hash = "sha256:a285e3eb7a5a45a2ff504e31f4a8d1b12ef62e84e5411c6804a42197c1cf586c"}, - {file = "pillow-12.1.1-cp313-cp313t-win_arm64.whl", hash = "sha256:cc7d296b5ea4d29e6570dabeaed58d31c3fea35a633a69679fb03d7664f43fb3"}, - {file = "pillow-12.1.1-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:417423db963cb4be8bac3fc1204fe61610f6abeed1580a7a2cbb2fbda20f12af"}, - {file = "pillow-12.1.1-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:b957b71c6b2387610f556a7eb0828afbe40b4a98036fc0d2acfa5a44a0c2036f"}, - {file = "pillow-12.1.1-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:097690ba1f2efdeb165a20469d59d8bb03c55fb6621eb2041a060ae8ea3e9642"}, - {file = "pillow-12.1.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:2815a87ab27848db0321fb78c7f0b2c8649dee134b7f2b80c6a45c6831d75ccd"}, - {file = "pillow-12.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f7ed2c6543bad5a7d5530eb9e78c53132f93dfa44a28492db88b41cdab885202"}, - {file = "pillow-12.1.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:652a2c9ccfb556235b2b501a3a7cf3742148cd22e04b5625c5fe057ea3e3191f"}, - {file = "pillow-12.1.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d6e4571eedf43af33d0fc233a382a76e849badbccdf1ac438841308652a08e1f"}, - {file = "pillow-12.1.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b574c51cf7d5d62e9be37ba446224b59a2da26dc4c1bb2ecbe936a4fb1a7cb7f"}, - {file = "pillow-12.1.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a37691702ed687799de29a518d63d4682d9016932db66d4e90c345831b02fb4e"}, - {file = "pillow-12.1.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f95c00d5d6700b2b890479664a06e754974848afaae5e21beb4d83c106923fd0"}, - {file = "pillow-12.1.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:559b38da23606e68681337ad74622c4dbba02254fc9cb4488a305dd5975c7eeb"}, - {file = "pillow-12.1.1-cp314-cp314-win32.whl", hash = "sha256:03edcc34d688572014ff223c125a3f77fb08091e4607e7745002fc214070b35f"}, - {file = "pillow-12.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:50480dcd74fa63b8e78235957d302d98d98d82ccbfac4c7e12108ba9ecbdba15"}, - {file = "pillow-12.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:5cb1785d97b0c3d1d1a16bc1d710c4a0049daefc4935f3a8f31f827f4d3d2e7f"}, - {file = "pillow-12.1.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:1f90cff8aa76835cba5769f0b3121a22bd4eb9e6884cfe338216e557a9a548b8"}, - {file = "pillow-12.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1f1be78ce9466a7ee64bfda57bdba0f7cc499d9794d518b854816c41bf0aa4e9"}, - {file = "pillow-12.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:42fc1f4677106188ad9a55562bbade416f8b55456f522430fadab3cef7cd4e60"}, - {file = "pillow-12.1.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:98edb152429ab62a1818039744d8fbb3ccab98a7c29fc3d5fcef158f3f1f68b7"}, - {file = "pillow-12.1.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d470ab1178551dd17fdba0fef463359c41aaa613cdcd7ff8373f54be629f9f8f"}, - {file = "pillow-12.1.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6408a7b064595afcab0a49393a413732a35788f2a5092fdc6266952ed67de586"}, - {file = "pillow-12.1.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5d8c41325b382c07799a3682c1c258469ea2ff97103c53717b7893862d0c98ce"}, - {file = "pillow-12.1.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c7697918b5be27424e9ce568193efd13d925c4481dd364e43f5dff72d33e10f8"}, - {file = "pillow-12.1.1-cp314-cp314t-win32.whl", hash = "sha256:d2912fd8114fc5545aa3a4b5576512f64c55a03f3ebcca4c10194d593d43ea36"}, - {file = "pillow-12.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:4ceb838d4bd9dab43e06c363cab2eebf63846d6a4aeaea283bbdfd8f1a8ed58b"}, - {file = "pillow-12.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:7b03048319bfc6170e93bd60728a1af51d3dd7704935feb228c4d4faab35d334"}, - {file = "pillow-12.1.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:600fd103672b925fe62ed08e0d874ea34d692474df6f4bf7ebe148b30f89f39f"}, - {file = "pillow-12.1.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:665e1b916b043cef294bc54d47bf02d87e13f769bc4bc5fa225a24b3a6c5aca9"}, - {file = "pillow-12.1.1-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:495c302af3aad1ca67420ddd5c7bd480c8867ad173528767d906428057a11f0e"}, - {file = "pillow-12.1.1-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8fd420ef0c52c88b5a035a0886f367748c72147b2b8f384c9d12656678dfdfa9"}, - {file = "pillow-12.1.1-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f975aa7ef9684ce7e2c18a3aa8f8e2106ce1e46b94ab713d156b2898811651d3"}, - {file = "pillow-12.1.1-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8089c852a56c2966cf18835db62d9b34fef7ba74c726ad943928d494fa7f4735"}, - {file = "pillow-12.1.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:cb9bb857b2d057c6dfc72ac5f3b44836924ba15721882ef103cecb40d002d80e"}, - {file = "pillow-12.1.1.tar.gz", hash = "sha256:9ad8fa5937ab05218e2b6a4cff30295ad35afd2f83ac592e68c0d871bb0fdbc4"}, + {file = "pillow-12.2.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:a4e8f36e677d3336f35089648c8955c51c6d386a13cf6ee9c189c5f5bd713a9f"}, + {file = "pillow-12.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e589959f10d9824d39b350472b92f0ce3b443c0a3442ebf41c40cb8361c5b97"}, + {file = "pillow-12.2.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a52edc8bfff4429aaabdf4d9ee0daadbbf8562364f940937b941f87a4290f5ff"}, + {file = "pillow-12.2.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:975385f4776fafde056abb318f612ef6285b10a1f12b8570f3647ad0d74b48ec"}, + {file = "pillow-12.2.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bd9c0c7a0c681a347b3194c500cb1e6ca9cab053ea4d82a5cf45b6b754560136"}, + {file = "pillow-12.2.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:88d387ff40b3ff7c274947ed3125dedf5262ec6919d83946753b5f3d7c67ea4c"}, + {file = "pillow-12.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:51c4167c34b0d8ba05b547a3bb23578d0ba17b80a5593f93bd8ecb123dd336a3"}, + {file = "pillow-12.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:34c0d99ecccea270c04882cb3b86e7b57296079c9a4aff88cb3b33563d95afaa"}, + {file = "pillow-12.2.0-cp310-cp310-win32.whl", hash = "sha256:b85f66ae9eb53e860a873b858b789217ba505e5e405a24b85c0464822fe88032"}, + {file = "pillow-12.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:673aa32138f3e7531ccdbca7b3901dba9b70940a19ccecc6a37c77d5fdeb05b5"}, + {file = "pillow-12.2.0-cp310-cp310-win_arm64.whl", hash = "sha256:3e080565d8d7c671db5802eedfb438e5565ffa40115216eabb8cd52d0ecce024"}, + {file = "pillow-12.2.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:8be29e59487a79f173507c30ddf57e733a357f67881430449bb32614075a40ab"}, + {file = "pillow-12.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:71cde9a1e1551df7d34a25462fc60325e8a11a82cc2e2f54578e5e9a1e153d65"}, + {file = "pillow-12.2.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f490f9368b6fc026f021db16d7ec2fbf7d89e2edb42e8ec09d2c60505f5729c7"}, + {file = "pillow-12.2.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8bd7903a5f2a4545f6fd5935c90058b89d30045568985a71c79f5fd6edf9b91e"}, + {file = "pillow-12.2.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3997232e10d2920a68d25191392e3a4487d8183039e1c74c2297f00ed1c50705"}, + {file = "pillow-12.2.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e74473c875d78b8e9d5da2a70f7099549f9eb37ded4e2f6a463e60125bccd176"}, + {file = "pillow-12.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:56a3f9c60a13133a98ecff6197af34d7824de9b7b38c3654861a725c970c197b"}, + {file = "pillow-12.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:90e6f81de50ad6b534cab6e5aef77ff6e37722b2f5d908686f4a5c9eba17a909"}, + {file = "pillow-12.2.0-cp311-cp311-win32.whl", hash = "sha256:8c984051042858021a54926eb597d6ee3012393ce9c181814115df4c60b9a808"}, + {file = "pillow-12.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6e6b2a0c538fc200b38ff9eb6628228b77908c319a005815f2dde585a0664b60"}, + {file = "pillow-12.2.0-cp311-cp311-win_arm64.whl", hash = "sha256:9a8a34cc89c67a65ea7437ce257cea81a9dad65b29805f3ecee8c8fe8ff25ffe"}, + {file = "pillow-12.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2d192a155bbcec180f8564f693e6fd9bccff5a7af9b32e2e4bf8c9c69dbad6b5"}, + {file = "pillow-12.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f3f40b3c5a968281fd507d519e444c35f0ff171237f4fdde090dd60699458421"}, + {file = "pillow-12.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:03e7e372d5240cc23e9f07deca4d775c0817bffc641b01e9c3af208dbd300987"}, + {file = "pillow-12.2.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b86024e52a1b269467a802258c25521e6d742349d760728092e1bc2d135b4d76"}, + {file = "pillow-12.2.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7371b48c4fa448d20d2714c9a1f775a81155050d383333e0a6c15b1123dda005"}, + {file = "pillow-12.2.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:62f5409336adb0663b7caa0da5c7d9e7bdbaae9ce761d34669420c2a801b2780"}, + {file = "pillow-12.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:01afa7cf67f74f09523699b4e88c73fb55c13346d212a59a2db1f86b0a63e8c5"}, + {file = "pillow-12.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc3d34d4a8fbec3e88a79b92e5465e0f9b842b628675850d860b8bd300b159f5"}, + {file = "pillow-12.2.0-cp312-cp312-win32.whl", hash = "sha256:58f62cc0f00fd29e64b29f4fd923ffdb3859c9f9e6105bfc37ba1d08994e8940"}, + {file = "pillow-12.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:7f84204dee22a783350679a0333981df803dac21a0190d706a50475e361c93f5"}, + {file = "pillow-12.2.0-cp312-cp312-win_arm64.whl", hash = "sha256:af73337013e0b3b46f175e79492d96845b16126ddf79c438d7ea7ff27783a414"}, + {file = "pillow-12.2.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:8297651f5b5679c19968abefd6bb84d95fe30ef712eb1b2d9b2d31ca61267f4c"}, + {file = "pillow-12.2.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:50d8520da2a6ce0af445fa6d648c4273c3eeefbc32d7ce049f22e8b5c3daecc2"}, + {file = "pillow-12.2.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:766cef22385fa1091258ad7e6216792b156dc16d8d3fa607e7545b2b72061f1c"}, + {file = "pillow-12.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5d2fd0fa6b5d9d1de415060363433f28da8b1526c1c129020435e186794b3795"}, + {file = "pillow-12.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56b25336f502b6ed02e889f4ece894a72612fe885889a6e8c4c80239ff6e5f5f"}, + {file = "pillow-12.2.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f1c943e96e85df3d3478f7b691f229887e143f81fedab9b20205349ab04d73ed"}, + {file = "pillow-12.2.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:03f6fab9219220f041c74aeaa2939ff0062bd5c364ba9ce037197f4c6d498cd9"}, + {file = "pillow-12.2.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5cdfebd752ec52bf5bb4e35d9c64b40826bc5b40a13df7c3cda20a2c03a0f5ed"}, + {file = "pillow-12.2.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eedf4b74eda2b5a4b2b2fb4c006d6295df3bf29e459e198c90ea48e130dc75c3"}, + {file = "pillow-12.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:00a2865911330191c0b818c59103b58a5e697cae67042366970a6b6f1b20b7f9"}, + {file = "pillow-12.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1e1757442ed87f4912397c6d35a0db6a7b52592156014706f17658ff58bbf795"}, + {file = "pillow-12.2.0-cp313-cp313-win32.whl", hash = "sha256:144748b3af2d1b358d41286056d0003f47cb339b8c43a9ea42f5fea4d8c66b6e"}, + {file = "pillow-12.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:390ede346628ccc626e5730107cde16c42d3836b89662a115a921f28440e6a3b"}, + {file = "pillow-12.2.0-cp313-cp313-win_arm64.whl", hash = "sha256:8023abc91fba39036dbce14a7d6535632f99c0b857807cbbbf21ecc9f4717f06"}, + {file = "pillow-12.2.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:042db20a421b9bafecc4b84a8b6e444686bd9d836c7fd24542db3e7df7baad9b"}, + {file = "pillow-12.2.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:dd025009355c926a84a612fecf58bb315a3f6814b17ead51a8e48d3823d9087f"}, + {file = "pillow-12.2.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:88ddbc66737e277852913bd1e07c150cc7bb124539f94c4e2df5344494e0a612"}, + {file = "pillow-12.2.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d362d1878f00c142b7e1a16e6e5e780f02be8195123f164edf7eddd911eefe7c"}, + {file = "pillow-12.2.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2c727a6d53cb0018aadd8018c2b938376af27914a68a492f59dfcaca650d5eea"}, + {file = "pillow-12.2.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:efd8c21c98c5cc60653bcb311bef2ce0401642b7ce9d09e03a7da87c878289d4"}, + {file = "pillow-12.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9f08483a632889536b8139663db60f6724bfcb443c96f1b18855860d7d5c0fd4"}, + {file = "pillow-12.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dac8d77255a37e81a2efcbd1fc05f1c15ee82200e6c240d7e127e25e365c39ea"}, + {file = "pillow-12.2.0-cp313-cp313t-win32.whl", hash = "sha256:ee3120ae9dff32f121610bb08e4313be87e03efeadfc6c0d18f89127e24d0c24"}, + {file = "pillow-12.2.0-cp313-cp313t-win_amd64.whl", hash = "sha256:325ca0528c6788d2a6c3d40e3568639398137346c3d6e66bb61db96b96511c98"}, + {file = "pillow-12.2.0-cp313-cp313t-win_arm64.whl", hash = "sha256:2e5a76d03a6c6dcef67edabda7a52494afa4035021a79c8558e14af25313d453"}, + {file = "pillow-12.2.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:3adc9215e8be0448ed6e814966ecf3d9952f0ea40eb14e89a102b87f450660d8"}, + {file = "pillow-12.2.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:6a9adfc6d24b10f89588096364cc726174118c62130c817c2837c60cf08a392b"}, + {file = "pillow-12.2.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:6a6e67ea2e6feda684ed370f9a1c52e7a243631c025ba42149a2cc5934dec295"}, + {file = "pillow-12.2.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:2bb4a8d594eacdfc59d9e5ad972aa8afdd48d584ffd5f13a937a664c3e7db0ed"}, + {file = "pillow-12.2.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:80b2da48193b2f33ed0c32c38140f9d3186583ce7d516526d462645fd98660ae"}, + {file = "pillow-12.2.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22db17c68434de69d8ecfc2fe821569195c0c373b25cccb9cbdacf2c6e53c601"}, + {file = "pillow-12.2.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7b14cc0106cd9aecda615dd6903840a058b4700fcb817687d0ee4fc8b6e389be"}, + {file = "pillow-12.2.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cbeb542b2ebc6fcdacabf8aca8c1a97c9b3ad3927d46b8723f9d4f033288a0f"}, + {file = "pillow-12.2.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4bfd07bc812fbd20395212969e41931001fd59eb55a60658b0e5710872e95286"}, + {file = "pillow-12.2.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9aba9a17b623ef750a4d11b742cbafffeb48a869821252b30ee21b5e91392c50"}, + {file = "pillow-12.2.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:deede7c263feb25dba4e82ea23058a235dcc2fe1f6021025dc71f2b618e26104"}, + {file = "pillow-12.2.0-cp314-cp314-win32.whl", hash = "sha256:632ff19b2778e43162304d50da0181ce24ac5bb8180122cbe1bf4673428328c7"}, + {file = "pillow-12.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:4e6c62e9d237e9b65fac06857d511e90d8461a32adcc1b9065ea0c0fa3a28150"}, + {file = "pillow-12.2.0-cp314-cp314-win_arm64.whl", hash = "sha256:b1c1fbd8a5a1af3412a0810d060a78b5136ec0836c8a4ef9aa11807f2a22f4e1"}, + {file = "pillow-12.2.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:57850958fe9c751670e49b2cecf6294acc99e562531f4bd317fa5ddee2068463"}, + {file = "pillow-12.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d5d38f1411c0ed9f97bcb49b7bd59b6b7c314e0e27420e34d99d844b9ce3b6f3"}, + {file = "pillow-12.2.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5c0a9f29ca8e79f09de89293f82fc9b0270bb4af1d58bc98f540cc4aedf03166"}, + {file = "pillow-12.2.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1610dd6c61621ae1cf811bef44d77e149ce3f7b95afe66a4512f8c59f25d9ebe"}, + {file = "pillow-12.2.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a34329707af4f73cf1782a36cd2289c0368880654a2c11f027bcee9052d35dd"}, + {file = "pillow-12.2.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8e9c4f5b3c546fa3458a29ab22646c1c6c787ea8f5ef51300e5a60300736905e"}, + {file = "pillow-12.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:fb043ee2f06b41473269765c2feae53fc2e2fbf96e5e22ca94fb5ad677856f06"}, + {file = "pillow-12.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f278f034eb75b4e8a13a54a876cc4a5ab39173d2cdd93a638e1b467fc545ac43"}, + {file = "pillow-12.2.0-cp314-cp314t-win32.whl", hash = "sha256:6bb77b2dcb06b20f9f4b4a8454caa581cd4dd0643a08bacf821216a16d9c8354"}, + {file = "pillow-12.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:6562ace0d3fb5f20ed7290f1f929cae41b25ae29528f2af1722966a0a02e2aa1"}, + {file = "pillow-12.2.0-cp314-cp314t-win_arm64.whl", hash = "sha256:aa88ccfe4e32d362816319ed727a004423aab09c5cea43c01a4b435643fa34eb"}, + {file = "pillow-12.2.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0538bd5e05efec03ae613fd89c4ce0368ecd2ba239cc25b9f9be7ed426b0af1f"}, + {file = "pillow-12.2.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:394167b21da716608eac917c60aa9b969421b5dcbbe02ae7f013e7b85811c69d"}, + {file = "pillow-12.2.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5d04bfa02cc2d23b497d1e90a0f927070043f6cbf303e738300532379a4b4e0f"}, + {file = "pillow-12.2.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0c838a5125cee37e68edec915651521191cef1e6aa336b855f495766e77a366e"}, + {file = "pillow-12.2.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a6c9fa44005fa37a91ebfc95d081e8079757d2e904b27103f4f5fa6f0bf78c0"}, + {file = "pillow-12.2.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:25373b66e0dd5905ed63fa3cae13c82fbddf3079f2c8bf15c6fb6a35586324c1"}, + {file = "pillow-12.2.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:bfa9c230d2fe991bed5318a5f119bd6780cda2915cca595393649fc118ab895e"}, + {file = "pillow-12.2.0.tar.gz", hash = "sha256:a830b1a40919539d07806aa58e1b114df53ddd43213d9c8b75847eee6c0182b5"}, ] [package.extras] @@ -1680,20 +1680,20 @@ diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pytest" -version = "8.4.2" +version = "9.0.3" description = "pytest: simple powerful testing with Python" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" groups = ["main"] files = [ - {file = "pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79"}, - {file = "pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01"}, + {file = "pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9"}, + {file = "pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c"}, ] [package.dependencies] colorama = {version = ">=0.4", markers = "sys_platform == \"win32\""} -iniconfig = ">=1" -packaging = ">=20" +iniconfig = ">=1.0.1" +packaging = ">=22" pluggy = ">=1.5,<2" pygments = ">=2.7.2" @@ -2555,4 +2555,4 @@ scikit-learn = ["scikit-learn"] [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.13" -content-hash = "6697ef1fac9e13c0441b975d31d062222945bb8a35ce63d4d4d14de76951dbb5" +content-hash = "f6aa543516ad128abc176516a873e509c5ddfc3121c2156e8caf03d9ab683ee4" diff --git a/pyproject.toml b/pyproject.toml index 4a46bdb7..9958fe65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ homepage = "https://github.com/KnowledgeCaptureAndDiscovery/somef" contractions = "^0.1.73" chardet = "^5.2.0" imbalanced-learn = "^0.12.0" - pytest = "^8.0.0" + pytest = "^9.0.0" morph-kgc = "^2.7.0" bibtexparser = "^1.4.1" nbformat = "^5.9.2" diff --git a/src/somef/extract_software_type.py b/src/somef/extract_software_type.py index 1d21f5bd..a9ad9600 100644 --- a/src/somef/extract_software_type.py +++ b/src/somef/extract_software_type.py @@ -16,7 +16,7 @@ def check_repository_type(path_repo, title, metadata_result: Result): output depending on the software type or if the repository is not considered software""" if check_static_websites(path_repo, metadata_result): - metadata_result.add_result(constants.CAT_TYPE, + metadata_result.add_result(constants.CAT_APPLICATION_TYPE, { constants.PROP_VALUE: 'static-website', constants.PROP_TYPE: constants.STRING @@ -24,7 +24,7 @@ def check_repository_type(path_repo, title, metadata_result: Result): 1, constants.TECHNIQUE_HEURISTICS) elif check_ontologies(path_repo): - metadata_result.add_result(constants.CAT_TYPE, + metadata_result.add_result(constants.CAT_APPLICATION_TYPE, { constants.PROP_VALUE: 'ontology', constants.PROP_TYPE: constants.STRING @@ -32,7 +32,7 @@ def check_repository_type(path_repo, title, metadata_result: Result): 1, constants.TECHNIQUE_HEURISTICS) elif check_notebooks(path_repo): - metadata_result.add_result(constants.CAT_TYPE, + metadata_result.add_result(constants.CAT_APPLICATION_TYPE, { constants.PROP_VALUE: 'notebook-application', constants.PROP_TYPE: constants.STRING @@ -51,7 +51,7 @@ def check_repository_type(path_repo, title, metadata_result: Result): elif check_command_line(path_repo): """The 0.82 confidence result is from running the analysis on 300 repos and showing the precision of the heuristic""" - metadata_result.add_result(constants.CAT_TYPE, + metadata_result.add_result(constants.CAT_APPLICATION_TYPE, { constants.PROP_VALUE: 'commandline-application', constants.PROP_TYPE: constants.STRING @@ -60,7 +60,7 @@ def check_repository_type(path_repo, title, metadata_result: Result): constants.TECHNIQUE_HEURISTICS) elif check_extras(path_repo): - metadata_result.add_result(constants.CAT_TYPE, + metadata_result.add_result(constants.CAT_APPLICATION_TYPE, { constants.PROP_VALUE: 'non-software', constants.PROP_TYPE: constants.STRING diff --git a/src/somef/parser/bower_parser.py b/src/somef/parser/bower_parser.py index 85d6f0e3..6ba4046f 100644 --- a/src/somef/parser/bower_parser.py +++ b/src/somef/parser/bower_parser.py @@ -138,7 +138,7 @@ def parse_bower_json_file(file_path, metadata_result: Result, source): "value": req, "name": name, "version": version, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": constants.DEPENDENCY_TYPE_RUNTIME, "dependency_resolver": "bower" }, @@ -157,7 +157,7 @@ def parse_bower_json_file(file_path, metadata_result: Result, source): "value": req, "name": name, "version": version, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": constants.DEPENDENCY_TYPE_RUNTIME, "dependency_resolver": "bower" }, diff --git a/src/somef/parser/cabal_parser.py b/src/somef/parser/cabal_parser.py index 5941a594..cbdfe2cd 100644 --- a/src/somef/parser/cabal_parser.py +++ b/src/somef/parser/cabal_parser.py @@ -207,7 +207,7 @@ def parse_cabal_file(file_path, metadata_result: Result, source): "value": req, "name": name, "version": version_constraint, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": constants.DEPENDENCY_TYPE_RUNTIME, "dependency_resolver": "cabal" }, diff --git a/src/somef/parser/codemeta_parser.py b/src/somef/parser/codemeta_parser.py index 11d33189..d3119409 100644 --- a/src/somef/parser/codemeta_parser.py +++ b/src/somef/parser/codemeta_parser.py @@ -731,7 +731,7 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source): # "version": requirement.get("version"), **({"name": requirement["name"]} if "name" in requirement else {}), **({"version": requirement["version"]} if "version" in requirement else {}), - "type": constants.SOFTWARE_APPLICATION + "type": constants.SOFTWARE_DEPENDENCY }, 1, constants.TECHNIQUE_CODE_CONFIG_PARSER, diff --git a/src/somef/parser/composer_parser.py b/src/somef/parser/composer_parser.py index 9908abce..32281155 100644 --- a/src/somef/parser/composer_parser.py +++ b/src/somef/parser/composer_parser.py @@ -169,7 +169,7 @@ def parse_composer_json(file_path, metadata_result: Result, source): "value": req, "name": name, "version": version, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": dep_type, "dependency_resolver": "composer" }, diff --git a/src/somef/parser/conda_environment_parser.py b/src/somef/parser/conda_environment_parser.py index b1c25e38..adbdcac2 100644 --- a/src/somef/parser/conda_environment_parser.py +++ b/src/somef/parser/conda_environment_parser.py @@ -49,7 +49,7 @@ def parse_conda_environment_file(file_path, metadata_result: Result, source): dep_dict = { constants.PROP_VALUE: dep, constants.PROP_NAME: re.split(r"[=<>!]", dep)[0], - constants.PROP_TYPE: constants.SOFTWARE_APPLICATION, + constants.PROP_TYPE: constants.SOFTWARE_DEPENDENCY, constants.PROP_DEPENDENCY_TYPE: constants.DEPENDENCY_TYPE_RUNTIME, constants.PROP_DEPENDENCY_RESOLVER: "conda" } @@ -71,7 +71,7 @@ def parse_conda_environment_file(file_path, metadata_result: Result, source): dep_dict = { constants.PROP_VALUE: dep, constants.PROP_NAME: re.split(r"[=<>!~]", dep)[0], - constants.PROP_TYPE: constants.SOFTWARE_APPLICATION, + constants.PROP_TYPE: constants.SOFTWARE_DEPENDENCY, constants.PROP_DEPENDENCY_TYPE: constants.DEPENDENCY_TYPE_RUNTIME, constants.PROP_DEPENDENCY_RESOLVER: "pip" } diff --git a/src/somef/parser/description_parser.py b/src/somef/parser/description_parser.py index 6ec783b4..23854e0a 100644 --- a/src/somef/parser/description_parser.py +++ b/src/somef/parser/description_parser.py @@ -211,7 +211,7 @@ def parse_description_file(file_path, metadata_result: Result, source): "value": req, "name": name, "version": version, - "type": constants.SOFTWARE_APPLICATION + "type": constants.SOFTWARE_DEPENDENCY }, 1, constants.TECHNIQUE_CODE_CONFIG_PARSER, diff --git a/src/somef/parser/gemspec_parser.py b/src/somef/parser/gemspec_parser.py index fa213bbc..2adffaa3 100644 --- a/src/somef/parser/gemspec_parser.py +++ b/src/somef/parser/gemspec_parser.py @@ -168,7 +168,7 @@ def parse_gemspec_file(file_path, metadata_result: Result, source): constants.CAT_REQUIREMENTS, { "value": dependencies, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY }, 1, constants.TECHNIQUE_CODE_CONFIG_PARSER, @@ -188,7 +188,7 @@ def parse_gemspec_file(file_path, metadata_result: Result, source): "value": req, "name": name, "version": version, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": constants.DEPENDENCY_TYPE_RUNTIME, "dependency_resolver": "bundler" }, @@ -209,7 +209,7 @@ def parse_gemspec_file(file_path, metadata_result: Result, source): "value": req, "name": name, "version": version, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": constants.DEPENDENCY_TYPE_DEVELOPMENT, "dependency_resolver": "bundler" }, diff --git a/src/somef/parser/package_json_parser.py b/src/somef/parser/package_json_parser.py index d13ea1b1..1de2f735 100644 --- a/src/somef/parser/package_json_parser.py +++ b/src/somef/parser/package_json_parser.py @@ -179,7 +179,7 @@ def parse_package_json_file(file_path, metadata_result: Result, source): "value": req, "name": name, "version": version, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": dep_type, "dependency_resolver": "npm" }, diff --git a/src/somef/parser/pom_xml_parser.py b/src/somef/parser/pom_xml_parser.py index 68fd2603..9696bbd9 100644 --- a/src/somef/parser/pom_xml_parser.py +++ b/src/somef/parser/pom_xml_parser.py @@ -174,7 +174,7 @@ def parse_pom_file(file_path, metadata_result: Result, source): "value": f'{dependency.get("groupId", "")}.{dependency.get("artifactId", "")}'.strip("."), "name": name_d, "version": version_d, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": dep_type, "dependency_resolver": "maven" }, diff --git a/src/somef/parser/publiccode_parser.py b/src/somef/parser/publiccode_parser.py index dc028cf4..169d6438 100644 --- a/src/somef/parser/publiccode_parser.py +++ b/src/somef/parser/publiccode_parser.py @@ -231,7 +231,7 @@ def parse_publiccode_file(file_path, metadata_result: Result, source): "value": f"{name}{version_str}" if version_str else name, "name": name, "version": version, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": constants.DEPENDENCY_TYPE_RUNTIME, "dependency_resolver": "publiccode" }, diff --git a/src/somef/parser/python_parser.py b/src/somef/parser/python_parser.py index 7c062da8..0cefae2e 100644 --- a/src/somef/parser/python_parser.py +++ b/src/somef/parser/python_parser.py @@ -97,7 +97,7 @@ def parse_requirements_txt(file_path, metadata_result: Result, source): req = { "value": line, "name": name, - "type": constants.SOFTWARE_APPLICATION + "type": constants.SOFTWARE_DEPENDENCY } if version: req['version'] = version diff --git a/src/somef/parser/toml_parser.py b/src/somef/parser/toml_parser.py index 04f3e5ec..64ad254d 100644 --- a/src/somef/parser/toml_parser.py +++ b/src/somef/parser/toml_parser.py @@ -340,9 +340,10 @@ def parse_cargo_metadata(data, metadata_result, source, file_path): "value": req, "name": name, "version": version, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": dep_type, - "dependency_resolver": "cargo" }, + "dependency_resolver": "cargo" + }, 1, constants.TECHNIQUE_CODE_CONFIG_PARSER, source @@ -362,7 +363,7 @@ def parse_cargo_metadata(data, metadata_result, source, file_path): "value": req, "name": name, "version": version, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": dep_type, "dependency_resolver": "cargo" }, @@ -405,7 +406,7 @@ def parse_pyproject_metadata(data, metadata_result, source, file_path): "value": req, "name": name, "version": version, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type":constants.DEPENDENCY_TYPE_RUNTIME, "dependency_resolver": "python" }, @@ -422,7 +423,7 @@ def parse_pyproject_metadata(data, metadata_result, source, file_path): "value": req, "name": name, "version": version, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": constants.DEPENDENCY_TYPE_RUNTIME, "dependency_resolver": "python" }, @@ -444,7 +445,7 @@ def parse_pyproject_metadata(data, metadata_result, source, file_path): "value": req, "name": name, "version": version, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": constants.DEPENDENCY_TYPE_RUNTIME, "dependency_resolver": "python" }, @@ -573,7 +574,7 @@ def parse_julia_project_metadata(data, metadata_result, source): { "value": req, "name": req, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": constants.DEPENDENCY_TYPE_RUNTIME, "dependency_resolver": "julia" }, @@ -598,7 +599,7 @@ def parse_julia_project_metadata(data, metadata_result, source): { "value": req, "name": req, - "type": constants.SOFTWARE_APPLICATION, + "type": constants.SOFTWARE_DEPENDENCY, "dependency_type": constants.DEPENDENCY_TYPE_DEVELOPMENT, "dependency_resolver": "julia" }, diff --git a/src/somef/process_files.py b/src/somef/process_files.py index e467f016..06a4c566 100644 --- a/src/somef/process_files.py +++ b/src/somef/process_files.py @@ -733,7 +733,6 @@ def parse_cff_root(yaml_content, metadata_result, url): if identifiers: result[constants.PROP_IDENTIFIER] = identifiers - result[constants.PROP_PREFERRED_CITATION] = "False" result[constants.PROP_FORMAT] = "cff" return clean_nulls(result) diff --git a/src/somef/test/test_JSON_export.py b/src/somef/test/test_JSON_export.py index b379a1cf..eee0dafb 100644 --- a/src/somef/test/test_JSON_export.py +++ b/src/somef/test/test_JSON_export.py @@ -615,8 +615,11 @@ def test_new_properties_citation_issue_935(self): citations = json_content.get(constants.CAT_CITATION, []) + # We omit 'is_preferred_citation: False'. + # we use just the flag is_preferred_citation: True to identify the preferred citation. software_entry = next( - (cit for cit in citations if str(cit["result"].get("is_preferred_citation")) == "False"), + (cit for cit in citations if not cit["result"].get("is_preferred_citation") and + cit["result"].get("type") == "SoftwareApplication"), None ) preferred_entry = next( diff --git a/src/somef/test/test_cli.py b/src/somef/test/test_cli.py index c2b68105..bd291013 100644 --- a/src/somef/test/test_cli.py +++ b/src/somef/test/test_cli.py @@ -961,7 +961,7 @@ def test_categorization(self): data = text_file.read() text_file.close() json_content = json.loads(data) - repo_status = json_content[constants.CAT_TYPE][0] + repo_status = json_content[constants.CAT_APPLICATION_TYPE][0] print(repo_status) repo_type = repo_status[constants.PROP_RESULT][constants.PROP_VALUE] print(repo_type) @@ -990,6 +990,6 @@ def test_redundant_files(self): data = text_file.read() text_file.close() json_content = json.loads(data) - t = json_content[constants.CAT_TYPE][0] + t = json_content[constants.CAT_APPLICATION_TYPE][0] assert t[constants.PROP_RESULT][constants.PROP_VALUE] == "ontology" os.remove(test_data_path + "test-ecfo.json") diff --git a/src/somef/test/test_codemeta_export.py b/src/somef/test/test_codemeta_export.py index b4d91d7a..dcfc63d7 100644 --- a/src/somef/test/test_codemeta_export.py +++ b/src/somef/test/test_codemeta_export.py @@ -611,7 +611,6 @@ def test_issue_886_apache_code(self): json_content = json.loads(data) copyright_holder = json_content[constants.CAT_CODEMETA_COPYRIGHTHOLDER] - print(copyright_holder) copyright_year = json_content[constants.CAT_CODEMETA_COPYRIGHTYEAR] assert copyright_holder == "Daniel Garijo, Information Sciences Institute, USC." @@ -643,7 +642,6 @@ def test_issue_936_contributors(self): json_content = json.loads(data) contributors = json_content[constants.CAT_CODEMETA_CONTRIBUTOR] - print(contributors) self.assertTrue(any( c["name"] == "Abby Cabunoc Mayes" and c.get("givenName") == "Abby Cabunoc" diff --git a/src/somef/test/test_data/expected/Widoco.yaml b/src/somef/test/test_data/expected/Widoco.yaml index a7465b3b..747483bb 100644 --- a/src/somef/test/test_data/expected/Widoco.yaml +++ b/src/somef/test/test_data/expected/Widoco.yaml @@ -23,7 +23,7 @@ CAT_DESCRIPTION: # Passed - "WIDOCO helps you to publish and create an enriched and customized documentation of your ontology, by following a series of steps in a wizard. We extend the LODE framework by Silvio Peroni to describe the classes, properties and data properties of the ontology, the OOPS! webservice by María Poveda to print an evaluation and the Licensius service by Victor Rodriguez Doncel to determine the license URI and title being used. In addition, we use WebVowl to visualize the ontology and have extended Bubastis to show a complete changelog between different versions of your ontology.\n\nFeatures of WIDOCO:\n* Automatic documentation of the terms in your ontology (based on [LODE](http://www.essepuntato.it/lode/)). Now **you can use Markdown on your class descriptions** (see [example](https://dgarijo.github.io/Widoco/doc/gallery/index.html))\n* Massive metadata extraction and support: WIDOCO will enhance your ontology documentation based on your ontology annotations. Now you can add custom logos and images, edit the content of your sections, etc. by just editing metadata. See our [supported metadata](doc/metadataGuide/guide.md) and [recommendations](https://dgarijo.github.io/Widoco/doc/bestPractices/index-en.html) for more information.\n* Automatic annotation in JSON-LD snippets of the html produced.\n* Association of a provenance page which includes the history of your vocabulary (W3C PROV-O compliant).\n* Guidelines on the main sections that your document should have and how to complete them.\n* Integration with diagram creators ([WebVOWL](http://vowl.visualdataweb.org/webvowl/)).\n* Automatic changelog of differences between the actual and the previous version of the ontology (based on [Bubastis](http://www.ebi.ac.uk/efo/bubastis/)).\n* Separation of the sections of your html page so you can write them independently and replace only those needed.\n* Content negotiation and serialization of your ontology according to [W3C best practices](https://www.w3.org/TR/swbp-vocab-pub/)\n* Evaluation reports of your ontology (using the [OOPS! web service](https://oops.linkeddata.es/))\n* Integration with license metadata services ([Licensius](http://licensius.com/)) to automatically describe the license used in your ontology.\n" # Passed CAT_CITATION: # Passed title: "WIDOCO: a wizard for documenting ontologies" - type: Scholarly_article + type: ScholarlyArticle url: http://dgarijo.com/papers/widoco-iswc2017.pdf date_published: "2017" doi: "10.1007/978-3-319-68204-4_9" diff --git a/src/somef/test/test_data/expected/gammapy.yaml b/src/somef/test/test_data/expected/gammapy.yaml index b3953f03..cff9059c 100644 --- a/src/somef/test/test_data/expected/gammapy.yaml +++ b/src/somef/test/test_data/expected/gammapy.yaml @@ -24,7 +24,7 @@ CAT_REQUIREMENTS: # Passed value: numpy>=1.21 name: numpy version: ">=1.21" - type: Software_application + type: SoftwareDependency CAT_AUTHORS: # Passed value: Fabio Acero diff --git a/src/somef/test/test_toml_parser.py b/src/somef/test/test_toml_parser.py index 5cbae680..5de55849 100644 --- a/src/somef/test/test_toml_parser.py +++ b/src/somef/test/test_toml_parser.py @@ -152,7 +152,7 @@ def test_parse_pluto_project_toml(self): self.assertIn("REPL", dep_values) for req in requirements_results: - self.assertEqual(req["result"]["type"], constants.SOFTWARE_APPLICATION) + self.assertEqual(req["result"]["type"], constants.SOFTWARE_DEPENDENCY) self.assertEqual(req["technique"], constants.TECHNIQUE_CODE_CONFIG_PARSER) runtime_results = metadata_result.results.get(constants.CAT_RUNTIME_PLATFORM, []) diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index 5566b717..85461f06 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -172,7 +172,7 @@ CAT_SUPPORT_CHANNELS = "support_channels" CAT_USAGE = "usage" CAT_WORKFLOWS = "workflows" -CAT_TYPE = "type" +CAT_APPLICATION_TYPE = "application_type" # former CAT_TYPE CAT_PACKAGE_ID = "package_id" CAT_HAS_PACKAGE_FILE = "has_package_file" CAT_VERSION = "version" @@ -198,7 +198,7 @@ CAT_OWNER, CAT_PACKAGE_DISTRIBUTION, CAT_HAS_PACKAGE_FILE, CAT_PROGRAMMING_LANGUAGES, CAT_README_URL, CAT_RELATED_DOCUMENTATION, CAT_RELEASES, CAT_RUN, CAT_RUNTIME_PLATFORM, CAT_RELATED_PAPERS, CAT_STATUS, CAT_REQUIREMENTS, CAT_STARS, CAT_SUPPORT, CAT_SUPPORT_CHANNELS, CAT_USAGE, - CAT_WORKFLOWS, CAT_TYPE] + CAT_WORKFLOWS, CAT_APPLICATION_TYPE] # All properties used by SOMEF to label the output JSON # Provenance: @@ -291,10 +291,11 @@ AGENT = "Agent" RELEASE = "Release" LICENSE = "License" -PUBLICATION = "Publication" +# PUBLICATION = "Publication" LANGUAGE = "Programming_language" -SOFTWARE_APPLICATION = "Software_application" -SCHOLARLY_ARTICLE = "Scholarly_article" +SOFTWARE_APPLICATION = "SoftwareApplication" +SOFTWARE_DEPENDENCY = "SoftwareDependency" +SCHOLARLY_ARTICLE = "ScholarlyArticle" # Different techniques TECHNIQUE_SUPERVISED_CLASSIFICATION = "supervised_classification" From 20e1ef0009e7b94c1b1570cf2886d4f6d6851882 Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Wed, 15 Apr 2026 12:11:10 +0200 Subject: [PATCH 12/12] Funding and funder in codemeta export. Fixes #960 --- README.md | 1 + docs/codemetajson.md | 44 +++++++++++++++++-- docs/index.md | 1 + docs/output.md | 2 +- src/somef/export/json_export.py | 10 +++++ src/somef/parser/codemeta_parser.py | 60 +++++++++++++------------- src/somef/test/test_codemeta_export.py | 40 +++++++++++++++++ src/somef/utils/constants.py | 4 ++ 8 files changed, 127 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index cbe23ebc..e79ec4bb 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,7 @@ We aim to recognize the following properties: - **Forks url**: Links to forks made of the project - **Full name**: Name + owner (owner/name) - **Full title**: If the repository is a short name, we will attempt to extract the longer version of the repository name +- **Funding**: Funding information associated with the project. **Note**: Currently, this information is only extracted from existing `codemeta.json` files within the repository. - **Identifier**: Identifier associated with the software (if any), such as Digital Object Identifiers and Software Heritage identifiers (SWH). DOIs associated with publications will also be detected. - **Images**: Images used to illustrate the software component - **Installation instructions**: A set of instructions that indicate how to install a target repository diff --git a/docs/codemetajson.md b/docs/codemetajson.md index 8a702278..aa666bc9 100644 --- a/docs/codemetajson.md +++ b/docs/codemetajson.md @@ -28,8 +28,9 @@ These fields are defined in the [Codemeta specification](https://github.com/code | development_status | development_status[i].result.value | developmentStatus | | download_url | download_url[i].result.value | downloadUrl | | has_package_file | has_package_file[i].result.value | URL of the codemeta.json file | -| funding - funder | funding[i].result.funder | funding.funder or funding.funder.name | -| funding - funding | funding[i].result.funding | String.fundingIdentifier | +| funding - funder | funding[i].result.funder | funder.@id or funder.name *(1)*| +| funding - funding | funding[i].result.funding | funding *(1)*| +| funding - value | funding[i].result.value | funding string or funder.name *(1)*| | identifier | identifier[i].result.value | identifier | | issue_tracker | issue_tracker[i].result.value | issueTracker | | keywords | keywords[i].result.value | keywords | @@ -49,4 +50,41 @@ These fields are defined in the [Codemeta specification](https://github.com/code | version | version[i].result.value | softwareVersion or version | - \ No newline at end of file +--- + +*(1)* + +- SOMEF json result: + +``` +"funding": [ + { + "result": { + "value": "1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", + "type": "String", + "funder": { + "@id": "https://doi.org/10.13039/100000001", + "@type": "Organization", + "name": "National Science Foundation" + }, + "funding": "1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software" + }, + "confidence": 1, + "technique": "code_parser", + "source": "https://raw.githubusercontent.com/.../codemeta.json" + } +] +``` + +- CODEMETA output: +``` +"funder": { + "@id": "https://doi.org/10.13039/100000001", + "@type": "Organization", + "name": "National Science Foundation" + }, +"funding": "1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", +``` + + + diff --git a/docs/index.md b/docs/index.md index 07bf45ae..74db03ab 100644 --- a/docs/index.md +++ b/docs/index.md @@ -62,6 +62,7 @@ We aim to recognize the following properties: - **Forks url**: Links to forks made of the project - **Full name**: Name + owner (owner/name) - **Full title**: If the repository is a short name, we will attempt to extract the longer version of the repository name +- **Funding**: Funding information associated with the project. **Note**: Currently, this information is only extracted from existing `codemeta.json` files within the repository. - **Homepage**: URL of the item. - **Identifier**: Identifier associated with the software (if any), such as Digital Object Identifiers and Software Heritage identifiers (SWH). DOIs associated with publications will also be detected. - **Images**: Images used to illustrate the software component diff --git a/docs/output.md b/docs/output.md index 42d055b4..43e2be94 100644 --- a/docs/output.md +++ b/docs/output.md @@ -89,7 +89,7 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `forks_url`: Links to forks made of the project (GitHub only) - `full_name`: Name + owner (owner/name) (if available) - `full_title`: If the repository has a short name, we will attempt to extract the longer version of the repository name. For example, a repository may be called "Widoco", but the longer title is "Wizard for documenting ontologies". -- `funding`: Funding code for the related project. +- `funding`: Funding code for the related project. Currently, this information is only extracted from existing `codemeta.json` files within the repository. - `has_build_file`: Build file to create a Docker image for the target software - `has_package_file`: Specifies what package file is present in the code repository. - `has_script_file`: Snippets of code contained in the repository. diff --git a/src/somef/export/json_export.py b/src/somef/export/json_export.py index 12206823..0f6faff1 100644 --- a/src/somef/export/json_export.py +++ b/src/somef/export/json_export.py @@ -587,6 +587,16 @@ def format_date(date_string): raw_contributors = repo_data[constants.CAT_CONTRIBUTORS] codemeta_output[constants.CAT_CODEMETA_CONTRIBUTOR] = parse_contributors(raw_contributors) + if constants.CAT_FUNDING in repo_data: + for funding_entry in repo_data[constants.CAT_FUNDING]: + res_fun = funding_entry[constants.PROP_RESULT] + + if constants.PROP_FUNDING in res_fun and res_fun[constants.PROP_FUNDING] != "": + codemeta_output[constants.CAT_CODEMETA_FUNDING] = res_fun[constants.PROP_FUNDING] + + if constants.PROP_FUNDER in res_fun and res_fun[constants.PROP_FUNDER] != "": + codemeta_output[constants.CAT_CODEMETA_FUNDER] = res_fun[constants.PROP_FUNDER] + # A person is expected, and we extract text at the moment if descriptions_text: codemeta_output[constants.CAT_CODEMETA_DESCRIPTION] = descriptions_text diff --git a/src/somef/parser/codemeta_parser.py b/src/somef/parser/codemeta_parser.py index d3119409..c4e9be93 100644 --- a/src/somef/parser/codemeta_parser.py +++ b/src/somef/parser/codemeta_parser.py @@ -531,37 +531,35 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source): source ) - if "funding" in data: - funding_data = data["funding"] - if isinstance(funding_data, list): - for fund in funding_data: - fund_info = parse_funding(fund) - if fund_info: - metadata_result.add_result( - constants.CAT_FUNDING, - { - "funder": fund_info.get("funder", ""), - "funding": fund_info.get("funding", ""), - "type": constants.STRING - }, - 1, - constants.TECHNIQUE_CODE_CONFIG_PARSER, - source - ) - elif isinstance(funding_data, dict): - fund_info = parse_funding(funding_data) - if fund_info: - metadata_result.add_result( - constants.CAT_FUNDING, - { - "funder": fund_info.get("funder", ""), - "funding": fund_info.get("funding", ""), - "type": constants.STRING - }, - 1, - constants.TECHNIQUE_CODE_CONFIG_PARSER, - source - ) + + funder_data = data.get("funder") + funding_data = data.get("funding") + + if funder_data or funding_data: + main_value = funding_data if funding_data else funder_data + + if isinstance(main_value, (list, dict)): + main_value = str(main_value) + + res_fund = { + "value": main_value, + "type": constants.STRING + } + + if funder_data and (not isinstance(funder_data, list) or len(funder_data) > 0): + res_fund[constants.PROP_FUNDER] = funder_data + + if funding_data and (not isinstance(funding_data, list) or len(funding_data) > 0): + res_fund[constants.PROP_FUNDING] = funding_data + + if res_fund.get("value"): + metadata_result.add_result( + constants.CAT_FUNDING, + res_fund, + 1, + constants.TECHNIQUE_CODE_CONFIG_PARSER, + source + ) if "developmentStatus" in data: metadata_result.add_result( diff --git a/src/somef/test/test_codemeta_export.py b/src/somef/test/test_codemeta_export.py index dcfc63d7..30369661 100644 --- a/src/somef/test/test_codemeta_export.py +++ b/src/somef/test/test_codemeta_export.py @@ -666,6 +666,46 @@ def test_issue_936_contributors(self): os.remove(test_data_path + "test_issue_936_contributors.json") + + def test_issue_960_funding(self): + """Checks whether funding and funder information are correctly extracted and exported to CodeMeta""" + output_path = test_data_path + "test_issue_960_funding.json" + + somef_cli.run_cli(threshold=0.8, + ignore_classifiers=False, + repo_url=None, + local_repo=test_data_repositories + "codemeta_repo", + doc_src=None, + in_file=None, + output=None, + graph_out=None, + graph_format="turtle", + codemeta_out=output_path, + pretty=True, + missing=False, + readme_only=False) + + text_file = open(output_path, "r") + data = text_file.read() + text_file.close() + json_content = json.loads(data) + + expected_funding = "1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software" + self.assertEqual(json_content.get("funding"), expected_funding, + f"Expected funding '{expected_funding}' not found in exported CodeMeta") + + funder = json_content.get("funder") + self.assertIsNotNone(funder, "Funder field missing in exported CodeMeta") + + if isinstance(funder, dict): + self.assertEqual(funder.get("name"), "National Science Foundation", "Funder name mismatch") + self.assertEqual(funder.get("@id"), "https://doi.org/10.13039/100000001", "Funder @id mismatch") + else: + self.assertEqual(funder, "National Science Foundation", "Funder name mismatch") + + os.remove(output_path) + + @classmethod def tearDownClass(cls): """delete temp file JSON just if all the test pass""" diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index 85461f06..af200639 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -233,6 +233,8 @@ PROP_EMAIL = "email" PROP_GIVEN_NAME = "given_name" PROP_FAMILY_NAME = "family_name" +PROP_FUNDER = "funder" +PROP_FUNDING = "funding" PROP_HTML_URL = "html_url" PROP_IDENTIFIER = "identifier" PROP_JOURNAL = "journal" @@ -444,6 +446,8 @@ class RepositoryType(Enum): CAT_CODEMETA_DESCRIPTION = "description" CAT_CODEMETA_DEVELOPMENTSTATUS = "developmentStatus" CAT_CODEMETA_DOWNLOADURL = "downloadUrl" +CAT_CODEMETA_FUNDER = "funder" +CAT_CODEMETA_FUNDING = "funding" CAT_CODEMETA_ISSUETRACKER = "issueTracker" CAT_CODEMETA_IDENTIFIER = "identifier" CAT_CODEMETA_KEYWORDS = "keywords"