Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions eu_fact_force/exploration/data_collection/download_hal_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Optional

import requests # need install


def get_response_content(doi: str) -> Optional[ET.Element]:
    """Query the HAL search API for a DOI and parse the XML response.

    The request asks for documents whose ``doiId_s`` field matches ``doi``,
    in XML format (``wt=xml``), returning only the ``uri_s`` field.

    Args:
        doi: The DOI to look up, e.g. ``"10.1038/s41467-019-10626-x"``.

    Returns:
        The root element of the parsed XML response, or ``None`` when the
        request fails, the server answers with an HTTP error status, or the
        body is not well-formed XML. The error is printed in those cases.
    """
    api_url = "https://api.archives-ouvertes.fr/search/"
    # Let requests build the query string: DOIs may contain characters that
    # are reserved in URLs ("&", "#", "+", spaces, ...) and would otherwise
    # truncate or corrupt the query when interpolated verbatim.
    query = {"q": f"doiId_s:{doi}", "wt": "xml", "fl": "uri_s"}
    try:
        response = requests.get(api_url, params=query, timeout=10)
        response.raise_for_status()
        return ET.fromstring(response.content)
    except (requests.RequestException, ET.ParseError) as e:
        print(f"Error while fetching the API : {e}")
        return None


def get_pdf_url(uri: str) -> str:
    """Build the PDF download URL for a document URI.

    Args:
        uri: The document URI (the ``uri_s`` value from the API response).

    Returns:
        ``uri`` with ``/document`` appended. Trailing slashes on ``uri`` are
        dropped first so the result never contains ``//document``.
    """
    return f"{uri.rstrip('/')}/document"


def set_output_file(doi: str, output_dir: str = "pdf") -> str:
    """Build the output file path of the PDF for a DOI.

    The file name is the DOI with every ``.`` and ``/`` replaced by ``_``,
    followed by the ``.pdf`` extension.

    Args:
        doi: The DOI the PDF belongs to.
        output_dir: Directory the PDF should be stored in. An empty string
            yields a path relative to the current directory.

    Returns:
        The path as a string with ``/`` separators.

    Note:
        Review feedback suggested returning a ``Path``, which represents a
        filesystem location better than ``str``. The ``str`` return type is
        kept here so existing callers keep working.
    """
    doi_str = doi.replace(".", "_").replace("/", "_")
    # Joining through Path avoids the pitfalls of raw concatenation: an empty
    # output_dir no longer produces "/<name>.pdf" (filesystem root) and a
    # trailing slash no longer produces "dir//<name>.pdf".
    return (Path(output_dir) / f"{doi_str}.pdf").as_posix()


def download_pdf_from_url(pdf_url: str, output_path: str) -> bool:
    """Download the content at ``pdf_url`` and write it to ``output_path``.

    Args:
        pdf_url: URL to fetch.
        output_path: Destination file path. Missing parent directories are
            created.

    Returns:
        ``True`` when the response was fetched and written to disk, ``False``
        when the request failed, the server answered with an HTTP error
        status, or the file could not be written. The error is printed in
        those cases.
    """
    try:
        pdf_response = requests.get(pdf_url, timeout=30)
        pdf_response.raise_for_status()

        # The default output directory ("pdf") does not exist on a fresh
        # checkout; without this, open() fails with FileNotFoundError.
        destination = Path(output_path)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # NOTE(review): the payload is written as-is and is not checked to
        # actually be a PDF — confirm whether the server can answer with an
        # HTML page for some documents.
        with open(destination, "wb") as f:
            f.write(pdf_response.content)
    except (requests.RequestException, OSError) as e:
        print(f"Download fail: {e}")
        return False
    else:
        print(f" PDF downloaded : {output_path}")
        return True


def get_pdf_from_doi(doi: str) -> bool:
    """Download the article's PDF for a specified DOI.

    Looks the DOI up through the search API, reads the document URI from the
    response, derives the PDF URL from it and downloads the file to the
    default output location for that DOI.

    Args:
        doi: The DOI of the article.

    Returns:
        ``True`` when the PDF was downloaded, ``False`` when the API call
        failed, the DOI has no match, the response carries no URI, or the
        download failed.
    """
    # 1. Get the response from the API and check that the DOI is in HAL library
    response_content = get_response_content(doi)
    if response_content is None:
        return False

    # find() returns None when the element is absent. Compare against None
    # explicitly: an Element without children is falsy.
    result = response_content.find(".//result")
    num_found = 0
    if result is not None:
        try:
            num_found = int(result.get("numFound", "0"))
        except ValueError:
            # A non-numeric count is treated like "nothing found".
            num_found = 0
    if num_found == 0:
        print("No matching data for this DOI.")
        return False

    # 2. Get the URI
    uri_element = response_content.find(".//str[@name='uri_s']")
    uri = ""
    if uri_element is not None and uri_element.text:
        uri = uri_element.text.strip()
    if not uri:
        print("No uri found for this DOI")
        return False

    # 3. Download the PDF from the URI
    pdf_url = get_pdf_url(uri)
    output_path = set_output_file(doi)
    return download_pdf_from_url(pdf_url, output_path)


if __name__ == "__main__":
    # Manual smoke run against a sample DOI.
    # Alternative sample DOI: 10.1016/j.pnpbp.2024.110948
    sample_doi = "10.1038/s41467-019-10626-x"
    get_pdf_from_doi(sample_doi)