diff --git a/Pipfile b/Pipfile index 37e781c7..0dcff622 100644 --- a/Pipfile +++ b/Pipfile @@ -8,6 +8,7 @@ cachetools = "*" # Required by google-api-python-client feedparser = "*" GitPython = "*" google-api-python-client = "*" +lxml = "*" matplotlib = "*" pandas = "*" Pygments = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 4a108dac..72c7e24d 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "f81603324232c06d700268ffd797fca2ded4eab2ad01a01e5a66a2cf54ffe49c" + "sha256": "979957e3045fb86e56b0efba22d807bbacabe855af9da48256b6ef40553b8901" }, "pipfile-spec": 6, "requires": { @@ -484,6 +484,153 @@ "markers": "python_version >= '3.10'", "version": "==1.4.9" }, + "lxml": { + "hashes": [ + "sha256:058027e261afed589eddcfe530fcc6f3402d7fd7e89bfd0532df82ebc1563dba", + "sha256:063eccf89df5b24e361b123e257e437f9e9878f425ee9aae3144c77faf6da6d8", + "sha256:064fdadaf7a21af3ed1dcaa106b854077fbeada827c18f72aec9346847cd65d0", + "sha256:08b9d5e803c2e4725ae9e8559ee880e5328ed61aa0935244e0515d7d9dbec0aa", + "sha256:0a3c150a95fbe5ac91de323aa756219ef9cf7fde5a3f00e2281e30f33fa5fa4f", + "sha256:0aa7070978f893954008ab73bb9e3c24a7c56c054e00566a21b553dc18105fca", + "sha256:13dcecc9946dca97b11b7c40d29fba63b55ab4170d3c0cf8c0c164343b9bfdcf", + "sha256:13e35cbc684aadf05d8711a5d1b5857c92e5e580efa9a0d2be197199c8def607", + "sha256:17f68764f35fd78d7c4cc4ef209a184c38b65440378013d24b8aecd327c3e0c8", + "sha256:1941354d92699fb5ffe6ed7b32f9649e43c2feb4b97205f75866f7d21aa91452", + "sha256:1c06035eafa8404b5cf475bb37a9f6088b0aca288d4ccc9d69389750d5543700", + "sha256:1db01e5cf14345628e0cbe71067204db658e2fb8e51e7f33631f5f4735fefd8d", + "sha256:1e786a464c191ca43b133906c6903a7e4d56bef376b75d97ccbb8ec5cf1f0a4b", + "sha256:1ea99340b3c729beea786f78c38f60f4795622f36e305d9c9be402201efdc3b7", + "sha256:200069a593c5e40b8f6fc0d84d86d970ba43138c3e68619ffa234bc9bb806a4d", + "sha256:2047d8234fe735ab77802ce5f2297e410ff40f5238aec569ad7c8e163d7b19a6", + "sha256:21c73b476d3cfe836be731225ec3421fa2f048d84f6df6a8e70433dff1376d5a", + "sha256:24a8e756c982c001ca8d59e87c80c4d9dcd4d9b44a4cbeb8d9be4482c514d41d", + "sha256:252a22982dca42f6155125ac76d3432e548a7625d56f5a273ee78a5057216eca", + "sha256:2593c77efde7bfea7f6389f1ab249b15ed4aa5bc5cb5131faa3b843c429fbedb", + "sha256:25fcc59afc57d527cfc78a58f40ab4c9b8fd096a9a3f964d2781ffb6eb33f4ed", + "sha256:2613e67de13d619fd283d58bda40bff0ee07739f624ffee8b13b631abf33083d", + "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", + "sha256:2c8458c2cdd29589a8367c09c8f030f1d202be673f0ca224ec18590b3b9fb694", + "sha256:2ca59e7e13e5981175b8b3e4ab84d7da57993eeff53c07764dcebda0d0e64ecd", + "sha256:2cbcbf6d6e924c28f04a43f3b6f6e272312a090f269eff68a2982e13e5d57659", + "sha256:2ed6c667fcbb8c19c6791bbf40b7268ef8ddf5a96940ba9404b9f9a304832f6c", + "sha256:358d9adae670b63e95bc59747c72f4dc97c9ec58881d4627fe0120da0f90d314", + "sha256:370cd78d5855cfbffd57c422851f7d3864e6ae72d0da615fca4dad8c45d375a5", + "sha256:3ae2ce7d6fedfb3414a2b6c5e20b249c4c607f72cb8d2bb7cc9c6ec7c6f4e849", + "sha256:3b1675e096e17c6fe9c0e8c81434f5736c0739ff9ac6123c87c2d452f48fc938", + "sha256:3e3cb08855967a20f553ff32d147e14329b3ae70ced6edc2f282b94afbc74b2a", + "sha256:3efe1b21c7801ffa29a1112fab3b0f643628c30472d507f39544fd48e9549e34", + "sha256:3fee0851639d06276e6b387f1c190eb9d7f06f7f53514e966b26bae46481ec90", + "sha256:4077b7c79f31755df33b795dc12119cb557a0106bfdab0d2c2d97bd3cf3dffa6", + "sha256:414aaa94e974e23a3e92e7ca5b97d10c0cf37b6481f50911032c69eeb3991bba", + 
"sha256:4197fb2534ee05fd3e7afaab5d8bfd6c2e186f65ea7f9cd6a82809c887bd1285", + "sha256:442de7530296ef5e188373a1ea5789a46ce90c4847e597856570439621d9c553", + "sha256:4468e3b83e10e0317a89a33d28f7aeba1caa4d1a6fd457d115dd4ffe90c5931d", + "sha256:452b899faa64f1805943ec1c0c9ebeaece01a1af83e130b69cdefeda180bb42c", + "sha256:45f93e6f75123f88d7f0cfd90f2d05f441b808562bf0bc01070a00f53f5028b5", + "sha256:48461bd21625458dd01e14e2c38dd0aea69addc3c4f960c30d9f59d7f93be601", + "sha256:4ddb1049fa0579d0cbd00503ad8c58b9ab34d1254c77bc6a5576d96ec7853dba", + "sha256:5179c60288204e6ddde3f774a93350177e08876eaf3ab78aa3a3649d43eb7d37", + "sha256:57a86e1ebb4020a38d295c04fc79603c7899e0df71588043eb218722dabc087f", + "sha256:5921d924aa5468c939d95c9814fa9f9b5935a6ff4e679e26aaf2951f74043512", + "sha256:59c45e125140b2c4b33920d21d83681940ca29f0b83f8629ea1a2196dc8cfe6a", + "sha256:5aa0fc67ae19d7a64c3fe725dc9a1bb11f80e01f78289d05c6f62545affec438", + "sha256:5d444858b9f07cefff6455b983aea9a67f7462ba1f6cbe4a21e8bf6791bf2153", + "sha256:60fa43be34f78bebb27812ed90f1925ec99560b0fa1decdb7d12b84d857d31e9", + "sha256:6162a86d86893d63084faaf4ff937b3daea233e3682fb4474db07395794fa80d", + "sha256:61cb10eeb95570153e0c0e554f58df92ecf5109f75eacad4a95baa709e26c3d6", + "sha256:65ac4a01aba353cfa6d5725b95d7aed6356ddc0a3cd734de00124d285b04b64f", + "sha256:65ea18d710fd14e0186c2f973dc60bb52039a275f82d3c44a0e42b43440ea534", + "sha256:6605c604e6daa9e0d7f0a2137bdc47a2e93b59c60a65466353e37f8272f47c46", + "sha256:66328dabea70b5ba7e53d94aa774b733cf66686535f3bc9250a7aab53a91caaf", + "sha256:6c8963287d7a4c5c9a432ff487c52e9c5618667179c18a204bdedb27310f022f", + "sha256:6cdaefac66e8b8f30e37a9b4768a391e1f8a16a7526d5bc77a7928408ef68e93", + "sha256:6da5185951d72e6f5352166e3da7b0dc27aa70bd1090b0eb3f7f7212b53f1bb8", + "sha256:6ddff43f702905a4e32bc24f3f2e2edfe0f8fde3277d481bffb709a4cced7a1f", + "sha256:6ec0e3f745021bfed19c456647f0298d60a24c9ff86d9d051f52b509663feeb1", + "sha256:6f91fd2b2ea15a6800c8e24418c0775a1694eefc011392da73bc6cef2623b322", + "sha256:700efd30c0fa1a3581d80a748157397559396090a51d306ea59a70020223d16f", + "sha256:71695772df6acea9f3c0e59e44ba8ac50c4f125217e84aab21074a1a55e7e5c9", + "sha256:72c87e5ee4e58a8354fb9c7c84cbf95a1c8236c127a5d1b7683f04bed8361e1f", + "sha256:7d2de809c2ee3b888b59f995625385f74629707c9355e0ff856445cdcae682b7", + "sha256:80dadc234ebc532e09be1975ff538d154a7fa61ea5031c03d25178855544728f", + "sha256:817ef43a0c0b4a77bd166dc9a09a555394105ff3374777ad41f453526e37f9cb", + "sha256:846ae9a12d54e368933b9759052d6206a9e8b250291109c48e350c1f1f49d916", + "sha256:875c6b5ab39ad5291588aed6925fac99d0097af0dd62f33c7b43736043d4a2ec", + "sha256:8799481bbdd212470d17513a54d568f44416db01250f49449647b5ab5b5dccb9", + "sha256:8ac6e5811ae2870953390452e3476694196f98d447573234592d30488147404d", + "sha256:8f8d0cbd0674ee89863a523e6994ac25fd5be9c8486acfc3e5ccea679bad2679", + "sha256:901e3b4219fa04ef766885fb40fa516a71662a4c61b80c94d25336b4934b71c0", + "sha256:90a345bbeaf9d0587a3aaffb7006aa39ccb6ff0e96a57286c0cb2fd1520ea192", + "sha256:9261bb77c2dab42f3ecd9103951aeca2c40277701eb7e912c545c1b16e0e4917", + "sha256:945da35a48d193d27c188037a05fec5492937f66fb1958c24fc761fb9d40d43c", + "sha256:957448ac63a42e2e49531b9d6c0fa449a1970dbc32467aaad46f11545be9af1d", + "sha256:967aab75434de148ec80597b75062d8123cadf2943fb4281f385141e18b21338", + "sha256:98a5e1660dc7de2200b00d53fa00bcd3c35a3608c305d45a7bbcaf29fa16e83d", + "sha256:995e783eb0374c120f528f807443ad5a83a656a8624c467ea73781fc5f8a8304", + "sha256:9b33d21594afab46f37ae58dfadd06636f154923c4e8a4d754b0127554eb2e77", + 
"sha256:a4bf42d2e4cf52c28cc1812d62426b9503cdb0c87a6de81442626aa7d69707ba", + "sha256:a59f5448ba2ceccd06995c95ea59a7674a10de0810f2ce90c9006f3cbc044456", + "sha256:a656ca105115f6b766bba324f23a67914d9c728dafec57638e2b92a9dcd76c62", + "sha256:a6b5b39cc7e2998f968f05309e666103b53e2edd01df8dc51b90d734c0825444", + "sha256:a7c5d5e5f1081955358533be077166ee97ed2571d6a66bdba6ec2f609a715d1a", + "sha256:a8bef9b9825fa8bc816a6e641bb67219489229ebc648be422af695f6e7a4fa7f", + "sha256:a8ffaeec5dfea5881d4c9d8913a32d10cfe3923495386106e4a24d45300ef79c", + "sha256:abd44571493973bad4598a3be7e1d807ed45aa2adaf7ab92ab7c62609569b17d", + "sha256:ac02dc29fd397608f8eb15ac1610ae2f2f0154b03f631e6d724d9e2ad4ee2c84", + "sha256:af85529ae8d2a453feee4c780d9406a5e3b17cee0dd75c18bd31adcd584debc3", + "sha256:b0c732aa23de8f8aec23f4b580d1e52905ef468afb4abeafd3fec77042abb6fe", + "sha256:b2142a376b40b6736dfc214fd2902409e9e3857eff554fed2d3c60f097e62a62", + "sha256:b22a07cbb82fea98f8a2fd814f3d1811ff9ed76d0fc6abc84eb21527596e7cc8", + "sha256:b2c3da8d93cf5db60e8858c17684c47d01fee6405e554fb55018dd85fc23b178", + "sha256:b2c7fdaa4d7c3d886a42534adec7cfac73860b89b4e5298752f60aa5984641a0", + "sha256:b30d46379644fbfc3ab81f8f82ae4de55179414651f110a1514f0b1f8f6cb2d7", + "sha256:b42f4d86b451c2f9d06ffb4f8bbc776e04df3ba070b9fe2657804b1b40277c48", + "sha256:b738f7e648735714bbb82bdfd030203360cfeab7f6e8a34772b3c8c8b820568c", + "sha256:b7fc49c37f1786284b12af63152fe1d0990722497e2d5817acfe7a877522f9a9", + "sha256:b8f18914faec94132e5b91e69d76a5c1d7b0c73e2489ea8929c4aaa10b76bbf7", + "sha256:bb2f6ca0ae2d983ded09357b84af659c954722bbf04dea98030064996d156048", + "sha256:bb4c1847b303835d89d785a18801a883436cdfd5dc3d62947f9c49e24f0f5a2c", + "sha256:bc456d04db0515ce3320d714a1eac7a97774ff0849e7718b492d957da4631dd4", + "sha256:bc532422ff26b304cfb62b328826bd995c96154ffd2bac4544f37dbb95ecaa8f", + "sha256:be3aaa60da67e6153eb15715cc2e19091af5dc75faef8b8a585aea372507384b", + "sha256:c33e66d44fe60e72397b487ee92e01da0d09ba2d66df8eae42d77b6d06e5eba0", + "sha256:c371aa98126a0d4c739ca93ceffa0fd7a5d732e3ac66a46e74339acd4d334564", + "sha256:c54d83a2188a10ebdba573f16bd97135d06c9ef60c3dc495315c7a28c80a263f", + "sha256:c7d13103045de1bdd6fe5d61802565f1a3537d70cd3abf596aa0af62761921ee", + "sha256:cb233f9c95f83707dae461b12b720c1af9c28c2d19208e1be03387222151daf5", + "sha256:cd79f3367bd74b317dda655dc8fcfa304d9eb6e4fb06b7168c5cf27f96e0cd62", + "sha256:cdcbed9ad19da81c480dfd6dd161886db6096083c9938ead313d94b30aadf272", + "sha256:d100fcc8930d697c6561156c6810ab4a508fb264c8b6779e6e61e2ed5e7558f9", + "sha256:d4aec24d6b72ee457ec665344a29acb2d35937d5192faebe429ea02633151aad", + "sha256:d6690ec5ec1cce0385cb20896b16be35247ac8c2046e493d03232f1c2414d321", + "sha256:d759cdd7f3e055d6bc8d9bec3ad905227b2e4c785dc16c372eb5b5e83123f48a", + "sha256:da08e7bb297b04e893d91087df19638dc7a6bb858a954b0cc2b9f5053c922312", + "sha256:dacf3c64ef3f7440e3167aa4b49aa9e0fb99e0aa4f9ff03795640bf94531bcb0", + "sha256:daf42de090d59db025af61ce6bdb2521f0f102ea0e6ea310f13c17610a97da4c", + "sha256:dc051506c30b609238d79eda75ee9cab3e520570ec8219844a72a46020901e37", + "sha256:de496365750cc472b4e7902a485d3f152ecf57bd3ba03ddd5578ed8ceb4c5964", + "sha256:dfb874cfa53340009af6bdd7e54ebc0d21012a60a4e65d927c2e477112e63484", + "sha256:e19e0643cc936a22e837f79d01a550678da8377d7d801a14487c10c34ee49c7e", + "sha256:e237b807d68a61fc3b1e845407e27e5eb8ef69bc93fe8505337c1acb4ee300b6", + "sha256:e5867f2651016a3afd8dd2c8238baa66f1e2802f44bc17e236f547ace6647078", + "sha256:e748d4cf8fef2526bb2a589a417eba0c8674e29ffcb570ce2ceca44f1e567bf6", + 
"sha256:e77dd455b9a16bbd2a5036a63ddbd479c19572af81b624e79ef422f929eef388", + "sha256:e8113639f3296706fbac34a30813929e29247718e88173ad849f57ca59754924", + "sha256:e8cd2415f372e7e5a789d743d133ae474290a90b9023197fd78f32e2dc6873e2", + "sha256:eb2a12d704f180a902d7fa778c6d71f36ceb7b0d317f34cdc76a5d05aa1dd1df", + "sha256:ef9266d2aa545d7374938fb5c484531ef5a2ec7f2d573e62f8ce722c735685fd", + "sha256:f2a50c3c1d11cad0ebebbac357a97b26aa79d2bcaf46f256551152aa85d3a4d1", + "sha256:f2e3b1a6bb38de0bc713edd4d612969dd250ca8b724be8d460001a387507021c", + "sha256:f952dacaa552f3bb8834908dddd500ba7d508e6ea6eb8c52eb2d28f48ca06a31", + "sha256:fa25afbadead523f7001caf0c2382afd272c315a033a7b06336da2637d92d6ed", + "sha256:fb8dae0b6b8b7f9e96c26fdd8121522ce5de9bb5538010870bd538683d30e9a2", + "sha256:fbc74f42c3525ac4ffa4b89cbdd00057b6196bcefe8bce794abd42d33a018092", + "sha256:fe659f6b5d10fb5a17f00a50eb903eb277a71ee35df4615db573c069bcf967ac" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==6.0.2" + }, "matplotlib": { "hashes": [ "sha256:00270d217d6b20d14b584c521f810d60c5c78406dc289859776550df837dcda7", @@ -856,13 +1003,6 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", "version": "==2.9.0.post0" }, - "pytz": { - "hashes": [ - "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", - "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00" - ], - "version": "==2025.2" - }, "pyyaml": { "hashes": [ "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", @@ -982,14 +1122,6 @@ "markers": "python_version >= '3.7'", "version": "==5.0.2" }, - "tzdata": { - "hashes": [ - "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", - "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7" - ], - "markers": "python_version >= '2'", - "version": "==2025.3" - }, "uritemplate": { "hashes": [ "sha256:480c2ed180878955863323eea31b0ede668795de182617fef9c6ca09e6ec9d0e", diff --git a/scripts/1-fetch/arxiv_fetch.py b/scripts/1-fetch/arxiv_fetch.py index 93249652..d462bf89 100755 --- a/scripts/1-fetch/arxiv_fetch.py +++ b/scripts/1-fetch/arxiv_fetch.py @@ -1,24 +1,25 @@ #!/usr/bin/env python """ -Fetch ArXiv papers with CC license information and generate count reports. +Fetch ArXiv articles with CC license information using OAI-PMH API. +OAI-PMH: Open Archives Initiative Protocol for Metadata Havesting. """ # Standard library import argparse import csv import os -import re import sys import textwrap import time import traceback -import urllib.parse from collections import Counter, defaultdict +from copy import copy +from datetime import datetime, timezone from operator import itemgetter # Third-party -import feedparser import requests import yaml +from lxml import etree from pygments import highlight from pygments.formatters import TerminalFormatter from pygments.lexers import PythonTracebackLexer @@ -33,62 +34,8 @@ LOGGER, PATHS = shared.setup(__file__) # Constants -# API Configuration -BASE_URL = "https://export.arxiv.org/api/query?" 
-DEFAULT_FETCH_LIMIT = 800 # Default total papers to fetch - -# CSV Headers -HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER", "AUTHOR_BUCKET", "COUNT"] -HEADER_CATEGORY_REPORT = [ - "TOOL_IDENTIFIER", - "CATEGORY_CODE", - "CATEGORY_LABEL", - "COUNT", -] -HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"] -HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"] - -# Search Queries -SEARCH_QUERIES = [ - 'all:"creative commons"', - 'all:"CC BY"', - 'all:"CC-BY"', - 'all:"CC BY-NC"', - 'all:"CC-BY-NC"', - 'all:"CC BY-SA"', - 'all:"CC-BY-SA"', - 'all:"CC BY-ND"', - 'all:"CC-BY-ND"', - 'all:"CC BY-NC-SA"', - 'all:"CC-BY-NC-SA"', - 'all:"CC BY-NC-ND"', - 'all:"CC-BY-NC-ND"', - 'all:"CC0"', - 'all:"CC 0"', - 'all:"CC-0"', -] - -# Compiled regex patterns for CC license detection -CC_PATTERNS = [ - (re.compile(r"\bCC[-\s]?0\b", re.IGNORECASE), "CC0"), - ( - re.compile(r"\bCC[-\s]?BY[-\s]?NC[-\s]?ND\b", re.IGNORECASE), - "CC BY-NC-ND", - ), - ( - re.compile(r"\bCC[-\s]?BY[-\s]?NC[-\s]?SA\b", re.IGNORECASE), - "CC BY-NC-SA", - ), - (re.compile(r"\bCC[-\s]?BY[-\s]?ND\b", re.IGNORECASE), "CC BY-ND"), - (re.compile(r"\bCC[-\s]?BY[-\s]?SA\b", re.IGNORECASE), "CC BY-SA"), - (re.compile(r"\bCC[-\s]?BY[-\s]?NC\b", re.IGNORECASE), "CC BY-NC"), - (re.compile(r"\bCC[-\s]?BY\b", re.IGNORECASE), "CC BY"), - ( - re.compile(r"\bCREATIVE\s+COMMONS\b", re.IGNORECASE), - "UNKNOWN CC legal tool", - ), -] - +BASE_URL = "https://oaipmh.arxiv.org/oai" +# Defaults should result in quick operation (not complete operation) # ArXiv Categories - manually curated from ArXiv official taxonomy # Source: https://arxiv.org/category_taxonomy CATEGORIES = { @@ -248,8 +195,9 @@ "nucl-th": "Nuclear Theory", "quant-ph": "Quantum Physics", } - -# File Paths +DEFAULT_FETCH_LIMIT = 1000 +DEFAULT_YEARS_BACK = 5 +# CSV file paths FILE_ARXIV_AUTHOR_BUCKET = shared.path_join( PATHS["data_1-fetch"], "arxiv_4_count_by_author_bucket.csv" ) @@ -260,12 +208,19 @@ FILE_ARXIV_YEAR = shared.path_join( PATHS["data_1-fetch"], "arxiv_3_count_by_year.csv" ) -# records metadata for each run for audit, reproducibility, and provenance FILE_PROVENANCE = shared.path_join( PATHS["data_1-fetch"], "arxiv_provenance.yaml" ) - -# Runtime variables +# CSV headers +HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER", "AUTHOR_BUCKET", "COUNT"] +HEADER_CATEGORY_REPORT = [ + "TOOL_IDENTIFIER", + "CATEGORY_CODE", + "CATEGORY_LABEL", + "COUNT", +] +HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"] +HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"] QUARTER = os.path.basename(PATHS["data_quarter"]) @@ -273,25 +228,12 @@ def parse_arguments(): """Parse command-line options, returns parsed argument namespace. - Note: The --limit parameter sets the total number of papers to fetch - across all search queries, not per query. ArXiv API recommends - maximum of 30000 results per session for optimal performance. + Note: The --limit parameter sets the total number of articles to fetch. + The --years-back parameter limits harvesting to recent years where + CC licensing is more common. """ LOGGER.info("Parsing command-line options") parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--limit", - type=int, - default=DEFAULT_FETCH_LIMIT, - help=( - f"Total limit of papers to fetch across all search queries " - f"(default: {DEFAULT_FETCH_LIMIT}). Maximum recommended: 30000. " - f"Note: Individual queries limited to 500 results " - f"(implementation choice). 
" - f"See ArXiv API documentation: " - f"https://info.arxiv.org/help/api/user-manual.html" - ), - ) parser.add_argument( "--enable-save", action="store_true", @@ -302,9 +244,46 @@ def parse_arguments(): action="store_true", help="Enable git actions (fetch, merge, add, commit, and push)", ) + parser.add_argument( + "--limit", + type=int, + default=DEFAULT_FETCH_LIMIT, + help=( + "Limit number of fetched articles (default:" + f" {DEFAULT_FETCH_LIMIT}). Use a value of -1 to remove limit." + ), + ) + parser.add_argument( + "--years-back", + type=int, + default=DEFAULT_YEARS_BACK, + help=( + "Number of years back from current year to fetch (default:" + f" {DEFAULT_YEARS_BACK}). Use a value of -1 to specify 2008-02-05" + " (first date a CC licensed article was added)." + ), + ) + args = parser.parse_args() if not args.enable_save and args.enable_git: parser.error("--enable-git requires --enable-save") + # Restrict args.years_back to earliest datetime and initialize + # args.from_date + # + # Survey of records indicated the first CC licenced article was added on + # 2008-02-05 + earliest_date = datetime(2008, 2, 5, tzinfo=timezone.utc) + this_year = datetime.now(timezone.utc).year + if args.years_back == -1: + arg_date = earliest_date + else: + start_year = this_year - args.years_back + arg_date = datetime(start_year, 1, 1, tzinfo=timezone.utc) + if arg_date < earliest_date: + arg_date = earliest_date + args.from_date = arg_date.strftime("%Y-%m-%d") + args.years_back = this_year - arg_date.year + return args @@ -333,196 +312,134 @@ def initialize_all_data_files(args): initialize_data_file(FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET) -def normalize_license_text(raw_text): - """ - Convert raw license text to standardized CC license identifiers. - - Uses regex patterns to identify CC licenses from paper text. - Returns specific license (e.g., "CC BY", "CC0") or "Unknown". - """ - if not raw_text: - return "Unknown" +def get_license_mapping(): + global LICENSE_MAPPING + LOGGER.info("Loading CC Legal Tool metadata for license mapping") + file_path = shared.path_join(PATHS["data"], "cc-legal-tools.csv") + license_mapping = {} + with open(file_path, "r", encoding="utf-8") as file_obj: + rows = csv.DictReader(file_obj, dialect="unix") + for row in rows: + simple_url = row["CANONICAL_URL"].replace("https://", "") + simple_url = simple_url.rstrip("/") + identifier = row["IDENTIFIER"] + license_mapping[simple_url] = identifier + + # Add legacy entry + simple_url = "creativecommons.org/licenses/publicdomain" + license_mapping[simple_url] = "CERTIFICATION 1.0 US" + + LICENSE_MAPPING = dict( + sorted(license_mapping.items(), key=lambda item: item[1]) + ) - for pattern, license_type in CC_PATTERNS: - if pattern.search(raw_text): - return license_type - return "Unknown" +def extract_record_license(record): + """ + Extract CC license information from OAI-PMH XML record. + Returns normalized license identifier or specific error indicator. 
+ """ + # Find license element in arXiv namespace + license_element = record.find(".//{http://arxiv.org/OAI/arXiv/}license") + + if license_element is not None and license_element.text: + license_url = license_element.text.strip() + simple_url = copy(license_url).replace("http://", "") + simple_url = simple_url.replace("https://", "") + simple_url = simple_url.rstrip("/") + # Check exact mapping first + if simple_url in LICENSE_MAPPING: + return LICENSE_MAPPING[simple_url] + # Validate CC URLs more strictly + elif "creativecommons.org" in license_url.lower(): + return f"CC (ambiguous): {license_url}" + else: + return "Non-CC" + else: + return "No license field" -def extract_license_info(entry): +def extract_record_metadata(record): """ - Extract CC license information from ArXiv paper entry. + Extract paper metadata from OAI-PMH XML record. - Checks rights field first, then summary field for license patterns. - Returns normalized license identifier or "Unknown". + Returns dict with author_count, category, year, and license info. """ - # checking through the rights field first then summary - if hasattr(entry, "rights") and entry.rights: - license_info = normalize_license_text(entry.rights) - if license_info != "Unknown": - return license_info - if hasattr(entry, "summary") and entry.summary: - license_info = normalize_license_text(entry.summary) - if license_info != "Unknown": - return license_info - return "Unknown" - - -def extract_category_from_entry(entry): - """Extract primary category from ArXiv entry.""" - if ( - hasattr(entry, "arxiv_primary_category") - and entry.arxiv_primary_category - ): - return entry.arxiv_primary_category.get("term", "Unknown") - if hasattr(entry, "tags") and entry.tags: - # Get first category from tags - for tag in entry.tags: - if hasattr(tag, "term"): - return tag.term - return "Unknown" - - -def extract_year_from_entry(entry): - """Extract publication year from ArXiv entry.""" - if hasattr(entry, "published") and entry.published: + + # Extract license first to avoid unnecessary work + license_info = extract_record_license(record) + if not license_info.startswith("CC"): + return {} + + # # Extract added on + # added_on_elem = record.find( + # ".//{http://www.openarchives.org/OAI/2.0/}datestamp" + # ) + # if added_on_elem is not None and added_on_elem.text: + # added_on = added_on_elem.text.strip() + + # Extract author count + authors = record.findall(".//{http://arxiv.org/OAI/arXiv/}author") + author_count = len(authors) if authors else 0 + + # Extract category (primary category from categories field) + categories_elem = record.find(".//{http://arxiv.org/OAI/arXiv/}categories") + if categories_elem is not None and categories_elem.text: + # Take first category as primary + category = categories_elem.text.strip().split()[0] + else: + category = "Unknown" + + # Extract year from 1) updated, 2) created + updated_elem = record.find(".//{http://arxiv.org/OAI/arXiv/}updated") + if updated_elem is not None and updated_elem.text: try: - return entry.published[:4] # Extract year from date string + year = updated_elem.text.strip()[:4] # Extract year except (AttributeError, IndexError) as e: - LOGGER.debug( - f"Failed to extract year from '{entry.published}': {e}" + LOGGER.error( + f"Failed to extract year from '{updated_elem.text}': {e}" ) - return "Unknown" - - -def extract_author_count_from_entry(entry): - """Extract number of authors from ArXiv entry.""" - if hasattr(entry, "authors") and entry.authors: - try: - return len(entry.authors) - except Exception as e: - 
LOGGER.debug(f"Failed to count authors from entry.authors: {e}") - if hasattr(entry, "author") and entry.author: - return 1 - return "Unknown" - + year = "Unknown" + else: + created_elem = record.find(".//{http://arxiv.org/OAI/arXiv/}created") + if created_elem is not None and created_elem.text: + try: + year = created_elem.text.strip()[:4] # Extract year + except (AttributeError, IndexError) as e: + LOGGER.error( + f"Failed to extract year from '{created_elem.text}': {e}" + ) + year = "Unknown" + else: + year = "Unknown" + + metadata = { + # "added_on": added_on, + "author_count": author_count, + "category": category, + "license": license_info, + "year": year, + } + return metadata -def bucket_author_count(n): - """ - Convert author count to predefined buckets for analysis. - Buckets: "1", "2", "3", "4", "5+", "Unknown" - Reduces granularity for better statistical analysis. - """ - if n == 1: - return "1" - if n == 2: - return "2" - if n == 3: - return "3" - if n == 4: - return "4" - if n >= 5: - return "5+" - return "Unknown" - - -def save_count_data( - license_counts, category_counts, year_counts, author_counts -): +def bucket_author_count(author_count): """ - Save all collected data to CSV files. - + Convert author count to predefined buckets: "1", "2", "3", "4", "5+". """ - # license_counts: {license: count} - # category_counts: {license: {category_code: count}} - # year_counts: {license: {year: count}} - # author_counts: {license: {author_count(int|None): count}} - - # Save license counts - data = [] - for lic, c in license_counts.items(): - data.append({"TOOL_IDENTIFIER": lic, "COUNT": c}) - data.sort(key=itemgetter("TOOL_IDENTIFIER")) - with open(FILE_ARXIV_COUNT, "w", encoding="utf-8", newline="\n") as fh: - writer = csv.DictWriter(fh, fieldnames=HEADER_COUNT, dialect="unix") - writer.writeheader() - for row in data: - writer.writerow(row) - - # Save category report with labels - data = [] - for lic, cats in category_counts.items(): - for code, c in cats.items(): - label = CATEGORIES.get(code, code) - data.append( - { - "TOOL_IDENTIFIER": lic, - "CATEGORY_CODE": code, - "CATEGORY_LABEL": label, - "COUNT": c, - } - ) - data.sort(key=itemgetter("TOOL_IDENTIFIER", "CATEGORY_CODE")) - with open( - FILE_ARXIV_CATEGORY_REPORT, "w", encoding="utf-8", newline="\n" - ) as fh: - writer = csv.DictWriter( - fh, fieldnames=HEADER_CATEGORY_REPORT, dialect="unix" - ) - writer.writeheader() - for row in data: - writer.writerow(row) - - # Save year counts - data = [] - for lic, years in year_counts.items(): - for year, c in years.items(): - data.append({"TOOL_IDENTIFIER": lic, "YEAR": year, "COUNT": c}) - data.sort(key=itemgetter("TOOL_IDENTIFIER", "YEAR")) - with open(FILE_ARXIV_YEAR, "w", encoding="utf-8", newline="\n") as fh: - writer = csv.DictWriter(fh, fieldnames=HEADER_YEAR, dialect="unix") - writer.writeheader() - for row in data: - writer.writerow(row) - - # Save author buckets summary - data = [] - for lic, acs in author_counts.items(): - # build buckets across licenses - bucket_counts = Counter() - for ac, c in acs.items(): - b = bucket_author_count(ac) - bucket_counts[b] += c - for b, c in bucket_counts.items(): - data.append( - {"TOOL_IDENTIFIER": lic, "AUTHOR_BUCKET": b, "COUNT": c} - ) - data.sort(key=itemgetter("TOOL_IDENTIFIER", "AUTHOR_BUCKET")) - with open( - FILE_ARXIV_AUTHOR_BUCKET, "w", encoding="utf-8", newline="\n" - ) as fh: - writer = csv.DictWriter( - fh, fieldnames=HEADER_AUTHOR_BUCKET, dialect="unix" - ) - writer.writeheader() - for row in data: - writer.writerow(row) 
+ if author_count <= 4: + return str(author_count) + return "5+" -def query_arxiv(args): +def query_arxiv(args, session): """ - Main function to query ArXiv API and collect CC license data. - + Query ArXiv OAI-PMH API and return information about CC licensed articles. """ - - LOGGER.info("Beginning to fetch results from ArXiv API") - session = shared.get_session() - - results_per_iteration = 50 - - search_queries = SEARCH_QUERIES + LOGGER.info( + f"Querying articles from {args.from_date} onwards ({args.years_back}" + " years back)" + ) # Data structures for counting license_counts = defaultdict(int) @@ -530,116 +447,249 @@ def query_arxiv(args): year_counts = defaultdict(lambda: defaultdict(int)) author_counts = defaultdict(lambda: defaultdict(int)) + batch = 1 total_fetched = 0 + cc_articles_found = 0 + # min_added_on = False + resumption_token = None + + # Proceed is set to False when limit reached or end of records (missing + # resumption token) + proceed = True + while proceed: + if resumption_token: + # Continue with resumption token + query_params = { + "verb": "ListRecords", + "resumptionToken": resumption_token, + } + verb = "resuming" + else: + # Initial request with date range + query_params = { + "verb": "ListRecords", + "metadataPrefix": "arXiv", + "from": args.from_date, + } + verb = "starting" + + # Make API request + LOGGER.info( + f"Fetching batch {batch} {verb} from record {total_fetched}" + ) + batch += 1 - for search_query in search_queries: - if total_fetched >= args.limit: - break - - LOGGER.info(f"Searching for: {search_query}") - papers_found_for_query = 0 - - for start in range( - 0, - min(args.limit - total_fetched, 500), - results_per_iteration, - ): - encoded_query = urllib.parse.quote_plus(search_query) - query = ( - f"search_query={encoded_query}&start={start}" - f"&max_results={results_per_iteration}" + try: + # Build OAI-PMH request URL + response = session.get(BASE_URL, params=query_params, timeout=60) + response.raise_for_status() + except requests.HTTPError as e: + raise shared.QuantifyingException(f"HTTP Error: {e}", 1) + except requests.RequestException as e: + raise shared.QuantifyingException(f"Request Exception: {e}", 1) + + root = etree.fromstring(response.content) + + # Check for errors + error_element = root.find( + ".//{http://www.openarchives.org/OAI/2.0/}error" + ) + if error_element is not None: + raise shared.QuantifyingException( + f"OAI-PMH Error: {error_element.text}", 1 ) - papers_found_in_batch = 0 + # Process batch of article records + records = root.findall( + ".//{http://www.openarchives.org/OAI/2.0/}record" + ) + batch_cc_count = 0 + for record in records: + if args.limit > 0 and args.limit <= total_fetched: + proceed = False + break + total_fetched += 1 - try: - LOGGER.info( - f"Fetching results {start} - " - f"{start + results_per_iteration}" - ) - response = session.get(BASE_URL + query, timeout=30) - response.raise_for_status() - feed = feedparser.parse(response.content) + metadata = extract_record_metadata(record) + if not metadata: # Only true for CC licensed articles + continue - for entry in feed.entries: - if total_fetched >= args.limit: - break + # added_on = metadata["added_on"] + # if not min_added_on or added_on < min_added_on: + # min_added_on = added_on - license_info = extract_license_info(entry) + license_info = metadata["license"] - if license_info != "Unknown": + # Count by author count and license + author_count = metadata["author_count"] + author_counts[license_info][author_count] += 1 - category = 
extract_category_from_entry(entry) - year = extract_year_from_entry(entry) - author_count = extract_author_count_from_entry(entry) + # Count by category and license + category = metadata["category"] + category_counts[license_info][category] += 1 - # Count by license - license_counts[license_info] += 1 + # Count by license + license_counts[license_info] += 1 - # Count by category and license - category_counts[license_info][category] += 1 + # Count by year and license + year = metadata["year"] + year_counts[license_info][year] += 1 - # Count by year and license - year_counts[license_info][year] += 1 + batch_cc_count += 1 + cc_articles_found += 1 - # Count by author count and license - author_counts[license_info][author_count] += 1 + # if min_added_on: + # LOGGER.info(f" Earliest CC article addition: {min_added_on}") - total_fetched += 1 - papers_found_in_batch += 1 - papers_found_for_query += 1 + LOGGER.info( + f" Batch CC licensed articles: {batch_cc_count}, Total" + f" CC-licensed articles: {cc_articles_found}" + ) - # arXiv recommends a 3-seconds delay between consecutive - # api calls for efficiency - time.sleep(3) - except requests.HTTPError as e: - raise shared.QuantifyingException(f"HTTP Error: {e}", 1) - except requests.RequestException as e: - raise shared.QuantifyingException(f"Request Exception: {e}", 1) - except KeyError as e: - raise shared.QuantifyingException(f"KeyError: {e}", 1) + # Check for resumption token + resumption_element = root.find( + ".//{http://www.openarchives.org/OAI/2.0/}resumptionToken" + ) + if not proceed: + break + elif resumption_element is not None and resumption_element.text: + resumption_token = resumption_element.text + else: + LOGGER.info("No more records available") + proceed = False + break - if papers_found_in_batch == 0: - break + # OAI-PMH requires a 3 second delay between requests + # https://info.arxiv.org/help/api/tou.html#rate-limits + time.sleep(3) + + data = { + "author_counts": author_counts, + "category_counts": category_counts, + "license_counts": license_counts, + "year_counts": year_counts, + } + return data, cc_articles_found - LOGGER.info( - f"Query '{search_query}' completed: " - f"{papers_found_for_query} papers found" - ) - # Save results - if args.enable_save: - save_count_data( - license_counts, category_counts, year_counts, author_counts +def rows_to_csv(args, fieldnames, rows, file_path): + if not args.enable_save: + return args + + with open(file_path, "w", encoding="utf-8", newline="\n") as file_handle: + writer = csv.DictWriter( + file_handle, fieldnames=fieldnames, dialect="unix" ) + writer.writeheader() + for row in rows: + writer.writerow(row) + + +def write_data(args, data): + """ + Write fetched data to CSV files. 
+    """
+    # Save author buckets report
+    # data["author_counts"]: {license: {author_count: count}}
+    rows = []
+    for license_name, author_count_data in data["author_counts"].items():
+        # Build author-count buckets for this license
+        bucket_counts = Counter()
+        for author_count, count in author_count_data.items():
+            bucket = bucket_author_count(author_count)
+            bucket_counts[bucket] += count
+        for bucket, count in bucket_counts.items():
+            rows.append(
+                {
+                    "TOOL_IDENTIFIER": license_name,
+                    "AUTHOR_BUCKET": bucket,
+                    "COUNT": count,
+                }
+            )
+    rows.sort(key=itemgetter("TOOL_IDENTIFIER", "AUTHOR_BUCKET"))
+    rows_to_csv(args, HEADER_AUTHOR_BUCKET, rows, FILE_ARXIV_AUTHOR_BUCKET)
-    # save provenance
+    # Save category report with labels
+    # data["category_counts"]: {license: {category_code: count}}
+    rows = []
+    for license_name, categories in data["category_counts"].items():
+        for code, count in categories.items():
+            label = CATEGORIES.get(code, code)
+            rows.append(
+                {
+                    "TOOL_IDENTIFIER": license_name,
+                    "CATEGORY_CODE": code,
+                    "CATEGORY_LABEL": label,
+                    "COUNT": count,
+                }
+            )
+    rows.sort(key=itemgetter("TOOL_IDENTIFIER", "CATEGORY_CODE"))
+    rows_to_csv(args, HEADER_CATEGORY_REPORT, rows, FILE_ARXIV_CATEGORY_REPORT)
+
+    # Save license counts report
+    # data["license_counts"]: {license: count}
+    rows = []
+    for license_name, count in data["license_counts"].items():
+        rows.append({"TOOL_IDENTIFIER": license_name, "COUNT": count})
+    rows.sort(key=itemgetter("TOOL_IDENTIFIER"))
+    rows_to_csv(args, HEADER_COUNT, rows, FILE_ARXIV_COUNT)
+
+    # Save year count report
+    # data["year_counts"]: {license: {year: count}}
+    rows = []
+    for license_name, years in data["year_counts"].items():
+        for year, count in years.items():
+            rows.append(
+                {"TOOL_IDENTIFIER": license_name, "YEAR": year, "COUNT": count}
+            )
+    rows.sort(key=itemgetter("TOOL_IDENTIFIER", "YEAR"))
+    rows_to_csv(args, HEADER_YEAR, rows, FILE_ARXIV_YEAR)
+
+
+def write_provenance(args, cc_articles_found):
+    """
+    Write provenance information to YAML file.
+    """
+    if not args.enable_save:
+        return args
+
+    # Save provenance
+    desc = "Open Archives Initiative Protocol for Metadata Harvesting (OAI-PMH)"
     provenance_data = {
-        "total_fetched": total_fetched,
-        "queries": search_queries,
-        "limit": args.limit,
+        "api_description": desc,
+        "api_endpoint": BASE_URL,
+        "arguments": {
+            "from_date": args.from_date,
+            "limit": args.limit,
+            "years_back": args.years_back,
+        },
+        "cc_articles_found": cc_articles_found,
         "quarter": QUARTER,
         "script": os.path.basename(__file__),
     }
-    # write provenance YAML for auditing
-    try:
-        with open(FILE_PROVENANCE, "w", encoding="utf-8", newline="\n") as fh:
-            yaml.dump(provenance_data, fh, default_flow_style=False, indent=2)
-    except Exception as e:
-        LOGGER.warning("Failed to write provenance file: %s", e)
-
-    LOGGER.info(f"Total CC licensed papers fetched: {total_fetched}")
+    # Write provenance YAML for auditing
+    with open(
+        FILE_PROVENANCE, "w", encoding="utf-8", newline="\n"
+    ) as file_handle:
+        yaml.dump(
+            provenance_data,
+            file_handle,
+            default_flow_style=False,
+            indent=2,
+        )
 
 
 def main():
-    """Main function."""
-    LOGGER.info("Script execution started.")
     args = parse_arguments()
     shared.paths_log(LOGGER, PATHS)
     shared.git_fetch_and_merge(args, PATHS["repo"])
     initialize_all_data_files(args)
-    query_arxiv(args)
+    get_license_mapping()
+    session = shared.get_session()
+    data, cc_articles_found = query_arxiv(args, session)
+    write_data(args, data)
+    write_provenance(args, cc_articles_found)
     args = shared.git_add_and_commit(
         args,
         PATHS["repo"],
diff --git a/sources.md b/sources.md
index 04d6ada1..43335103 100644
--- a/sources.md
+++ b/sources.md
@@ -6,21 +6,28 @@
 public domain. Below are the sources and their respective information:
 
 ## arXiv
 
-**Description:** arXiv is a free distribution service and an open-access archive for scholarly articles in physics, mathematics, computer science, quantitative biology, quantitative finance, statistics, electrical engineering and systems science, and economics. All arXiv articles are available under various open licenses or are in the public domain.
+**Description:** arXiv is a free distribution service and an open-access
+archive for scholarly articles in physics, mathematics, computer science,
+quantitative biology, quantitative finance, statistics, electrical engineering
+and systems science, and economics. All arXiv articles are available under
+various open licenses or are in the public domain.
 
 **API documentation link:**
-- [arXiv API User Manual](https://arxiv.org/help/api/user-manual)
-- [arXiv API Reference](https://arxiv.org/help/api)
-- [Base URL](http://export.arxiv.org/api/query)
+- [arXiv OAI-PMH Interface](https://info.arxiv.org/help/oa/index.html)
+- [Base URL (OAI-PMH)](https://oaipmh.arxiv.org/oai)
 - [arXiv Subject Classifications](https://arxiv.org/category_taxonomy)
 - [Terms of Use for arXiv APIs](https://info.arxiv.org/help/api/tou.html)
 
 **API information:**
 - No API key required
 - Query limit: No official limit, but requests should be made responsibly
-- Data available through Atom XML format
-- Supports search by fields: title (ti), author (au), abstract (abs), comment (co), journal reference (jr), subject category (cat), report number (rn), id, all (searches all fields), and submittedDate (date filter)
-- Metadata includes licensing information for each paper
+- **Data format**: OAI-PMH XML format with structured metadata fields
+- **OAI-PMH Interface** (used by `arxiv_fetch.py`):
+  - Structured metadata harvesting with resumption tokens
+  - License information extracted from `{http://arxiv.org/OAI/arXiv/}license` XML field
+  - Recommended 3-second delays between requests
+  - Supports date-based filtering for bulk harvesting
+- Metadata includes licensing information for each article
 
 ## CC Legal Tools
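
For illustration, the arXiv OAI-PMH flow described above (an initial `ListRecords` request filtered by `from` date, license extraction from the `{http://arxiv.org/OAI/arXiv/}license` element, and continuation via `resumptionToken` with a 3-second pause) can be sketched in a few lines of Python. This is a minimal sketch rather than the real script: the `harvest_licenses` helper, its `from_date` default, and the `max_batches` cap are hypothetical, and the error handling, license normalization, and CSV output of `scripts/1-fetch/arxiv_fetch.py` are omitted.

```python
"""Minimal OAI-PMH harvesting sketch (illustration only)."""
import time

import requests
from lxml import etree

BASE_URL = "https://oaipmh.arxiv.org/oai"
OAI = "{http://www.openarchives.org/OAI/2.0/}"
ARXIV = "{http://arxiv.org/OAI/arXiv/}"


def harvest_licenses(from_date="2024-01-01", max_batches=2):
    """Yield raw license URLs from arXiv OAI-PMH ListRecords batches."""
    params = {
        "verb": "ListRecords",
        "metadataPrefix": "arXiv",
        "from": from_date,
    }
    for _ in range(max_batches):
        response = requests.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
        root = etree.fromstring(response.content)
        # Each record carries arXiv metadata, including an optional license URL
        for record in root.findall(f".//{OAI}record"):
            license_element = record.find(f".//{ARXIV}license")
            if license_element is not None and license_element.text:
                yield license_element.text.strip()
        token = root.find(f".//{OAI}resumptionToken")
        if token is None or not token.text:
            break  # End of the result set
        # Subsequent requests carry only the resumption token
        params = {"verb": "ListRecords", "resumptionToken": token.text}
        time.sleep(3)  # arXiv asks for a 3-second pause between requests


for license_url in harvest_licenses():
    print(license_url)
```

The resumption token is the only state carried between requests, which is what lets a long harvest proceed batch by batch within the recommended rate limits.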