diff --git a/.gitignore b/.gitignore index 79e5387c..582de118 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ test *.csv # LLM Tools .claude +# Link checker reports +link-reports/ diff --git a/docs/bee/faq.md b/docs/bee/faq.md index fa999db4..a289bdfc 100644 --- a/docs/bee/faq.md +++ b/docs/bee/faq.md @@ -165,7 +165,7 @@ Therefore, the rule is, each node must have: ### How can I add Gnosis / Sepolia to Metamask? -You can easily add Sepolia or Gnosis to metamask using the [official guide from Metamask](https://support.metamask.io/networks-and-sidechains/managing-networks/how-to-add-a-custom-network-rpc/). +You can easily add Sepolia or Gnosis to metamask using the [official guide from Metamask](https://support.metamask.io/configure/networks/how-to-add-a-custom-network-rpc/). If you are using a different wallet which does not have an easy option for adding networks like Metamask does, then you may need to add the networks manually. You need to fill in four pieces of information to do so: diff --git a/docs/bee/installation/connectivity.md b/docs/bee/installation/connectivity.md index eba8bd1b..5b5afc6d 100644 --- a/docs/bee/installation/connectivity.md +++ b/docs/bee/installation/connectivity.md @@ -17,7 +17,7 @@ swarm, below you will find a detailed guide to navigating your way through your network and making it out into the wild so you can buzz around fellow bees and maximize your chances of earning xBZZ. If you still have problems, please join us in our [Discord -server](https://discord.gg/wdghaQsGq5) and we'll help you find the +server](https://discord.gg/kHRyMNpw7t) and we'll help you find the way! 
🐝 🐝 🐝 🐝 🐝 :::warning diff --git a/docs/bee/installation/hive.md b/docs/bee/installation/hive.md index 969cdaf4..cff46d8d 100644 --- a/docs/bee/installation/hive.md +++ b/docs/bee/installation/hive.md @@ -37,4 +37,4 @@ Configure your nodes as desired, but ensure that the values `api-addr`, `data-di ### Monitoring -See the [logging section](./../working-with-bee/logs-and-files.md) for more information on how to access your node's metrics. Share your community creations (such as [swarmMonitor](https://github.com/doristeo/SwarmMonitoring) - thanks doristeo!) in the [#node-operators](https://discord.gg/X3ph5yGRFU) channel of our Discord server so we can add you to our list of all things that are [awesome](https://github.com/ethersphere/awesome-swarm) and Swarm. 🧑 +See the [logging section](./../working-with-bee/logs-and-files.md) for more information on how to access your node's metrics. Share your community creations (such as [swarmMonitor](https://github.com/doristeo/SwarmMonitoring) - thanks doristeo!) in the [#node-operators](https://discord.gg/kHRyMNpw7t) channel of our Discord server so we can add you to our list of all things that are [awesome](https://github.com/ethersphere/awesome-swarm) and Swarm. 🧑 diff --git a/docs/bee/installation/package-manager.md b/docs/bee/installation/package-manager.md index 55be3581..9b9ac07c 100644 --- a/docs/bee/installation/package-manager.md +++ b/docs/bee/installation/package-manager.md @@ -106,7 +106,7 @@ Config: /etc/bee/bee.yaml Bee requires a Gnosis Chain RPC endpoint to function. By default this is expected to be found at ws://localhost:8546. -Please see https://docs.ethswarm.org/docs/installation/install for more details on how to configure your node. +Please see https://docs.ethswarm.org/docs/bee/installation/getting-started for more details on how to configure your node. After you finish configuration run 'sudo bee-get-addr' and fund your node with XDAI, and also XBZZ if so desired. 
@@ -122,7 +122,7 @@ When Bee is installed using a package manager, a `bee.yaml` file containing the While this package manager install guide uses the `bee.yaml` file for setting configuration options, there are [several other available methods for setting node options](./../working-with-bee/configuration.md). ::: -After installation, you can check that the file was successfully generated and contains the [default configuration](https://github.com/ethersphere/bee/blob/master/packaging) for your system: +After installation, you can check that the file was successfully generated and contains the [default configuration](https://github.com/ethersphere/bee/tree/master/packaging) for your system: + for m in re.finditer(r'<a[^>]*href=["\']([^"\']+)["\']', content, re.IGNORECASE): + links.append(('', m.group(1))) + for m in re.finditer(r'<img[^>]*src=["\']([^"\']+)["\']', content, re.IGNORECASE): + links.append(('', m.group(1))) + + # Bare URLs — plain http(s) URLs not inside a markdown link or HTML attribute. + # Collect all URL positions already captured above to avoid double-reporting. 
+ seen_spans = set() + for m in re.finditer(r'\[([^\]]*)\]\(([^)]+)\)', content): + seen_spans.add(m.start(2)) + for m in re.finditer(r'^\[([^\]]+)\]:\s*(\S+)', content, re.MULTILINE): + seen_spans.add(m.start(2)) + for m in re.finditer(r'(?:href|src)=["\']([^"\']+)["\']', content, re.IGNORECASE): + seen_spans.add(m.start(1)) + + for m in re.finditer(r'https?://[^\s\]>"\'\\<*`]+', content): + if m.start() not in seen_spans: + url = m.group(0).rstrip('.,;:!') + # Strip trailing unbalanced close-parens + while url.endswith(')') and url.count('(') < url.count(')'): + url = url[:-1] + links.append(('', url)) + + return links + + +# ───────────────────────────────────────────── +# Helpers β€” build-output link resolution +# ───────────────────────────────────────────── + +def _frontmatter_id(md_file): + """Return the 'id' value from YAML frontmatter, or None.""" + try: + text = md_file.read_text(encoding='utf-8', errors='replace') + if not text.startswith('---'): + return None + end = text.find('\n---', 3) + if end == -1: + return None + for line in text[3:end].splitlines(): + if line.startswith('id:'): + return line[3:].strip().strip('"\'') + except Exception: + pass + return None + + +def _build_docid_map(): + """ + Scan all HTML files in the build and return a dict {doc_id: html_path}. + + Docusaurus embeds the doc ID in the class as 'docs-doc-id-{id}', + e.g. class="... docs-doc-id-concepts/DISC/disc ...". + This is the ground truth for what page is at what path β€” no inference needed. 
+ """ + mapping = {} + if not BUILD_DIR.exists(): + return mapping + for html_file in BUILD_DIR.rglob('index.html'): + try: + # Only read the opening tag (first ~500 bytes) for performance + with html_file.open(encoding='utf-8', errors='replace') as fh: + head = fh.read(800) + m = re.search(r'docs-doc-id-([^\s"\']+)', head) + if m: + mapping[m.group(1)] = html_file + except Exception: + pass + return mapping + + +# Populated once at first call to md_path_to_build_html() +_DOCID_MAP = None + + +def md_path_to_build_html(md_file): + """Map a source .md/.mdx file to the HTML file Docusaurus built from it. + + Uses the build's own HTML files (via the embedded docs-doc-id class) as the + authoritative source β€” no path inference or slug computation. + + Falls back to a computed path when the build map lookup misses. + """ + global _DOCID_MAP + if _DOCID_MAP is None: + _DOCID_MAP = _build_docid_map() + + try: + rel = md_file.relative_to(DOCS_DIR) + except ValueError: + return None + + # Compute the full doc ID: parent/local_id + local_id = _frontmatter_id(md_file) or rel.with_suffix('').name + parent = str(rel.parent).replace('\\', '/') + doc_id = local_id if parent == '.' else f"{parent}/{local_id}" + + # Look up in the reverse map first (authoritative) + if doc_id in _DOCID_MAP: + return _DOCID_MAP[doc_id] + + # Fallback: compute expected path + parent_path = rel.parent + if local_id == 'index': + return BUILD_DIR / 'docs' / parent_path / 'index.html' + return BUILD_DIR / 'docs' / parent_path / local_id / 'index.html' + + +def resolve_internal_to_build_html(source_md, link_path): + """Resolve an internal (non-http) link path to the build HTML file it corresponds to. + + Checks the build/ directory only β€” no slug inference, no source-file guessing. + Returns (html_path_or_None, error_reason_or_None). + Caller is responsible for splitting off any '#anchor' before calling. 
+ """ + decoded = unquote(link_path) + + # ── Absolute path (/docs/… or /static/…) ── + if decoded.startswith('/'): + rel = decoded.lstrip('/') + candidates = [ + BUILD_DIR / rel, + BUILD_DIR / rel / 'index.html', + BUILD_DIR / (rel + '.html'), + ] + for c in candidates: + if c.exists() and c.is_file(): + return c, None + return None, f"Not found in build: /{rel}" + + # ── Relative path ── + target = (source_md.parent / decoded).resolve() + + # Non-markdown file (image, PDF, asset): check static/ and on-disk path + if target.suffix not in ('', '.md', '.mdx'): + if target.exists(): + return target, None + try: + static_candidate = STATIC_DIR / target.relative_to(PROJECT_DIR) + if static_candidate.exists(): + return static_candidate, None + except ValueError: + pass + return None, f"File not found: {target.name}" + + # Markdown / no extension: find source file β†’ map to build HTML + md_candidates = ( + [target] if target.suffix in ('.md', '.mdx') + else [target.with_suffix('.md'), target.with_suffix('.mdx'), + target / 'index.md', target / 'index.mdx'] + ) + for md_cand in md_candidates: + if md_cand.exists() and md_cand.is_file(): + build_html = md_path_to_build_html(md_cand) + if build_html is None: + return None, "Could not map source file to build path" + if build_html.exists(): + return build_html, None + return None, "Source file exists but its build HTML was not found β€” is the build current?" 
+ + return None, "Source file not found" + + +def resolve_site_url_locally(url): + """Check a full docs.ethswarm.org URL against the local build output.""" + parsed = urlparse(url) + rel = parsed.path.rstrip('/').lstrip('/') + candidates = [ + BUILD_DIR / rel, + BUILD_DIR / rel / 'index.html', + BUILD_DIR / (rel + '.html'), + ] + for c in candidates: + if c.exists() and c.is_file(): + return True, str(c) + return False, str(BUILD_DIR / rel) + + +# ───────────────────────────────────────────── +# External URL checker +# ───────────────────────────────────────────── + +EXT_STATUS_OK = 'ok' +EXT_STATUS_404 = '404' +EXT_STATUS_DOWN = 'down' +EXT_STATUS_REDIRECT = 'redirect' +EXT_STATUS_ERROR = 'error' +EXT_STATUS_INTERNAL = 'internal_404' # full site URL that resolves locally but build says 404 + + +class _NoFollowRedirectHandler(HTTPRedirectHandler): + """Prevent urllib from automatically following redirects.""" + def redirect_request(self, req, fp, code, msg, headers, newurl): + return None # returning None makes urllib raise HTTPError with the 3xx code + + +def _build_no_redirect_opener(): + return build_opener(_NoFollowRedirectHandler()) + + +def _fetch(url, headers, method='HEAD', follow_redirects=False): + """ + Make a single HTTP request. + + follow_redirects=False: do not follow redirects; 3xx responses return the + code and Location header so the caller can decide what to do. + follow_redirects=True: follow the full redirect chain (standard urlopen behaviour). + + Returns (status_code_or_None, final_url, location_header_or_None, error_str_or_None). 
+ """ + try: + req = Request(url, headers=headers, method=method) + if follow_redirects: + with urlopen(req, timeout=EXT_TIMEOUT) as resp: + return resp.status, resp.url, None, None + else: + opener = _build_no_redirect_opener() + with opener.open(req, timeout=EXT_TIMEOUT) as resp: + return resp.status, url, resp.headers.get('Location'), None + except HTTPError as e: + loc = e.headers.get('Location') if hasattr(e, 'headers') and e.headers else None + return e.code, url, loc, None + except (URLError, socket.timeout, socket.error, ConnectionRefusedError, + http.client.RemoteDisconnected, http.client.IncompleteRead) as e: + return None, url, None, str(e) + except Exception as e: + return None, url, None, f'{type(e).__name__}: {str(e)[:80]}' + + +def _classify_connection_error(result, err): + """Populate result with the right status for a network-level error string.""" + if 'ECONNREFUSED' in err or 'Connection refused' in err: + result['status'] = EXT_STATUS_DOWN + result['error_msg'] = 'ECONNREFUSED β€” server down' + elif ('Name or service not known' in err or 'getaddrinfo' in err + or 'nodename' in err.lower() or 'No address' in err): + result['status'] = EXT_STATUS_DOWN + result['error_msg'] = 'DNS resolution failed' + elif 'timed out' in err.lower() or 'timeout' in err.lower(): + result['status'] = EXT_STATUS_DOWN + result['error_msg'] = 'Connection timed out' + elif 'SSL' in err or 'ssl' in err: + result['status'] = EXT_STATUS_DOWN + result['error_msg'] = f'SSL error: {err[:80]}' + else: + result['status'] = EXT_STATUS_DOWN + result['error_msg'] = f'Connection error: {err[:80]}' + return result + + +def _check_destination(dest_url, headers): + """ + Verify that a redirect destination is actually reachable (200). + Follows the full redirect chain from dest_url. + Returns (status_code_or_None, final_url, error_str_or_None). 
+ """ + code, final, _, err = _fetch(dest_url, headers, method='HEAD', follow_redirects=True) + if err: + return None, dest_url, err + if code in (403, 405): + # Some servers reject HEAD β€” retry with GET + code, final, _, err = _fetch(dest_url, headers, method='GET', follow_redirects=True) + if err: + return None, dest_url, err + return code, final or dest_url, None + + +def check_external_url(url): + """ + Check a single external URL. + + Strategy: + 1. HEAD request WITHOUT following redirects so we can see whether + the URL itself redirects (and where). + 2. If 3xx: explicitly fetch the redirect destination and verify it + returns 200. Only report as EXT_STATUS_REDIRECT if the destination + is reachable. A redirect that leads to a 404/down is reported as + the appropriate broken status. + 3. If HEAD is rejected (403/405): retry with GET, same logic. + + Returns dict: {url, status, http_code, final_url, error_msg} + """ + result = { + 'url': url, + 'status': EXT_STATUS_ERROR, + 'http_code': None, + 'final_url': None, + 'error_msg': None, + } + + # Special case: links to our own live site β€” check against local build + parsed = urlparse(url) + if parsed.netloc == SITE_DOMAIN: + exists, tried = resolve_site_url_locally(url) + if exists: + result['status'] = EXT_STATUS_OK + else: + result['status'] = EXT_STATUS_INTERNAL + result['error_msg'] = f"Not in local build: {tried}" + return result + + headers = { + 'User-Agent': USER_AGENT, + 'Accept': 'text/html,application/xhtml+xml,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9', + } + + # ── Step 1: initial request (no auto-redirect) ── + code, _, location, err = _fetch(url, headers, method='HEAD', follow_redirects=False) + + if err: + return _classify_connection_error(result, err) + + # HEAD rejected β†’ retry with GET (no auto-redirect) + if code in (403, 405): + code, _, location, err = _fetch(url, headers, method='GET', follow_redirects=False) + if err: + return _classify_connection_error(result, err) + if code 
in (403, 405): + result['status'] = EXT_STATUS_ERROR + result['http_code'] = code + result['error_msg'] = f"HTTP {code} (GET retry)" + result['final_url'] = url + return result + + result['http_code'] = code + + # ── Step 2: classify based on response code ── + if code is None: + result['status'] = EXT_STATUS_ERROR + return result + + if code == 200: + result['status'] = EXT_STATUS_OK + result['final_url'] = url + + elif code == 404: + result['status'] = EXT_STATUS_404 + result['error_msg'] = 'HTTP 404' + result['final_url'] = url + + elif code in (301, 302, 303, 307, 308): + # ── Redirect: verify the destination is actually reachable ── + dest = location or url + # Make dest absolute if it's a relative Location header + if dest and not dest.startswith('http'): + p = urlparse(url) + dest = f"{p.scheme}://{p.netloc}{dest}" + + dest_code, dest_final, dest_err = _check_destination(dest, headers) + + if dest_err: + result['status'] = EXT_STATUS_DOWN + result['error_msg'] = f"Redirect to {dest!r} failed: {dest_err[:80]}" + result['final_url'] = dest + elif dest_code is None: + result['status'] = EXT_STATUS_DOWN + result['error_msg'] = f"Redirect destination unreachable: {dest!r}" + result['final_url'] = dest + elif dest_code == 200: + if _urls_differ_meaningfully(url, dest_final): + result['status'] = EXT_STATUS_REDIRECT + result['final_url'] = dest_final + else: + result['status'] = EXT_STATUS_OK + result['final_url'] = dest_final + elif dest_code == 404: + result['status'] = EXT_STATUS_404 + result['error_msg'] = f"Redirect target returned 404 ({dest!r})" + result['final_url'] = dest + else: + result['status'] = EXT_STATUS_ERROR + result['error_msg'] = f"Redirect target returned HTTP {dest_code}" + result['final_url'] = dest + + else: + # Any other 2xx is fine; other codes treated as errors + if 200 <= code < 300: + result['status'] = EXT_STATUS_OK + result['final_url'] = url + else: + result['status'] = EXT_STATUS_ERROR + result['error_msg'] = f"HTTP {code}" + 
result['final_url'] = url + + return result + + +def _urls_differ_meaningfully(original, final): + """True if the URLs differ in a way that's worth reporting (not just httpβ†’https or trailing slash).""" + if not final or original == final: + return False + o = urlparse(original) + f = urlparse(final) + o_path = o.path.rstrip('/') + f_path = f.path.rstrip('/') + # Same host+path, only scheme or trailing-slash differs β†’ not meaningful + if o.netloc == f.netloc and o_path == f_path and o.query == f.query: + return False + # http β†’ https upgrade on same host/path β†’ not meaningful + if (o.netloc == f.netloc and o_path == f_path + and o.scheme == 'http' and f.scheme == 'https'): + return False + return True + + +def check_external_urls_threaded(url_to_sources, threads=EXT_THREADS): + """ + Check a dict of {url: [source_files]} concurrently. + Returns dict of {url: check_result_dict}. + """ + urls = list(url_to_sources.keys()) + results = {} + lock = threading.Lock() + q = queue.Queue() + + for url in urls: + q.put(url) + + total = len(urls) + done = [0] + + def worker(): + while True: + try: + url = q.get_nowait() + except queue.Empty: + break + time.sleep(EXT_DELAY) + res = check_external_url(url) + with lock: + results[url] = res + done[0] += 1 + n = done[0] + if n % 10 == 0 or n == total: + print(f" External: {n}/{total} checked...", end='\r', flush=True) + q.task_done() + + thread_list = [threading.Thread(target=worker, daemon=True) for _ in range(min(threads, len(urls)))] + for t in thread_list: + t.start() + for t in thread_list: + t.join() + + print() # newline after \r progress + return results + + +# ───────────────────────────────────────────── +# Markdown file checker +# ───────────────────────────────────────────── + +def check_markdown_files(check_external=True): + """ + Scan all .md/.mdx source files. + + Internal links are verified against the BUILD output: + - page existence: does the corresponding build HTML file exist? 
+ - anchor existence: is the anchor present as an id attribute in the rendered HTML? + No slug inference is performed at any point. + + Returns: + - broken_internal: list of broken internal link dicts + - external_url_to_sources: dict {url: [(source_file, link_text)]} + - stats + """ + broken_internal = [] + external_url_to_src = defaultdict(list) + files_checked = 0 + links_checked = 0 + html_id_cache = {} # str(html_path) β†’ frozenset of id strings + + if not BUILD_DIR.exists(): + print(" WARNING: build/ directory not found.") + print(" Run 'npm run build' first β€” internal links cannot be checked without it.") + + md_files = sorted(list(DOCS_DIR.rglob('*.md')) + list(DOCS_DIR.rglob('*.mdx'))) + + for md_file in md_files: + files_checked += 1 + try: + content = md_file.read_text(encoding='utf-8', errors='replace') + except Exception as e: + broken_internal.append({ + 'source': str(md_file), 'link_text': '', 'link_url': '', + 'resolved': '', 'reason': f'Could not read file: {e}', + }) + continue + + # Build HTML for this source file β€” used for anchor-only (#frag) links + source_build_html = md_path_to_build_html(md_file) + + links = extract_md_links(content) + + for link_text, url in links: + url = url.strip() + if not url or url == '#': + continue + if any(url.startswith(s) for s in IGNORE_SCHEMES): + continue + + parsed_url = urlparse(url) + if any(parsed_url.hostname and parsed_url.hostname.startswith(h) for h in IGNORE_HOSTS): + continue + + links_checked += 1 + + # ── External / self-site links ── + if any(url.startswith(s) for s in EXTERNAL_SCHEMES): + if check_external: + external_url_to_src[url].append((str(md_file), link_text)) + continue + + # ── Split anchor from path ── + anchor = None + link_path = url + if '#' in link_path: + link_path, anchor = link_path.split('#', 1) + + # ── Determine target build HTML ── + if not link_path: + # Anchor-only link β€” same page + target_html = source_build_html + else: + target_html, reason = 
resolve_internal_to_build_html(md_file, link_path) + if reason or target_html is None or not target_html.exists(): + broken_internal.append({ + 'source': str(md_file), + 'link_text': link_text, + 'link_url': url, + 'resolved': str(target_html) if target_html else link_path, + 'reason': reason or 'Build HTML not found', + }) + continue + + # ── Check anchor in rendered HTML ── + if anchor and target_html and target_html.exists(): + key = str(target_html) + if key not in html_id_cache: + html_id_cache[key] = get_html_ids(target_html) + if anchor not in html_id_cache[key]: + broken_internal.append({ + 'source': str(md_file), + 'link_text': link_text, + 'link_url': url, + 'resolved': f'{target_html}#{anchor}', + 'reason': f'Anchor "#{anchor}" not found in rendered HTML', + }) + + return broken_internal, dict(external_url_to_src), files_checked, links_checked, len(md_files) + + +# ───────────────────────────────────────────── +# HTML build checker +# ───────────────────────────────────────────── + +class LinkExtractor(HTMLParser): + def __init__(self): + super().__init__() + self.links = [] + self.ids = set() + + def handle_starttag(self, tag, attrs): + attrs_dict = dict(attrs) + if 'id' in attrs_dict: + self.ids.add(attrs_dict['id']) + if tag == 'a' and 'href' in attrs_dict: + self.links.append(('href', attrs_dict['href'])) + elif tag in ('img', 'script') and 'src' in attrs_dict: + self.links.append(('src', attrs_dict['src'])) + elif tag == 'link' and 'href' in attrs_dict: + self.links.append(('href', attrs_dict['href'])) + + +def get_html_ids(html_file): + try: + content = html_file.read_text(encoding='utf-8', errors='replace') + except Exception: + return set() + parser = LinkExtractor() + parser.feed(content) + return parser.ids + + +def resolve_html_link(source_html, href, build_root): + anchor = None + if '#' in href: + href, anchor = href.split('#', 1) + + href = unquote(href) + if not href: + return None, anchor, None + + if href.startswith('/'): + rel = 
href.lstrip('/') + target = build_root / rel + candidates = [target] + if target.suffix == '': + candidates.append(target / 'index.html') + else: + source_dir = source_html.parent + target = (source_dir / href).resolve() + candidates = [target] + if target.suffix == '': + candidates.append(target / 'index.html') + + for c in candidates: + if c.exists() and c.is_file(): + return c, anchor, None + return target, anchor, "File not found" + + +def check_html_files(): + broken = [] + files_checked = 0 + links_checked = 0 + id_cache = {} + + html_files = sorted(BUILD_DIR.rglob('*.html')) + + for html_file in html_files: + files_checked += 1 + try: + content = html_file.read_text(encoding='utf-8', errors='replace') + except Exception as e: + broken.append({'source': str(html_file), 'attr': 'href', 'link_url': '', + 'resolved': '', 'reason': f'Could not read: {e}'}) + continue + + parser = LinkExtractor() + parser.feed(content) + file_ids = parser.ids + + for attr, url in parser.links: + url = url.strip() + if not url or url == '#': + continue + if any(url.startswith(s) for s in EXTERNAL_SCHEMES + IGNORE_SCHEMES + ('data:',)): + continue + + links_checked += 1 + + if url.startswith('#'): + anchor = url[1:] + if anchor and anchor not in file_ids: + broken.append({'source': str(html_file), 'attr': attr, 'link_url': url, + 'resolved': f'{html_file}#{anchor}', + 'reason': f'Anchor "#{anchor}" not found in same page'}) + continue + + resolved, anchor, reason = resolve_html_link(html_file, url, BUILD_DIR) + + if reason: + broken.append({'source': str(html_file), 'attr': attr, 'link_url': url, + 'resolved': str(resolved) if resolved else url, 'reason': reason}) + continue + + if anchor and resolved and resolved.exists(): + key = str(resolved) + if key not in id_cache: + id_cache[key] = get_html_ids(resolved) + if anchor not in id_cache[key]: + broken.append({'source': str(html_file), 'attr': attr, 'link_url': url, + 'resolved': f'{resolved}#{anchor}', + 'reason': f'Anchor 
"#{anchor}" not found in target HTML'}) + + return broken, files_checked, links_checked, len(html_files) + + +# ───────────────────────────────────────────── +# Deduplication +# ───────────────────────────────────────────── + +def deduplicate_html_broken(broken): + groups = defaultdict(list) + for item in broken: + groups[(item['link_url'], item['reason'])].append(item) + result = [] + for (url, reason), items in sorted(groups.items()): + rep = dict(items[0]) + rep['count'] = len(items) + rep['example_sources'] = [it['source'] for it in items[:3]] + result.append(rep) + return result + + +# ───────────────────────────────────────────── +# Report +# ───────────────────────────────────────────── + +def make_short_path(path_str, base): + try: + return str(Path(path_str).relative_to(base)) + except ValueError: + try: + return str(Path(path_str).relative_to(PROJECT_DIR)) + except ValueError: + return path_str + + +def write_report( + md_broken, ext_results, ext_url_to_src, + md_files_checked, md_links_checked, md_total_files, + html_broken, html_files_checked, html_links_checked, html_total_files, + staged_replacements=None, +): + import datetime + today = datetime.date.today().isoformat() + + # Categorise external results + ext_404 = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_404} + ext_down = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_DOWN} + ext_redirect = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_REDIRECT} + ext_internal = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_INTERNAL} + ext_error = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_ERROR} + + _staged = staged_replacements or {} + + def _repl(url, res=None): + if url in _staged: + return _staged[url] + final = (res or {}).get('final_url') or '' + return final if final and final != url else '' + + deduped_html = deduplicate_html_broken(html_broken) + + lines = [] + lines.append("# Dead Links 
Report\n") + lines.append(f"Generated: {today}\n") + lines.append("") + + # ── Summary ── + lines.append("## Summary\n") + lines.append("| Category | Count |") + lines.append("|---|---|") + lines.append(f"| Source doc files checked | {md_files_checked} / {md_total_files} |") + lines.append(f"| Internal links checked (source) | {md_links_checked} |") + lines.append(f"| **Broken internal links (source)** | **{len(md_broken)}** |") + lines.append(f"| External URLs checked | {len(ext_results)} |") + lines.append(f"| **External 404s** | **{len(ext_404) + len(ext_internal)}** |") + lines.append(f"| **External down / refused** | **{len(ext_down)}** |") + lines.append(f"| **Stale redirects** | **{len(ext_redirect)}** |") + lines.append(f"| External errors (timeout/misc) | {len(ext_error)} |") + lines.append(f"| Build HTML files checked | {html_files_checked} / {html_total_files} |") + lines.append(f"| **Broken links in build output** | **{len(deduped_html)} patterns** |") + lines.append("") + + # ── Section 1: Internal broken links ── + lines.append("---\n") + lines.append("## Section 1: Broken Internal Links in Source Docs\n") + + if not md_broken: + lines.append("_No broken internal links._\n") + else: + by_file = defaultdict(list) + for item in md_broken: + by_file[item['source']].append(item) + for source in sorted(by_file): + short = make_short_path(source, DOCS_DIR) + lines.append(f"### `{short}`\n") + lines.append("| Link Text | URL | Resolved Path | Reason |") + lines.append("|---|---|---|---|") + for item in by_file[source]: + text = item['link_text'].replace('|', '\\|')[:60] + url = item['link_url'].replace('|', '\\|')[:80] + resolved = make_short_path(item['resolved'], DOCS_DIR).replace('|', '\\|')[:100] + reason = item['reason'].replace('|', '\\|') + lines.append(f"| {text} | `{url}` | `{resolved}` | {reason} |") + lines.append("") + + # ── Section 2: External 404s ── + lines.append("---\n") + lines.append("## Section 2: External 404s\n") + + all_404 = 
{**ext_404, **ext_internal} + if not all_404: + lines.append("_No external 404s found._\n") + else: + lines.append("| URL | Replacement URL | Notes | Instances (Link Text β€” File) |") + lines.append("|---|---|---|---|") + for url, res in sorted(all_404.items()): + instances = _fmt_instances(ext_url_to_src.get(url, [])) + code_str = f"HTTP {res['http_code']}" if res['http_code'] else (res['error_msg'] or '') + if res['status'] == EXT_STATUS_INTERNAL: + code_str = "Not found in local build" + repl = _repl(url, res) + lines.append(f"| `{url[:100]}` | {repl} | {code_str} | {instances} |") + lines.append("") + + # ── Section 3: Down / refused ── + lines.append("---\n") + lines.append("## Section 3: Down / Connection Refused\n") + + if not ext_down: + lines.append("_No unreachable external links._\n") + else: + lines.append("| URL | Replacement URL | Error | Instances (Link Text β€” File) |") + lines.append("|---|---|---|---|") + for url, res in sorted(ext_down.items()): + instances = _fmt_instances(ext_url_to_src.get(url, [])) + err = res.get('error_msg', '') or '' + repl = _repl(url, res) + lines.append(f"| `{url[:100]}` | {repl} | {err} | {instances} |") + lines.append("") + + # ── Section 4: Stale redirects ── + lines.append("---\n") + lines.append("## Section 4: Stale Redirects (Update to Final URL)\n") + + if not ext_redirect: + lines.append("_No stale redirects found._\n") + else: + lines.append("| Original URL | Redirects To | Instances (Link Text β€” File) |") + lines.append("|---|---|---|") + for url, res in sorted(ext_redirect.items()): + instances = _fmt_instances(ext_url_to_src.get(url, [])) + repl = _repl(url, res) + lines.append(f"| `{url[:80]}` | `{repl[:80]}` | {instances} |") + lines.append("") + + # ── Section 5: Errors / timeouts ── + if ext_error: + lines.append("---\n") + lines.append("## Section 5: External Check Errors (timeout / misc)\n") + lines.append("| URL | Error | Instances (Link Text β€” File) |") + lines.append("|---|---|---|") + for 
url, res in sorted(ext_error.items()): + instances = _fmt_instances(ext_url_to_src.get(url, [])) + err = res.get('error_msg', '') or '' + lines.append(f"| `{url[:100]}` | {err} | {instances} |") + lines.append("") + + # ── Section 6: Build HTML broken links ── + lines.append("---\n") + lines.append("## Section 6: Broken Links in Build Output\n") + lines.append("_Deduplicated by (url, reason) pattern._\n") + + if not deduped_html: + lines.append("_No broken links in build output._\n") + else: + lines.append("| Count | URL | Reason | Example Source |") + lines.append("|---|---|---|---|") + for item in sorted(deduped_html, key=lambda x: -x['count']): + url = item['link_url'].replace('|', '\\|')[:80] + reason = item['reason'].replace('|', '\\|') + example = make_short_path(item['example_sources'][0], BUILD_DIR).replace('|', '\\|')[:80] + lines.append(f"| {item['count']} | `{url}` | {reason} | `{example}` |") + lines.append("") + + REPORT_PATH.parent.mkdir(parents=True, exist_ok=True) + REPORT_PATH.write_text('\n'.join(lines), encoding='utf-8') + print(f"Report written to: {REPORT_PATH}") + + +# ───────────────────────────────────────────── +# Human-readable audit report +# ───────────────────────────────────────────── + +def _source_to_page_link(path_str): + """Return a markdown link like [/docs/foo/bar](https://docs.ethswarm.org/docs/foo/bar).""" + try: + rel = Path(path_str).relative_to(DOCS_DIR) + except ValueError: + return path_str + url_path = str(rel).replace('\\', '/').replace('.mdx', '').replace('.md', '') + display = f"/docs/{url_path}" + url = f"https://{SITE_DOMAIN}/docs/{url_path}" + return f"[{display}]({url})" + + +def _fmt_sources(sources_list, max_show=2): + """Format a list of (file, text) source tuples into page link(s).""" + if not sources_list: + return "Unknown" + seen = [] + for f, _ in sources_list: + lnk = _source_to_page_link(f) + if lnk not in seen: + seen.append(lnk) + if len(seen) > max_show: + return ", ".join(seen[:max_show]) + f" 
_(+{len(seen)-max_show} more)_" + return ", ".join(seen) + + +def _fmt_instances(sources_list, docs_dir=None): + """ + Format a list of (file_path, link_text) tuples as bullet points separated + by
tags (for inline rendering in markdown table cells). + + Each bullet: β€’ "link text" β€” `relative/file/path.md` + """ + if not sources_list: + return "_unknown_" + if docs_dir is None: + docs_dir = DOCS_DIR + bullets = [] + for f, text in sources_list: + short = make_short_path(f, docs_dir).replace('|', '\\|') + safe_text = (text or '').strip().replace('|', '\\|')[:80] + if safe_text: + bullets.append(f'β€’ "{safe_text}" β€” `{short}`') + else: + bullets.append(f'β€’ `{short}`') + return "
".join(bullets) + + +def write_human_report( + md_broken, ext_results, ext_url_to_src, + md_files_checked, md_links_checked, md_total_files, + html_broken, html_files_checked, html_links_checked, html_total_files, + staged_replacements=None, +): + import datetime + today = datetime.date.today().isoformat() + + # Categorise external results + ext_404 = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_404} + ext_down = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_DOWN} + ext_redirect = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_REDIRECT} + ext_internal = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_INTERNAL} + ext_error = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_ERROR} + + # Self-site 404s (docs.ethswarm.org old paths) vs truly external 404s + self_404 = {**ext_internal} # checked against local build, not found + real_404 = {**ext_404} # HTTP 404 from external server + + _staged = staged_replacements or {} + + def _repl(url, res=None): + if url in _staged: + return _staged[url] + final = (res or {}).get('final_url') or '' + return final if final and final != url else '' + + n_dead = len(md_broken) + len(self_404) + len(real_404) + n_down = len(ext_down) + n_redirects = len(ext_redirect) + n_errors = len(ext_error) + n_total = n_dead + n_down + n_redirects + + lines = [] + lines.append("## Context\n") + lines.append( + f"Dead link audit of {SITE_DOMAIN} found **{n_total}** broken, down, or stale links. 
" + f"Audit date: {today}.\n" + ) + + # ── Dead Links (404) ────────────────────────────────────────────────────── + lines.append("---\n") + lines.append("## Dead Links (404)\n") + + if not md_broken and not self_404 and not real_404: + lines.append("_No dead links found._\n") + else: + lines.append("| Dead Link | Replacement URL | Status | Instances (Link Text β€” File) |") + lines.append("|---|---|---|---|") + + # Broken internal links (wrong file path or missing anchor) + for item in md_broken: + url = item['link_url'].replace('|', '\\|') + reason = item['reason'].replace('|', '\\|') + instances = _fmt_instances([(item['source'], item.get('link_text', ''))]) + lines.append(f"| `{url}` | | **Broken** β€” {reason} | {instances} |") + + # Self-site 404s (old docs.ethswarm.org paths not in local build) + for url, _res in sorted(self_404.items()): + instances = _fmt_instances(ext_url_to_src.get(url, [])) + lines.append(f"| {url} | | **404** β€” not found in local build (old path?) | {instances} |") + + # External 404s + for url, res in sorted(real_404.items()): + instances = _fmt_instances(ext_url_to_src.get(url, [])) + repl = _repl(url, res) + lines.append(f"| {url} | {repl} | **404** | {instances} |") + + lines.append("") + + # ── Forbidden / Down ───────────────────────────────────────────────────── + lines.append("---\n") + lines.append("## Forbidden / Down\n") + + if not ext_down: + lines.append("_No unreachable links._\n") + else: + lines.append("| Dead Link | Replacement URL | Status | Instances (Link Text β€” File) |") + lines.append("|---|---|---|---|") + for url, res in sorted(ext_down.items()): + instances = _fmt_instances(ext_url_to_src.get(url, [])) + err = res.get('error_msg') or 'connection failed' + repl = _repl(url, res) + # Simplify error messages + if 'DNS' in err or 'getaddrinfo' in err.lower(): + status = "**DNS failure** β€” domain not found" + elif 'ECONNREFUSED' in err or 'Connection refused' in err: + status = "**ECONNREFUSED** β€” server 
down" + elif 'timed out' in err.lower() or 'timeout' in err.lower(): + status = "**Timeout** β€” server unresponsive" + elif 'SSL' in err or 'ssl' in err: + status = "**SSL error** β€” handshake failure" + else: + status = f"**Down** β€” {err[:80]}" + lines.append(f"| {url} | {repl} | {status} | {instances} |") + + lines.append("") + + # ── Stale Redirects ─────────────────────────────────────────────────────── + lines.append("---\n") + lines.append("## Stale Redirects (Should Update)\n") + + if not ext_redirect: + lines.append("_No stale redirects._\n") + else: + lines.append("| Old Link | Redirects To | Instances (Link Text β€” File) |") + lines.append("|---|---|---|") + for url, res in sorted(ext_redirect.items()): + instances = _fmt_instances(ext_url_to_src.get(url, [])) + repl = _repl(url, res) + lines.append(f"| {url} | {repl} | {instances} |") + + lines.append("") + + # ── Errors / Timeouts ──────────────────────────────────────────────────── + if ext_error: + lines.append("---\n") + lines.append("## Check Errors (timeout / blocked)\n") + lines.append("_These URLs could not be verified β€” check manually._\n") + lines.append("| URL | Error | Instances (Link Text β€” File) |") + lines.append("|---|---|---|") + for url, res in sorted(ext_error.items()): + instances = _fmt_instances(ext_url_to_src.get(url, [])) + err = res.get('error_msg') or '' + lines.append(f"| {url} | {err} | {instances} |") + lines.append("") + + # ── Summary ─────────────────────────────────────────────────────────────── + lines.append("---\n") + lines.append("## Summary\n") + lines.append(f"- **Broken internal links:** {len(md_broken)}") + lines.append(f"- **Hard 404s (external):** {len(real_404) + len(self_404)}") + lines.append(f"- **Forbidden / Down:** {n_down}") + lines.append(f"- **Stale redirects:** {n_redirects}") + if ext_error: + lines.append(f"- **Check errors (unverified):** {n_errors}") + lines.append(f"- **Total actionable:** {n_total}") + lines.append("") + + # ── Priority 
───────────────────────────────────────────────────────────── + lines.append("---\n") + lines.append("## Priority\n") + priority = [] + if md_broken: + priority.append(f"1. Fix {len(md_broken)} broken internal links (wrong paths / missing anchors)") + if self_404: + priority.append(f"{len(priority)+1}. Update {len(self_404)} old self-referential `{SITE_DOMAIN}` path(s) to current URLs") + if real_404: + priority.append(f"{len(priority)+1}. Remove or replace {len(real_404)} dead external link(s) (HTTP 404)") + if ext_down: + priority.append(f"{len(priority)+1}. Evaluate {len(ext_down)} down/refused server link(s) β€” remove or replace") + if ext_redirect: + priority.append(f"{len(priority)+1}. Update {len(ext_redirect)} stale redirect(s) to their final URL") + if ext_error: + priority.append(f"{len(priority)+1}. Manually verify {len(ext_error)} URL(s) that returned errors during check") + for item in priority: + lines.append(item) + lines.append("") + + HUMAN_REPORT_PATH.parent.mkdir(parents=True, exist_ok=True) + HUMAN_REPORT_PATH.write_text('\n'.join(lines), encoding='utf-8') + print(f"Human report written to: {HUMAN_REPORT_PATH}") + + +# ───────────────────────────────────────────── +# Staged-changes URL replacement map +# ───────────────────────────────────────────── + +def get_staged_url_replacements(): + """ + Parse `git diff --cached` to find URL replacements in staged changes. + Within each diff hunk, URLs on removed lines (-) are matched to URLs on + added lines (+) in order. Returns {old_url: new_url}. 
+ """ + url_re = re.compile(r'https?://[^\s\])"\'<>`\\]+') + try: + result = subprocess.run( + ['git', 'diff', '--cached', '--unified=0'], + cwd=str(PROJECT_DIR), + capture_output=True, text=True, + ) + if result.returncode != 0 or not result.stdout: + return {} + except Exception: + return {} + + replacements = {} + removed, added = [], [] + + def _flush(): + removed_set = set(removed) + added_set = set(added) + gone = [u for u in removed if u not in added_set] + new = [u for u in added if u not in removed_set] + if gone and new: + for old, new_url in zip(gone, new): + replacements[old] = new_url + + for line in result.stdout.splitlines(): + if line.startswith(('diff --git', 'index ', '--- ', '+++ ')): + continue + if line.startswith('@@'): + _flush() + removed, added = [], [] + elif line.startswith('-'): + removed.extend(url_re.findall(line[1:])) + elif line.startswith('+'): + added.extend(url_re.findall(line[1:])) + _flush() + return replacements + + +# ───────────────────────────────────────────── +# Build helper +# ───────────────────────────────────────────── + +def _build_is_outdated(): + """ + Return True if any source file in docs/, static/, or key config files + was modified more recently than the build directory itself. + """ + try: + build_mtime = BUILD_DIR.stat().st_mtime + except FileNotFoundError: + return True # no build at all + + watch_dirs = [DOCS_DIR, STATIC_DIR] + watch_files = [ + PROJECT_DIR / "docusaurus.config.mjs", + PROJECT_DIR / "sidebars.js", + ] + + for d in watch_dirs: + if d.exists(): + for f in d.rglob("*"): + if f.is_file() and f.stat().st_mtime > build_mtime: + return True + + for f in watch_files: + if f.exists() and f.stat().st_mtime > build_mtime: + return True + + return False + + +def trigger_build(): + """ + Ensure a current build exists before running local checks. + + Behaviour: + β€’ No build found β†’ build immediately, no prompt needed. + β€’ Build found, up to date β†’ ask permission to overwrite. 
+ β€’ Build found, outdated β†’ warn user, ask if they want to rebuild. + + Returns True if the build is ready to use, False on build failure or abort. + """ + if not BUILD_DIR.exists(): + print("\nNo existing build found β€” running: npm run build") + print("-" * 40) + result = subprocess.run([_NPM, 'run', 'build'], cwd=str(PROJECT_DIR)) + print("-" * 40) + if result.returncode != 0: + print("ERROR: Build failed (see output above).", file=sys.stderr) + return False + print("Build complete.\n") + return True + + # Build exists β€” check freshness + outdated = _build_is_outdated() + if outdated: + print(f"\nWARNING: The existing build at {BUILD_DIR} is outdated") + print(" (source files have changed since it was last built).") + prompt = "Rebuild now to get accurate results? This will overwrite it. [Y/n]: " + else: + print(f"\nAn existing build was found at: {BUILD_DIR} (appears up to date).") + prompt = "Rebuild now anyway? This will overwrite it. [y/N]: " + + try: + resp = input(prompt).strip().lower() + except (EOFError, KeyboardInterrupt): + print() + resp = '' + + # For outdated builds default is YES; for current builds default is NO + if outdated: + do_build = resp not in ('n', 'no') + else: + do_build = resp in ('y', 'yes') + + if not do_build: + if outdated: + print("Skipping rebuild β€” results may not reflect latest changes.\n") + else: + print("Skipping rebuild β€” using existing build.\n") + return True + + print("\nRunning: npm run build") + print("-" * 40) + result = subprocess.run([_NPM, 'run', 'build'], cwd=str(PROJECT_DIR)) + print("-" * 40) + if result.returncode != 0: + print("ERROR: Build failed (see output above).", file=sys.stderr) + return False + print("Build complete.\n") + return True + + +# ───────────────────────────────────────────── +# Main +# ───────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser( + description='Bee-docs link checker', + formatter_class=argparse.RawDescriptionHelpFormatter, + 
epilog=( + "Modes:\n" + " local β€” build the site locally and check source docs + build output\n" + " live β€” fetch the live site at docs.ethswarm.org and check all links\n" + ), + ) + parser.add_argument( + '--mode', choices=['local', 'live'], default=None, + help='Check mode: "local" (build + source check) or "live" (live site crawl). ' + 'If omitted you will be prompted.', + ) + parser.add_argument('--no-external', action='store_true', + help='(local mode only) Skip external URL checking') + parser.add_argument('--threads', type=int, default=EXT_THREADS, + help=f'Concurrent HTTP threads (default: {EXT_THREADS})') + args = parser.parse_args() + + # ── Mode selection ── + mode = args.mode + if mode is None: + print("=== Bee-docs Link Checker ===\n") + print("Which site do you want to check?") + print(" 1) local β€” build locally and check source docs + build output") + print(" 2) live β€” fetch the live site at docs.ethswarm.org\n") + try: + choice = input("Enter 1 or 2 [default: 1]: ").strip() + except (EOFError, KeyboardInterrupt): + print() + choice = '1' + mode = 'live' if choice == '2' else 'local' + print() + + # ── Live mode: delegate to check_live_links.py ── + if mode == 'live': + live_script = Path(__file__).parent / 'check_live_links.py' + if not live_script.exists(): + print(f"ERROR: {live_script} not found.", file=sys.stderr) + sys.exit(1) + cmd = [sys.executable, str(live_script), '--threads', str(args.threads)] + print(f"Running live checker: {' '.join(cmd)}\n") + result = subprocess.run(cmd) + sys.exit(result.returncode) + + # ── Local mode ── + check_ext = not args.no_external + + print("=== Bee-docs Link Checker β€” Local Mode ===") + print(f"Docs dir : {DOCS_DIR}") + print(f"Build dir : {BUILD_DIR}") + print(f"External : {'enabled' if check_ext else 'disabled (--no-external)'}") + print() + + if not DOCS_DIR.exists(): + print(f"ERROR: Docs dir not found: {DOCS_DIR}") + sys.exit(1) + + # Always trigger a build for local mode + if not 
trigger_build(): + sys.exit(1) + + print("Scanning source docs (internal links)...") + md_broken, ext_url_to_src, md_files, md_links, md_total = check_markdown_files(check_ext) + print(f" Files: {md_files}/{md_total}, Links: {md_links}, Broken internal: {len(md_broken)}") + print(f" Unique external URLs collected: {len(ext_url_to_src)}") + + ext_results = {} + if check_ext and ext_url_to_src: + print(f"\nChecking {len(ext_url_to_src)} external URLs ({args.threads} threads)...") + ext_results = check_external_urls_threaded(ext_url_to_src, threads=args.threads) + ok = sum(1 for r in ext_results.values() if r['status'] == EXT_STATUS_OK) + redirects = sum(1 for r in ext_results.values() if r['status'] == EXT_STATUS_REDIRECT) + not_found = sum(1 for r in ext_results.values() if r['status'] in (EXT_STATUS_404, EXT_STATUS_INTERNAL)) + down = sum(1 for r in ext_results.values() if r['status'] == EXT_STATUS_DOWN) + errors = sum(1 for r in ext_results.values() if r['status'] == EXT_STATUS_ERROR) + print(f" OK: {ok} Redirect: {redirects} 404: {not_found} Down: {down} Error: {errors}") + + html_broken = [] + html_files = html_links = html_total = 0 + if BUILD_DIR.exists(): + print("\nChecking build output (HTML internal links)...") + html_broken, html_files, html_links, html_total = check_html_files() + print(f" Files: {html_files}/{html_total}, Links: {html_links}, Broken: {len(html_broken)}") + + staged = get_staged_url_replacements() + if staged: + print(f"\nFound {len(staged)} staged URL replacement(s) from git diff.") + + print("\nWriting report...") + write_report( + md_broken, ext_results, ext_url_to_src, + md_files, md_links, md_total, + html_broken, html_files, html_links, html_total, + staged_replacements=staged, + ) + write_human_report( + md_broken, ext_results, ext_url_to_src, + md_files, md_links, md_total, + html_broken, html_files, html_links, html_total, + staged_replacements=staged, + ) + + +if __name__ == '__main__': + main() diff --git 
a/scripts/check_live_links.py b/scripts/check_live_links.py new file mode 100644 index 00000000..b22334c5 --- /dev/null +++ b/scripts/check_live_links.py @@ -0,0 +1,689 @@ +#!/usr/bin/env python3 +""" +Live site link checker for docs.ethswarm.org. + +Fetches all pages listed in the sitemap, extracts every link, +then checks each link with explicit redirect handling (no auto-following). + +Usage: + python scripts/check_live_links.py [--threads N] [--max-pages N] + npm run check:links (then select live mode) + +Output: + link-reports/live_links_audit.md β€” human-readable report +""" + +import re +import sys +import time +import queue +import socket +import threading +import http.client +import subprocess +import xml.etree.ElementTree as ET +import argparse +import datetime +from html.parser import HTMLParser +from pathlib import Path +from urllib.parse import urlparse, urljoin, unquote +from urllib.request import Request, urlopen, HTTPRedirectHandler, build_opener +from urllib.error import URLError, HTTPError +from collections import defaultdict + +# ───────────────────────────────────────────── +# Configuration +# ───────────────────────────────────────────── + +SITE_BASE = "https://docs.ethswarm.org" +SITEMAP_URL = f"{SITE_BASE}/sitemap.xml" +PROJECT_DIR = Path(__file__).resolve().parent.parent +REPORT_PATH = PROJECT_DIR / "link-reports/live_links_audit.md" + +EXT_TIMEOUT = 15 # seconds per HTTP request +EXT_THREADS = 8 # concurrent URL checkers +EXT_DELAY = 0.05 # seconds between requests per thread + +USER_AGENT = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/122.0 Safari/537.36 bee-docs-live-checker/1.0" +) + +IGNORE_SCHEMES = ("mailto:", "javascript:", "tel:", "ftp:", "data:", "#") +IGNORE_HOSTS = ("localhost", "127.0.0.1", "192.168.", "10.0.", "0.0.0.0") +# Hostnames that end with these suffixes are placeholder/example URLs in docs +IGNORE_HOST_SUFFIXES = (".example", ".local", ".invalid", ".test") + +# URL
substrings to silently ignore β€” systematic redirects that aren't actionable doc fixes. +# e.g. every page has an "Edit this page" link using the old GitHub repo name. +IGNORE_URL_PATTERNS = ( + "github.com/ethersphere/docs.github.io", # "Edit this page" links using old repo name +) + +# Hostnames to ignore because they are example/template values in documentation +IGNORE_EXAMPLE_HOSTS = ( + "yourname.eth.limo", + "yourname.bzz.link", + "bee-1", # example service hostname in docker/gateway examples +) + +EXT_STATUS_OK = 'ok' +EXT_STATUS_404 = '404' +EXT_STATUS_DOWN = 'down' +EXT_STATUS_REDIRECT = 'redirect' +EXT_STATUS_ERROR = 'error' + + +# ───────────────────────────────────────────── +# HTTP helpers (explicit redirect handling) +# ───────────────────────────────────────────── + +class _NoFollowRedirectHandler(HTTPRedirectHandler): + def redirect_request(self, req, fp, code, msg, headers, newurl): + return None + + +def _build_no_redirect_opener(): + return build_opener(_NoFollowRedirectHandler()) + + +def _fetch(url, method='HEAD', follow_redirects=False, timeout=EXT_TIMEOUT): + """ + Single HTTP request. + Returns (status_code, final_url, location_header, error_str). 
+ """ + headers = { + 'User-Agent': USER_AGENT, + 'Accept': 'text/html,application/xhtml+xml,*/*;q=0.8', + } + try: + req = Request(url, headers=headers, method=method) + if follow_redirects: + with urlopen(req, timeout=timeout) as resp: + return resp.status, resp.url, None, None + else: + opener = _build_no_redirect_opener() + with opener.open(req, timeout=timeout) as resp: + return resp.status, url, resp.headers.get('Location'), None + except HTTPError as e: + loc = e.headers.get('Location') if hasattr(e, 'headers') and e.headers else None + return e.code, url, loc, None + except (URLError, socket.timeout, socket.error, ConnectionRefusedError, + http.client.RemoteDisconnected, http.client.IncompleteRead) as e: + return None, url, None, str(e) + except Exception as e: + return None, url, None, f'{type(e).__name__}: {str(e)[:120]}' + + +def _classify_err(result, err): + if 'ECONNREFUSED' in err or 'Connection refused' in err: + result.update(status=EXT_STATUS_DOWN, error_msg='ECONNREFUSED β€” server down') + elif ('Name or service not known' in err or 'getaddrinfo' in err + or 'nodename' in err.lower() or 'No address' in err): + result.update(status=EXT_STATUS_DOWN, error_msg='DNS resolution failed') + elif 'timed out' in err.lower() or 'timeout' in err.lower(): + result.update(status=EXT_STATUS_DOWN, error_msg='Connection timed out') + elif 'SSL' in err or 'ssl' in err: + result.update(status=EXT_STATUS_DOWN, error_msg=f'SSL error: {err[:80]}') + else: + result.update(status=EXT_STATUS_DOWN, error_msg=f'Connection error: {err[:80]}') + return result + + +def _urls_differ(original, final): + if not final or original == final: + return False + o, f = urlparse(original), urlparse(final) + op, fp = o.path.rstrip('/'), f.path.rstrip('/') + if o.netloc == f.netloc and op == fp and o.query == f.query: + return False + if o.netloc == f.netloc and op == fp and o.scheme == 'http' and f.scheme == 'https': + return False + return True + + +def _check_dest(dest_url): + 
"""Follow redirect destination and verify it returns 200.""" + code, final, _, err = _fetch(dest_url, method='HEAD', follow_redirects=True) + if err: + return None, dest_url, err + if code in (403, 405): + code, final, _, err = _fetch(dest_url, method='GET', follow_redirects=True) + if err: + return None, dest_url, err + return code, final or dest_url, None + + +def check_url(url): + """ + Check a single URL with explicit redirect handling. + Returns dict: {url, status, http_code, final_url, error_msg} + """ + result = dict(url=url, status=EXT_STATUS_ERROR, + http_code=None, final_url=None, error_msg=None) + + # Step 1: HEAD without following redirects + code, _, location, err = _fetch(url, method='HEAD', follow_redirects=False) + if err: + return _classify_err(result, err) + + # HEAD rejected β†’ retry with GET + if code in (403, 405): + code, _, location, err = _fetch(url, method='GET', follow_redirects=False) + if err: + return _classify_err(result, err) + if code in (403, 405): + result.update(status=EXT_STATUS_ERROR, http_code=code, + error_msg=f'HTTP {code} (GET retry)', final_url=url) + return result + + result['http_code'] = code + + if code is None: + result['status'] = EXT_STATUS_ERROR + return result + if code == 200: + result.update(status=EXT_STATUS_OK, final_url=url) + elif code == 404: + result.update(status=EXT_STATUS_404, error_msg='HTTP 404', final_url=url) + elif code in (301, 302, 303, 307, 308): + dest = location or url + if dest and not dest.startswith('http'): + p = urlparse(url) + dest = f"{p.scheme}://{p.netloc}{dest}" + dest_code, dest_final, dest_err = _check_dest(dest) + if dest_err: + result.update(status=EXT_STATUS_DOWN, + error_msg=f"Redirect to {dest!r} failed: {dest_err[:80]}", + final_url=dest) + elif dest_code is None: + result.update(status=EXT_STATUS_DOWN, + error_msg=f"Redirect destination unreachable", + final_url=dest) + elif dest_code == 200: + if _urls_differ(url, dest_final): + result.update(status=EXT_STATUS_REDIRECT, 
final_url=dest_final) + else: + result.update(status=EXT_STATUS_OK, final_url=dest_final) + elif dest_code == 404: + result.update(status=EXT_STATUS_404, + error_msg=f"Redirect target returned 404", + final_url=dest) + else: + result.update(status=EXT_STATUS_ERROR, + error_msg=f"Redirect target returned HTTP {dest_code}", + final_url=dest) + elif 200 <= code < 300: + result.update(status=EXT_STATUS_OK, final_url=url) + else: + result.update(status=EXT_STATUS_ERROR, + error_msg=f'HTTP {code}', final_url=url) + + return result + + +# ───────────────────────────────────────────── +# Sitemap fetcher +# ───────────────────────────────────────────── + +def fetch_sitemap_urls(sitemap_url): + """Fetch sitemap.xml and return list of page URLs.""" + print(f"Fetching sitemap: {sitemap_url}") + try: + req = Request(sitemap_url, headers={'User-Agent': USER_AGENT}) + with urlopen(req, timeout=30) as resp: + xml_data = resp.read() + except Exception as e: + print(f"ERROR fetching sitemap: {e}", file=sys.stderr) + return [] + + urls = [] + try: + root = ET.fromstring(xml_data) + # Handle namespace + ns = '' + if root.tag.startswith('{'): + ns = root.tag.split('}')[0] + '}' + for loc in root.iter(f'{ns}loc'): + u = loc.text.strip() if loc.text else '' + if u: + urls.append(u) + except ET.ParseError as e: + print(f"ERROR parsing sitemap XML: {e}", file=sys.stderr) + + print(f" Found {len(urls)} URLs in sitemap") + return urls + + +# ───────────────────────────────────────────── +# HTML link extractor +# ───────────────────────────────────────────── + +class LinkExtractor(HTMLParser): + def __init__(self): + super().__init__() + self.links_with_text = [] # list of (href, link_text) from tags + self.text_chunks = [] # all visible text (including code blocks) for bare URL extraction + self._skip_depth = 0 # depth inside