-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetch.py
More file actions
100 lines (80 loc) · 2.98 KB
/
fetch.py
File metadata and controls
100 lines (80 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = []
# ///
"""Fetch and extract clean article content from any URL using the Diffbot Article API."""
import json
import os
import sys
import urllib.error
import urllib.parse
import urllib.request
from typing import Dict, Any
def fetch_diffbot(url: str, timeout: int = 30) -> Dict[str, Any]:
    """Fetch an article through the Diffbot Article API and return its key fields.

    The API token is read from the ``DIFFBOT_API_KEY`` environment variable.
    This function is written for command-line use: every failure is reported
    on stderr and terminates the process with exit status 1 instead of
    raising to the caller.

    Args:
        url: Address of the page to extract.
        timeout: Timeout in seconds passed to ``urllib.request.urlopen``.

    Returns:
        A dict with the keys ``title``, ``text``, ``author``, ``date`` and
        ``siteName``. ``title`` comes from the first returned object;
        ``text`` is the non-empty ``text`` of every returned object joined
        by blank lines; the remaining values come from the first object and
        are ``""`` when missing or null in the response.

    Raises:
        SystemExit: With code 1 when the API key is not set, the request
            fails, the API reports an error, or the response contains no
            object with a title.
    """
    api_key = os.environ.get("DIFFBOT_API_KEY")
    if not api_key:
        print("Error: DIFFBOT_API_KEY environment variable is not set.", file=sys.stderr)
        sys.exit(1)

    base_url = "https://api.diffbot.com/v3/article"
    query_string = urllib.parse.urlencode({"token": api_key, "url": url})
    request_url = f"{base_url}?{query_string}"

    try:
        req = urllib.request.Request(request_url)
        # Send a browser-like User-Agent header with the API request.
        req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
        with urllib.request.urlopen(req, timeout=timeout) as response:
            data = json.loads(response.read().decode("utf-8"))

        if "error" in data:
            print(f"Diffbot API Error: {data['error']}", file=sys.stderr)
            sys.exit(1)

        objects = data.get("objects", [])
        if not objects:
            print("Error: No objects returned from Diffbot API.", file=sys.stderr)
            sys.exit(1)

        first = objects[0]
        title = first.get("title")
        if not title:
            # Include a truncated dump of the response to help diagnose it.
            print(f"Error: No title found in response. Raw data: {json.dumps(data)[:200]}...", file=sys.stderr)
            sys.exit(1)

        texts = [obj.get("text", "") for obj in objects if obj.get("text")]
        combined_text = "\n\n".join(texts)

        # `or ""` also covers keys that are present but null in the JSON,
        # which `.get(key, "")` alone would hand back as None.
        return {
            "title": title,
            "text": combined_text,
            "author": first.get("author") or "",
            "date": first.get("date") or "",
            "siteName": first.get("siteName") or "",
        }
    except urllib.error.HTTPError as e:
        # The error body is not guaranteed to be valid UTF-8; a strict decode
        # failing here would replace the HTTP error report with a traceback.
        error_body = e.read().decode("utf-8", errors="replace")
        print(f"HTTP Error {e.code}: {error_body}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # CLI boundary: any other failure (network, timeout, malformed JSON,
        # unexpected response shape) becomes a one-line error and exit 1.
        # SystemExit raised above is not an Exception, so it passes through.
        print(f"Error fetching from Diffbot: {str(e)}", file=sys.stderr)
        sys.exit(1)
def main() -> None:
    """Command-line entry point: fetch one URL and print it as Markdown.

    Reads the target URL from ``sys.argv[1]``, fetches it with
    ``fetch_diffbot`` and writes to stdout a ``#`` heading with the title,
    an optional ``Author | Date | Site`` line (only the fields that are
    non-empty), a ``---`` rule and the article text.

    Raises:
        SystemExit: With code 1 when no URL argument is supplied.
    """
    if len(sys.argv) < 2:
        # Usage errors go to stderr, like every other error in this script,
        # so a failed invocation never writes into piped stdout.
        print("Usage: python fetch.py <url>", file=sys.stderr)
        sys.exit(1)
    url = sys.argv[1]

    if sys.platform == "win32":
        # Force UTF-8 output on Windows (presumably because the default
        # console encoding cannot represent every character in article text).
        sys.stdout.reconfigure(encoding="utf-8")
        sys.stderr.reconfigure(encoding="utf-8")

    result = fetch_diffbot(url)

    print(f"# {result['title']}\n")

    # (label shown to the reader, key in the result dict); empty values are skipped.
    labels = (("Author", "author"), ("Date", "date"), ("Site", "siteName"))
    meta = [f"**{label}:** {result[key]}" for label, key in labels if result[key]]
    if meta:
        print(" | ".join(meta) + "\n")

    print("---\n")
    print(result["text"])
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()