-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathretrieve_papers.py
More file actions
63 lines (52 loc) · 1.98 KB
/
retrieve_papers.py
File metadata and controls
63 lines (52 loc) · 1.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pickle
import asyncio
import requests
from bots import BotRegister
from bs4 import BeautifulSoup
from omegaconf import OmegaConf
from summarize_arxiv import summarize
# Load the run configuration. The keys read from it in this file are
# "criteria" (just below) and "lists" (inside main()).
cfg = OmegaConf.load('./config.yaml')
# Echo the loaded configuration to stdout.
print(cfg)
# Build the notifier from the config; main() calls bot.send_message(text=...) on it.
bot = BotRegister().make_bot(cfg)
# Keywords that main() matches, case-insensitively, as substrings of each paper title.
criteria = cfg["criteria"]
def load_seen(path: str = "./seen.pkl") -> set:
    """Return the set of already-processed links stored at *path*.

    A missing file is the normal first-run case and yields an empty set.
    Any other failure to read or unpickle the file is reported on stdout and
    also yields an empty set, so a damaged cache never stops the script.

    Args:
        path: Location of the pickle file that main() writes at the end of a run.

    Returns:
        The unpickled set, or an empty set when nothing usable could be loaded.
    """
    try:
        # NOTE: pickle can execute arbitrary code while loading; this is only
        # acceptable because the file is produced by this script itself.
        with open(path, "rb") as fh:
            loaded = pickle.load(fh)
    except FileNotFoundError:
        return set()
    except Exception as exc:  # best-effort cache: report the problem, then start fresh
        print(f"Could not load {path} ({exc!r}); starting with an empty seen set")
        return set()
    # Anything other than a set cannot be used by main() (it calls seen.add).
    return loaded if isinstance(loaded, set) else set()


# Links handled in previous runs; main() skips these and adds new ones.
seen: set = load_seen("./seen.pkl")
def _first_href(entry):
    """Return the first non-empty ``href`` among the <a> tags of *entry*, or None."""
    for anchor in entry.find_all("a"):
        link = anchor.get("href")
        if link:
            return link
    return None


def _matches_criteria(title_text, keywords):
    """Return True when any of *keywords* occurs in *title_text*, ignoring case."""
    lowered = title_text.lower()
    return any(kw.lower() in lowered for kw in keywords)


# Walk every configured arXiv listing and notify the bot about new matching papers
async def main():
    """Scan each listing in ``cfg["lists"]`` and send matching papers to ``bot``.

    For every <dt>/<dd> pair on a listing page, the first link in the <dt> is
    taken as the paper link and the first ``div.list-title`` in the <dd> as
    its title. A paper whose link is not yet in ``seen`` and whose title
    contains one of ``criteria`` is summarized and sent through
    ``bot.send_message``. Every link encountered is added to ``seen``, and the
    set is written to ``./seen.pkl`` after all listings have been processed.

    A listing that cannot be fetched is reported and skipped; entries without
    a link or a title are skipped as well.
    """
    import inspect  # local import: only needed for the awaitable check below

    for lst in cfg["lists"]:
        url = f"https://arxiv.org/list/{lst}/pastweek?show=496"
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Skipping {url}: {exc}")
            continue

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")
        # <dt> elements carry the links, the following <dd> elements the titles
        titles = soup.find_all("dt")
        descriptions = soup.find_all("dd")

        # zip() pairs each <dt> with its <dd> and stops at the shorter list,
        # so a mismatched page cannot raise IndexError.
        for title, description in zip(titles, descriptions):
            href = _first_href(title)
            spans = description.find_all("div", {"class": "list-title"})
            if href is None or not spans:
                # Without a link or a title there is nothing to report.
                continue
            if href in seen:
                continue

            title_text = spans[0].text.strip()
            if _matches_criteria(title_text, criteria):
                print(f"Processing {title_text}")
                title_link = "https://arxiv.org" + href
                print(f"<b>{title_text}</b>\nLink: {title_link}\n")
                print(href)
                # The last path component of the link is what summarize() receives.
                summary = summarize(href.split("/")[-1])
                message = f"<b>{title_text}</b>\n\n{summary}\n\nLink: {title_link}"
                print(message)
                sent = bot.send_message(text=message)
                # If the bot's send_message returns an awaitable, it must be
                # awaited or the message is never actually sent.
                if inspect.isawaitable(sent):
                    await sent
            seen.add(href)

    # Persist the seen set, closing the file deterministically.
    with open("./seen.pkl", "wb") as fh:
        pickle.dump(seen, fh)
# Script entry point: only runs when the file is executed directly.
if __name__ == '__main__':
    # Create the coroutine object, then let asyncio.run drive it to completion.
    scrape = main()
    asyncio.run(scrape)