Ferocious0xide committed
Commit addb0c2 · verified · 1 Parent(s): b43ab58

Update tools/arxiv_tool.py with API specs from Arxiv

Files changed (1)
  1. tools/arxiv_tool.py +45 -20
tools/arxiv_tool.py CHANGED
@@ -1,4 +1,6 @@
-import arxiv
+import urllib.request
+import urllib.parse
+import xml.etree.ElementTree as ET
 from datetime import datetime, timedelta
 import json
 import os
@@ -13,29 +15,51 @@ class ArxivSearchTool(Tool):
 
     def __call__(self, query: str = "artificial intelligence",
                  max_results: int = 50) -> List[Dict]:
+        """Search ArXiv using their API.
+
+        Args:
+            query: Search query string
+            max_results: Maximum number of results to return
+
+        Returns:
+            List[Dict]: List of paper results with metadata
+        """
         try:
-            # Configure the search client
-            client = arxiv.Client()
+            # Construct the API URL
+            base_url = 'http://export.arxiv.org/api/query?'
+            query_params = {
+                'search_query': query,
+                'start': 0,
+                'max_results': max_results
+            }
+
+            # Create the full URL
+            url = base_url + urllib.parse.urlencode(query_params)
+
+            # Make the request
+            response = urllib.request.urlopen(url)
+            data = response.read().decode('utf-8')
+
+            # Parse the Atom XML response
+            root = ET.fromstring(data)
 
-            # Create the search query
-            search = arxiv.Search(
-                query=query,
-                max_results=max_results,
-                sort_by=arxiv.SortCriterion.SubmittedDate
-            )
+            # Define the Atom namespaces
+            ns = {'atom': 'http://www.w3.org/2005/Atom',
+                  'arxiv': 'http://arxiv.org/schemas/atom'}
 
-            # Get results
             results = []
-            for paper in client.results(search):
+            for entry in root.findall('atom:entry', ns):
+                # Extract paper details
                 result = {
-                    'title': paper.title,
-                    'authors': [str(author) for author in paper.authors],
-                    'summary': paper.summary,
-                    'published': paper.published.strftime("%Y-%m-%d"),
-                    'pdf_url': paper.pdf_url,
-                    'entry_id': paper.entry_id,
-                    'primary_category': paper.primary_category,
-                    'categories': paper.categories
+                    'title': entry.find('atom:title', ns).text.strip(),
+                    'authors': [author.find('atom:name', ns).text
+                                for author in entry.findall('atom:author', ns)],
+                    'summary': entry.find('atom:summary', ns).text.strip() if entry.find('atom:summary', ns) is not None else '',
+                    'published': entry.find('atom:published', ns).text.strip(),
+                    'id': entry.find('atom:id', ns).text.strip(),
+                    'pdf_url': next((link.get('href') for link in entry.findall('atom:link', ns)
+                                     if link.get('type') == 'application/pdf'), None),
+                    'categories': [cat.get('term') for cat in entry.findall('atom:category', ns)]
                 }
                 results.append(result)
 
@@ -80,9 +104,10 @@ def save_daily_papers(output_dir: str = "daily_papers") -> List[Dict]:
         max_results=100
     )
 
+    # Filter for papers published today
     today_papers = [
        paper for paper in papers
-       if paper.get('published') == today
+       if paper.get('published', '').startswith(today)
     ]
 
    output_file = os.path.join(output_dir, f"ai_papers_{today}.json")
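
One behavioral note on this change: the removed arxiv.Search call sorted results by SubmittedDate, while the new query URL passes no sort parameter, so results fall back to the API's default relevance ordering. The sketch below shows the same standard-library request/parse flow with the arXiv API's documented sortBy/sortOrder parameters added to recover the old ordering. It is a minimal standalone sketch, not part of the commit; the fetch_latest helper name is illustrative.

# Standalone sketch (assumptions: standard library only; fetch_latest is a
# hypothetical helper, not defined in tools/arxiv_tool.py).
import urllib.request
import urllib.parse
import xml.etree.ElementTree as ET

def fetch_latest(query: str = "artificial intelligence", max_results: int = 5) -> None:
    # sortBy/sortOrder are documented arXiv API parameters; submittedDate with
    # descending order mirrors the old arxiv.SortCriterion.SubmittedDate behavior.
    params = urllib.parse.urlencode({
        'search_query': query,
        'start': 0,
        'max_results': max_results,
        'sortBy': 'submittedDate',
        'sortOrder': 'descending',
    })
    with urllib.request.urlopen('http://export.arxiv.org/api/query?' + params) as resp:
        root = ET.fromstring(resp.read().decode('utf-8'))
    ns = {'atom': 'http://www.w3.org/2005/Atom'}
    for entry in root.findall('atom:entry', ns):
        published = entry.find('atom:published', ns).text.strip()
        title = entry.find('atom:title', ns).text.strip()
        print(f"{published[:10]}  {title}")

if __name__ == "__main__":
    fetch_latest()

An explicit sort would also help save_daily_papers: with the 100 fetched results ordered by submitted date descending, any papers published today sit at the front of the page, so the startswith(today) filter is far less likely to miss them than under relevance ranking.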