Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,22 @@ paper.download_source(filename="downloaded-paper.tar.gz")
paper.download_source(dirpath="./mydir", filename="downloaded-paper.tar.gz")
```

You can also download HTML versions of papers (when available):

```python
import arxiv

paper = next(arxiv.Client().results(arxiv.Search(id_list=["1605.08386v1"])))
# Download the HTML to the PWD with a default filename.
paper.download_html()
# Download the HTML to the PWD with a custom filename.
paper.download_html(filename="downloaded-paper.html")
# Download the HTML to a specified directory with a custom filename.
paper.download_html(dirpath="./mydir", filename="downloaded-paper.html")
```

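**Note:** HTML versions are not available for all arXiv papers; the feature is primarily available for newer papers submitted in certain formats. When no HTML version exists, `download_html` does not return a path: the underlying download error (typically a `urllib.error.HTTPError`) is raised to the caller.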

#### Fetching results with a custom client

```python
Expand Down Expand Up @@ -119,7 +135,7 @@ DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): export.arxiv.org
DEBUG:urllib3.connectionpool:https://export.arxiv.org:443 "GET /api/query?search_query=&id_list=1605.08386v1&sortBy=relevance&sortOrder=descending&start=0&max_results=100&user-agent=arxiv.py%2F1.4.8 HTTP/1.1" 200 979
```

## Types
## Types

### Client

Expand All @@ -137,4 +153,4 @@ The `Result` objects yielded by `Client.results` include metadata about each pap

The meaning of the underlying raw data is documented in the [arXiv API User Manual: Details of Atom Results Returned](https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned).

`Result` also exposes helper methods for downloading papers: `Result.download_pdf` and `Result.download_source`.
`Result` also exposes helper methods for downloading papers: `Result.download_pdf`, `Result.download_source`, and `Result.download_html`.
25 changes: 25 additions & 0 deletions arxiv/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,31 @@ def download_source(
written_path, _ = urlretrieve(src_url, path)
return written_path

def download_html(
    self,
    dirpath: str = "./",
    filename: str = "",
    download_domain: str = "export.arxiv.org",
) -> str:
    """
    Downloads the HTML version for this result to the specified
    directory and returns the path of the written file.

    If `filename` is empty, a default one is generated by calling
    `self._get_default_filename("html")`.

    The HTML URL is derived from this result's PDF URL: the PDF URL is
    passed through `Result._substitute_domain` together with
    `download_domain`, and its `/pdf/` path segment is then replaced
    with `/html/`.

    Note: HTML versions are not available for all arXiv papers.
    This feature is primarily available for newer papers submitted
    in certain formats. No error is handled here: anything raised by
    `urlretrieve` (for example when no HTML version exists)
    propagates to the caller.
    """
    if not filename:
        filename = self._get_default_filename("html")
    path = os.path.join(dirpath, filename)
    # There is no dedicated HTML link on the result, so construct the
    # HTML URL from the (domain-substituted) PDF URL.
    html_url = Result._substitute_domain(self.pdf_url, download_domain).replace(
        "/pdf/", "/html/"
    )
    written_path, _ = urlretrieve(html_url, path)
    return written_path

def _get_pdf_url(links: List[Link]) -> str:
"""
Finds the PDF link among a result's links and returns its URL.
Expand Down
25 changes: 25 additions & 0 deletions tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,32 @@ def test_download_tarfile_from_query(self):
)
)

def test_download_html_from_query(self):
    """
    Download the HTML version with the default filename and check that
    the file exists in the temporary directory.

    The test is skipped (not silently passed) when the server answers
    with an HTTP error, since HTML versions are not available for all
    papers. The existence assertion lives outside the `try` so that a
    failed assertion is reported instead of being swallowed.
    """
    # Imported locally so the test does not depend on the module's
    # top-of-file import block.
    from urllib.error import HTTPError

    try:
        self.fetched_result.download_html(dirpath=self.temp_dir)
    except HTTPError as err:
        # HTML version may not be available for all papers.
        # This is expected for older papers or papers without HTML export.
        self.skipTest("HTML version not available: {}".format(err))
    self.assertTrue(
        os.path.exists(
            os.path.join(
                self.temp_dir,
                "1605.08386v1.Heat_bath_random_walks_with_Markov_bases.html",
            )
        )
    )

def test_download_with_custom_slugify_from_query(self):
    """
    Download the PDF under a caller-supplied filename and check that a
    file with exactly that name exists in the temporary directory.
    """
    custom_name = "custom-filename.extension"
    self.fetched_result.download_pdf(filename=custom_name, dirpath=self.temp_dir)
    expected_path = os.path.join(self.temp_dir, custom_name)
    self.assertTrue(os.path.exists(expected_path))

def test_download_html_with_custom_filename(self):
    """
    Download the HTML version under a caller-supplied filename and
    check that a file with exactly that name exists.

    The test is skipped (not silently passed) when the server answers
    with an HTTP error, since HTML versions are not available for all
    papers. The existence assertion lives outside the `try` so that a
    failed assertion is reported instead of being swallowed.
    """
    # Imported locally so the test does not depend on the module's
    # top-of-file import block.
    from urllib.error import HTTPError

    fn = "custom-html-filename.html"
    try:
        self.fetched_result.download_html(dirpath=self.temp_dir, filename=fn)
    except HTTPError as err:
        # HTML version may not be available for all papers.
        self.skipTest("HTML version not available: {}".format(err))
    self.assertTrue(os.path.exists(os.path.join(self.temp_dir, fn)))