rework github _open() implementation to support LFS #1810

Merged
merged 9 commits on Mar 17, 2025
42 changes: 28 additions & 14 deletions fsspec/implementations/github.py
@@ -1,6 +1,6 @@
import requests
import base64

import fsspec
import requests

from ..spec import AbstractFileSystem
from ..utils import infer_storage_options
@@ -36,7 +36,7 @@ class GithubFileSystem(AbstractFileSystem):
"""

url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
protocol = "github"
timeout = (60, 60) # connect, read timeouts

@@ -219,21 +219,35 @@ def _open(
):
if mode != "rb":
raise NotImplementedError
url = self.rurl.format(

# construct a url to hit the GitHub API's repo contents API
url = self.content_url.format(
org=self.org, repo=self.repo, path=path, sha=sha or self.root
)

# make a request to this API, and parse the response as JSON
r = requests.get(url, timeout=self.timeout, **self.kw)
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
content_json = r.json()

# if the response's content key is not empty, try to parse it as base64
if content_json["content"]:
content = base64.b64decode(content_json["content"])

# as long as the content does not start with the string
# "version https://git-lfs.github.com/"
# then it is probably not a git-lfs pointer and we can just return
# the content directly
if not content.startswith(b"version https://git-lfs.github.com/"):
return MemoryFile(None, None, content)

# we land here if the content was not present in the first response
# (regular file over 1MB or git-lfs tracked file)
# in this case, we get the content from the download_url
r = requests.get(content_json["download_url"], timeout=self.timeout, **self.kw)
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
return MemoryFile(None, None, r.content)

def cat(self, path, recursive=False, on_error="raise", **kwargs):
paths = self.expand_path(path, recursive=recursive)
urls = [
self.rurl.format(org=self.org, repo=self.repo, path=u, sha=self.root)
for u, sh in paths
]
fs = fsspec.filesystem("http")
data = fs.cat(urls, on_error="return")
return {u: v for ((k, v), u) in zip(data.items(), urls)}
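
For reference, a rough illustration (not part of the diff) of the two cases _open() distinguishes: the shape of a contents-API response for a small file, and the text of a git-lfs pointer. The field values, oid, and size below are made up for illustration.

import base64

# Illustrative contents-API response for a small file: "content" carries the
# file body base64-encoded, and "download_url" points at the raw file.
contents_response = {
    "encoding": "base64",
    "content": base64.b64encode(b"species,island\n").decode(),
    "download_url": "https://raw.githubusercontent.com/org/repo/sha/path",
    "size": 15,
}

# A git-lfs tracked file stores only a small pointer in the repo; its decoded
# content starts with the version line that _open() checks for.
lfs_pointer = (
    b"version https://git-lfs.github.com/spec/v1\n"
    b"oid sha256:4d7a2146...\n"
    b"size 132429\n"
)

decoded = base64.b64decode(contents_response["content"])
print(decoded.startswith(b"version https://git-lfs.github.com/"))      # False: plain file, return directly
print(lfs_pointer.startswith(b"version https://git-lfs.github.com/"))  # True: fetch the download_url instead
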
41 changes: 41 additions & 0 deletions fsspec/implementations/tests/test_github.py
@@ -0,0 +1,41 @@
import fsspec


def test_github_open_small_file():
# test opening a small file <1 MB
with fsspec.open("github://mwaskom:seaborn-data@4e06bf0/penguins.csv") as f:
assert f.readline().startswith(b"species,island")


def test_github_open_large_file():
# test opening a large file >1 MB
with fsspec.open("github://mwaskom:seaborn-data@83bfba7/brain_networks.csv") as f:
assert f.readline().startswith(b"network,1,1,2,2")
Member

Question: this will download the whole file in one shot, even though it's "large". Do you think we can generate a buffered file (like an HTTPFile), given that we expect the remote to accept HTTP range requests?

Contributor Author

This sounds like a great idea, thank you for mentioning it. I will rework the implementation and then rewrite this test to use a range request so that it avoids fetching the whole file over the network.
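
For illustration only, a hypothetical shape such a range-style test could take; the test name and the slice read here are assumptions, not part of the PR.

import fsspec


def test_github_open_large_file_range():
    # read only a small slice of the large file; a buffered file should fetch
    # just the bytes it needs rather than the whole payload
    with fsspec.open("github://mwaskom:seaborn-data@83bfba7/brain_networks.csv") as f:
        f.seek(8)
        assert f.read(8)  # a non-empty slice came back
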

Contributor Author

Do you have thoughts on whether it would also be important to introduce other concepts from the http implementation, such as support for async, reuse of an aiohttp ClientSession across the multiple calls to the GitHub API, etc.? Or would it be fine to just use HTTPFile and ensure that range requests are supported, but otherwise not reach for full feature parity with the http implementation in this first PR?

Contributor Author

Maybe my question doesn't make sense to ask if the relevant async support is already "enabled" just by virtue of the asynchronous kwarg on the HTTPFile constructor. I should probably take a closer look before asking 😅.

Member

I did not think about the async-ness. We probably don't want this for github, since there is a fairly onerous API rate limit, even if you come with a developer token.

Contributor Author

In a751042 I tweaked _open() to return an HTTPFile wrapping the download_url in cases where the initial response from the GitHub API does not include the file content already. I believe this should enable streaming and range queries.
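
For illustration, a minimal sketch of that fallback, assuming it simply delegates the download_url to fsspec's http filesystem; the exact code in a751042 may differ.

import fsspec


def _open_via_download_url(download_url, block_size=None, **kwargs):
    # hand the raw download_url to the http filesystem; the file it returns
    # (an HTTPFile when the server accepts range requests) reads byte ranges
    # lazily instead of pulling the whole payload up front
    http_fs = fsspec.filesystem("http")
    return http_fs.open(download_url, mode="rb", block_size=block_size, **kwargs)
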



def test_github_open_lfs_file():
# test opening a git-lfs tracked file
with fsspec.open(
"github://cBioPortal:datahub@55cd360"
"/public/acc_2019/data_gene_panel_matrix.txt",
) as f:
assert f.readline().startswith(b"SAMPLE_ID\tmutations")


def test_github_cat():
# test using cat to fetch the content of multiple files
fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data")
paths = ["penguins.csv", "mpg.csv"]
cat_result = fs.cat(paths)
assert set(cat_result.keys()) == {"penguins.csv", "mpg.csv"}
assert cat_result["penguins.csv"].startswith(b"species,island")
assert cat_result["mpg.csv"].startswith(b"mpg,cylinders")


def test_github_ls():
# test using ls to list the files in a repository
fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data")
ls_result = set(fs.ls(""))
expected = {"brain_networks.csv", "mpg.csv", "penguins.csv", "README.md", "raw"}
# check that the expected files are a subset of the listing result
assert expected.issubset(ls_result)