Skip to content

rework github _open() implementation to support LFS #1810

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 17, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 33 additions & 16 deletions fsspec/implementations/github.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import requests
import base64

import fsspec
import requests

from ..spec import AbstractFileSystem
from ..utils import infer_storage_options
from .http import HTTPFileSystem
from .memory import MemoryFile

# TODO: add GIST backend, would be very similar
Expand Down Expand Up @@ -36,7 +37,7 @@ class GithubFileSystem(AbstractFileSystem):
"""

url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
protocol = "github"
timeout = (60, 60) # connect, read timeouts

Expand All @@ -63,6 +64,7 @@ def __init__(

self.root = sha
self.ls("")
self.http_fs = HTTPFileSystem(**kwargs)

@property
def kw(self):
Expand Down Expand Up @@ -212,28 +214,43 @@ def _open(
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
sha=None,
**kwargs,
):
if mode != "rb":
raise NotImplementedError
url = self.rurl.format(

# construct a url to hit the GitHub API's repo contents API
url = self.content_url.format(
org=self.org, repo=self.repo, path=path, sha=sha or self.root
)

# make a request to this API, and parse the response as JSON
r = requests.get(url, timeout=self.timeout, **self.kw)
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
return MemoryFile(None, None, r.content)

def cat(self, path, recursive=False, on_error="raise", **kwargs):
paths = self.expand_path(path, recursive=recursive)
urls = [
self.rurl.format(org=self.org, repo=self.repo, path=u, sha=self.root)
for u, sh in paths
]
fs = fsspec.filesystem("http")
data = fs.cat(urls, on_error="return")
return {u: v for ((k, v), u) in zip(data.items(), urls)}
content_json = r.json()

# if the response's content key is not empty, try to parse it as base64
if content_json["content"]:
content = base64.b64decode(content_json["content"])

# as long as the content does not start with the string
# "version https://git-lfs.github.com/"
# then it is probably not a git-lfs pointer and we can just return
# the content directly
if not content.startswith(b"version https://git-lfs.github.com/"):
return MemoryFile(None, None, content)

# we land here if the content was not present in the first response
# (regular file over 1MB or git-lfs tracked file)
# in this case, we let the HTTPFileSystem handle the download
return self.http_fs.open(
content_json["download_url"],
mode=mode,
block_size=block_size,
cache_options=cache_options,
**kwargs,
)
48 changes: 48 additions & 0 deletions fsspec/implementations/tests/test_github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import fsspec


def test_github_open_small_file():
    # A file under 1 MB is returned inline (base64) by the GitHub contents
    # API, so opening it needs no extra HTTP round-trip.
    url = "github://mwaskom:seaborn-data@4e06bf0/penguins.csv"
    with fsspec.open(url) as f:
        header = f.readline()
    assert header.startswith(b"species,island")


def test_github_open_large_file():
    # Files over 1 MB are not inlined by the contents API and must be
    # downloaded via their download_url.  Passing block_size=0 yields a
    # streaming file object, so only the bytes actually requested are
    # fetched rather than the whole file.
    url = "github://mwaskom:seaborn-data@83bfba7/brain_networks.csv"
    with fsspec.open(url, block_size=0) as f:
        head = f.read(20)
    # only the first 20 bytes of the file are read
    assert head == b"network,1,1,2,2,3,3,"


def test_github_open_lfs_file():
    # A git-lfs tracked file: the contents API returns a pointer stub, and
    # the implementation must follow it to the real content.
    url = (
        "github://cBioPortal:datahub@55cd360"
        "/public/acc_2019/data_gene_panel_matrix.txt"
    )
    with fsspec.open(url, block_size=0) as f:
        head = f.read(19)
    assert head == b"SAMPLE_ID\tmutations"


def test_github_cat():
    # cat() should fetch the contents of several files at once, returning a
    # mapping keyed by path.
    fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data")
    result = fs.cat(["penguins.csv", "mpg.csv"])
    assert set(result) == {"penguins.csv", "mpg.csv"}
    assert result["penguins.csv"].startswith(b"species,island")
    assert result["mpg.csv"].startswith(b"mpg,cylinders")


def test_github_ls():
    # test using ls to list the files in a repository
    fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data")
    ls_result = set(fs.ls(""))
    expected = {"brain_networks.csv", "mpg.csv", "penguins.csv", "README.md", "raw"}
    # check that the expected entries are a subset of the listing result
    # (the repo may gain files over time, so an exact match is not required)
    assert expected.issubset(ls_result)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ full = [
fuse = ["fusepy"]
gcs = ["gcsfs"]
git = ["pygit2"]
github = ["requests"]
github = ["fsspec[http]", "requests"]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added the http extra here since now the github backend depends on HTTPFileSystem which requires aiohttp.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can a package depend on itself? Maybe list aiohttp here instead.

But actually it's optional - for normal small files, you don't need it. Do you think there's an easy way to defer the import and use of HTTPFileSystem (+ appropriate error message if it fails)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will defer to you on this. In my experience, I have used this syntax to specify that one extra depends on another extra. But in this case, the http extra has only one dependency (aiohttp), and new deps are probably added to that extra very rarely, so the degree of "extra repetition" and "risk of forgetting to add any new http dep to the github extra too" is low.

But you also bring up a good point that we could just make it optional by deferring the import.

I took a stab at this in 8e7fb56 - let me know what you think about this approach!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One quirk with 8e7fb56 is that it does not create an HTTPFileSystem at the instance level to be re-used for all file downloads from this GithubFileSystem. Instead it creates a new HTTPFileSystem for every large file to be downloaded.

It's possible to rework this so that self.http_fs is set to either None or HTTPFileSystem(**kwargs) depending on whether or not the import succeeds, and then we only raise the exception when a large-file download is attempted. Let me know if this would be preferred.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I went ahead and tried this out in 65e09cf - let me know what you think!

gs = ["gcsfs"]
gui = ["panel"]
hdfs = ["pyarrow >= 1"]
Expand Down