diff --git a/fsspec/implementations/github.py b/fsspec/implementations/github.py index 3650b8eba..b624ca522 100644 --- a/fsspec/implementations/github.py +++ b/fsspec/implementations/github.py @@ -1,6 +1,6 @@ -import requests +import base64 -import fsspec +import requests from ..spec import AbstractFileSystem from ..utils import infer_storage_options @@ -16,8 +16,10 @@ class GithubFileSystem(AbstractFileSystem): repository. You may specify a point in the repos history, by SHA, branch or tag (default is current master). - Given that code files tend to be small, and that github does not support - retrieving partial content, we always fetch whole files. + For files less than 1 MB in size, file content is returned directly in a + MemoryFile. For larger files, or for files tracked by git-lfs, file content + is returned as an HTTPFile wrapping the ``download_url`` provided by the + GitHub API. When using fsspec.open, allows URIs of the form: @@ -36,7 +38,7 @@ class GithubFileSystem(AbstractFileSystem): """ url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}" - rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}" + content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}" protocol = "github" timeout = (60, 60) # connect, read timeouts @@ -63,6 +65,12 @@ def __init__( self.root = sha self.ls("") + try: + from .http import HTTPFileSystem + + self.http_fs = HTTPFileSystem(**kwargs) + except ImportError: + self.http_fs = None @property def kw(self): @@ -212,28 +220,48 @@ def _open( path, mode="rb", block_size=None, - autocommit=True, cache_options=None, sha=None, **kwargs, ): if mode != "rb": raise NotImplementedError - url = self.rurl.format( + + # construct a url to hit the GitHub API's repo contents API + url = self.content_url.format( org=self.org, repo=self.repo, path=path, sha=sha or self.root ) + + # make a request to this API, and parse the response as JSON r = requests.get(url, timeout=self.timeout, **self.kw) if 
r.status_code == 404: raise FileNotFoundError(path) r.raise_for_status() - return MemoryFile(None, None, r.content) - - def cat(self, path, recursive=False, on_error="raise", **kwargs): - paths = self.expand_path(path, recursive=recursive) - urls = [ - self.rurl.format(org=self.org, repo=self.repo, path=u, sha=self.root) - for u, sh in paths - ] - fs = fsspec.filesystem("http") - data = fs.cat(urls, on_error="return") - return {u: v for ((k, v), u) in zip(data.items(), urls)} + content_json = r.json() + + # if the response's content key is not empty, try to parse it as base64 + if content_json["content"]: + content = base64.b64decode(content_json["content"]) + + # as long as the content does not start with the string + # "version https://git-lfs.github.com/" + # then it is probably not a git-lfs pointer and we can just return + # the content directly + if not content.startswith(b"version https://git-lfs.github.com/"): + return MemoryFile(None, None, content) + + # we land here if the content was not present in the first response + # (regular file over 1MB or git-lfs tracked file) + # in this case, we let the HTTPFileSystem handle the download + if self.http_fs is None: + raise ImportError( + "Please install fsspec[http] to access github files >1 MB " + "or git-lfs tracked files." 
+ ) + return self.http_fs.open( + content_json["download_url"], + mode=mode, + block_size=block_size, + cache_options=cache_options, + **kwargs, + ) diff --git a/fsspec/implementations/tests/test_github.py b/fsspec/implementations/tests/test_github.py new file mode 100644 index 000000000..32df94f74 --- /dev/null +++ b/fsspec/implementations/tests/test_github.py @@ -0,0 +1,48 @@ +import fsspec + + +def test_github_open_small_file(): + # test opening a small file <1 MB + with fsspec.open("github://mwaskom:seaborn-data@4e06bf0/penguins.csv") as f: + assert f.readline().startswith(b"species,island") + + +def test_github_open_large_file(): + # test opening a large file >1 MB + # use block_size=0 to get a streaming interface to the file, ensuring that + # we fetch only the parts we need instead of downloading the full file all + # at once + with fsspec.open( + "github://mwaskom:seaborn-data@83bfba7/brain_networks.csv", block_size=0 + ) as f: + # read only the first 20 bytes of the file + assert f.read(20) == b"network,1,1,2,2,3,3," + + +def test_github_open_lfs_file(): + # test opening a git-lfs tracked file + with fsspec.open( + "github://cBioPortal:datahub@55cd360" + "/public/acc_2019/data_gene_panel_matrix.txt", + block_size=0, + ) as f: + assert f.read(19) == b"SAMPLE_ID\tmutations" + + +def test_github_cat(): + # test using cat to fetch the content of multiple files + fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data") + paths = ["penguins.csv", "mpg.csv"] + cat_result = fs.cat(paths) + assert set(cat_result.keys()) == {"penguins.csv", "mpg.csv"} + assert cat_result["penguins.csv"].startswith(b"species,island") + assert cat_result["mpg.csv"].startswith(b"mpg,cylinders") + + +def test_github_ls(): + # test using ls to list the files in a repository + fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data") + ls_result = set(fs.ls("")) + expected = {"brain_networks.csv", "mpg.csv", "penguins.csv", "README.md", "raw"} + # check if the 
listing contains all of the expected files + assert expected.issubset(ls_result)