 # VulnerableCode is a free software code scanning tool from nexB Inc. and others.
 # Visit https://github.com/nexB/vulnerablecode/ for support and download.

+import dataclasses
+import os
+import shutil
+import tempfile
 from datetime import datetime
+from pathlib import Path
 from typing import Any
 from typing import ContextManager
+from typing import List
 from typing import Mapping
 from typing import Optional
 from typing import Sequence
-import dataclasses
+from typing import Set

+import pygit2
 from packageurl import PackageURL


 @dataclasses.dataclass
+class Advisory:
+    """
+    This data class expresses the contract between data sources and the import runner.
+    Data sources are expected to be usable as context managers and generators, yielding batches of Advisory sequences.
+
+    NB: There are two representations for package URLs that are commonly used by code consuming this data class;
+    PackageURL objects and strings. As a convention, the former is referred to in variable names, etc. as
+    "package_urls" and the latter as "purls".
+    """
+    summary: str
+    impacted_package_urls: Sequence[PackageURL]
+    resolved_package_urls: Sequence[PackageURL] = dataclasses.field(default_factory=list)
+    references: Sequence[str] = dataclasses.field(default_factory=list)
+    cve_id: Optional[str] = None
+
+    @property
+    def impacted_purls(self) -> Set[str]:
+        return {str(p) for p in self.impacted_package_urls}
+
+    @property
+    def resolved_purls(self) -> Set[str]:
+        return {str(p) for p in self.resolved_package_urls}
+
+
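To make the contract concrete, here is a minimal usage sketch (not part of the change itself); the package, reference URL and CVE id are invented for illustration:

# Hypothetical example: building an Advisory and reading back its purls.
from packageurl import PackageURL

advisory = Advisory(
    summary='Remote code execution in example-lib before 1.2.3',
    impacted_package_urls=[PackageURL(type='pypi', name='example-lib', version='1.2.2')],
    resolved_package_urls=[PackageURL(type='pypi', name='example-lib', version='1.2.3')],
    references=['https://example.com/advisories/EXAMPLE-2020-0001'],
    cve_id='CVE-2020-0001',
)
assert advisory.impacted_purls == {'pkg:pypi/example-lib@1.2.2'}
assert advisory.resolved_purls == {'pkg:pypi/example-lib@1.2.3'}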
+class InvalidConfigurationError(Exception):
+    pass
+
+
+@dataclasses.dataclass
+class DataSourceConfiguration:
+    batch_size: int
+
+
 class DataSource(ContextManager):
     """
     This class defines how importers consume advisories from a data source.

     It makes a distinction between newly added records since the last run and modified records. This allows the import
     logic to pick appropriate database operations.
     """
-    batch_size: int
-    cutoff_date: Optional[datetime] = None
-    config: Optional[Mapping[str, Any]] = dataclasses.field(default_factory=dict)
+
+    CONFIG_CLASS = DataSourceConfiguration
+
+    def __init__(
+        self,
+        batch_size: int,
+        last_run_date: Optional[datetime] = None,
+        cutoff_date: Optional[datetime] = None,
+        config: Optional[Mapping[str, Any]] = None,
+    ):
+        """
+        Create a DataSource instance.
+
+        :param batch_size: Maximum number of records to return from added_advisories() and updated_advisories()
+        :param last_run_date: Optional timestamp when this data source was last inspected
+        :param cutoff_date: Optional timestamp; records older than this will be ignored
+        :param config: Optional dictionary with subclass-specific configuration
+        """
+        config = config or {}
+        try:
+            self.config = self.__class__.CONFIG_CLASS(batch_size, **config)
+            # These really should be declared in DataSourceConfiguration above, but that would prevent DataSource
+            # subclasses from declaring mandatory parameters (i.e. positional arguments).
+            setattr(self.config, 'last_run_date', last_run_date)
+            setattr(self.config, 'cutoff_date', cutoff_date)
+        except Exception as e:
+            raise InvalidConfigurationError(str(e))
+
+        self.validate_configuration()

     def __enter__(self):
         """
         Subclasses acquire per-run resources, such as network connections, file downloads, etc. here.
         """
-        return self
+        pass

     def __exit__(self, exc_type, exc_val, exc_tb):
         """
         Subclasses release per-run resources acquired in __enter__() here.
         """
         pass

-    def added_advisories(self):
+    def validate_configuration(self) -> None:
+        """
+        Subclasses can perform more complex validation than what is handled by data classes and their type annotations.
+
+        This method is called in the constructor. It should raise InvalidConfigurationError with a human-readable
+        message.
+        """
+        pass
+
+    def added_advisories(self) -> List[Advisory]:
         """
         Subclasses yield batch_size sized batches of Advisory objects that have been added to the data source
         since self.cutoff_date.
         """
         raise StopIteration

-    def updated_advisories(self):
+    def updated_advisories(self) -> List[Advisory]:
         """
         Subclasses yield batch_size sized batches of Advisory objects that have been modified since
         self.cutoff_date.

         NOTE: Data sources that do not enable detection of changes to existing records vs added records must only
-        implement this method, not new_records(). The ImportRunner relies on this contract to decide between
+        implement this method, not added_advisories(). The ImportRunner relies on this contract to decide between
         insert and update operations.
         """
         raise StopIteration

+    def error(self, msg: str) -> None:
+        """
+        Helper method for raising InvalidConfigurationError with the class name in the message.
+        """
+        raise InvalidConfigurationError(f'{type(self).__name__}: {msg}')
+
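Since the base class only defines the contract, a hedged sketch of a concrete subclass may help; ExampleDataSource and its _fetch_records() helper are invented here. Per the NOTE in updated_advisories(), a source that cannot tell added from modified records implements only that method, yielding batch_size sized lists:

# Hypothetical subclass, for illustration only.
class ExampleDataSource(DataSource):

    def updated_advisories(self):
        batch = []
        for record in self._fetch_records():  # hypothetical helper returning dicts
            batch.append(Advisory(
                summary=record['summary'],
                impacted_package_urls=[PackageURL.from_string(p) for p in record['impacted']],
                cve_id=record.get('cve_id'),
            ))
            if len(batch) == self.config.batch_size:
                yield batch
                batch = []
        if batch:
            yield batch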

 @dataclasses.dataclass
-class Advisory:
-    """
-    This data class expresses the contract between data sources and the import runner.
-    Data sources are expected to be usable as context managers and generators, yielding batches of Advisory sequences.
+class GitDataSourceConfiguration(DataSourceConfiguration):
+    repository_url: str
+    branch: Optional[str] = None
+    create_working_directory: bool = True
+    remove_working_directory: bool = True
+    working_directory: Optional[str] = None

-    NB: There are two representations for package URLs that are commonly used by code consuming this data class;
-    PackageURL objects and strings. As a convention, the former is referred to in variable names, etc. as
-    "package_urls" and the latter as "purls".
-    """
-    summary: str
-    impacted_package_urls: Sequence[PackageURL]
-    resolved_package_urls: Sequence[PackageURL] = dataclasses.field(default_factory=list)
-    references: Sequence[str] = dataclasses.field(default_factory=list)
-    cve_id: Optional[str] = None

-    @property
-    def impacted_purls(self):
-        return {str(p) for p in self.impacted_package_urls}
+class GitDataSource(DataSource):
+    CONFIG_CLASS = GitDataSourceConfiguration

-    @property
-    def resolved_purls(self):
-        return {str(p) for p in self.resolved_package_urls}
+    def validate_configuration(self) -> None:
+
+        if not self.config.create_working_directory and self.config.working_directory is None:
+            self.error('"create_working_directory" is not set and "working_directory" is set to the default, which '
+                       'calls tempfile.mkdtemp()')
+
+        if not self.config.create_working_directory and not os.path.exists(self.config.working_directory):
+            self.error('"working_directory" does not contain an existing directory and "create_working_directory" is '
+                       'not set')
+
+        if not self.config.remove_working_directory and self.config.working_directory is None:
+            self.error('"remove_working_directory" is not set and "working_directory" is set to the default, which '
+                       'calls tempfile.mkdtemp()')
+
+    def __enter__(self):
+        self._ensure_working_directory()
+        self._ensure_repository()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.config.remove_working_directory:
+            shutil.rmtree(self.config.working_directory)
+
+    def added_advisories(self) -> List[Advisory]:
+        raise NotImplementedError
+
+    def updated_advisories(self) -> List[Advisory]:
+        raise NotImplementedError
+
+    # TODO Sort out cutoff_date vs last_run_date. The former means "no entries older than one year",
+    # TODO not "the importer was last run on".
+    def added_files(
+        self,
+        subdir: Optional[str] = None,
+        recursive: bool = False,
+        file_ext: Optional[str] = None,
+    ) -> List[str]:
+
+        if subdir is None:
+            working_dir = self.config.working_directory
+        else:
+            working_dir = os.path.join(self.config.working_directory, subdir)
+
+        path = Path(working_dir)
+
+        if self.config.cutoff_date is None:
+            # Without a cutoff date, report every existing file as added.
+            glob = '**/*' if recursive else '*'
+
+            if file_ext:
+                glob = f'{glob}.{file_ext}'
+
+            return [str(p.relative_to(working_dir)) for p in path.glob(glob) if p.is_file()]
+
+        return self._collect_files(pygit2.GIT_DELTA_ADDED, subdir, recursive, file_ext)
+
+    def updated_files(
+        self,
+        subdir: Optional[str] = None,
+        recursive: bool = False,
+        file_ext: Optional[str] = None,
+    ) -> List[str]:
+
+        if self.config.cutoff_date is None:
+            return []
+
+        return self._collect_files(pygit2.GIT_DELTA_MODIFIED, subdir, recursive, file_ext)
+
+    # TODO Just filtering on the two status values for "added" and "modified" is too simplistic.
+    # TODO This does not cover file renames, copies & deletions.
+    def _collect_files(
+        self,
+        delta_status: int,
+        subdir: Optional[str],
+        recursive: bool,
+        file_ext: Optional[str],
+    ) -> List[str]:
+
+        cutoff = 0 if self.config.cutoff_date is None else int(self.config.cutoff_date.timestamp())
+        previous_commit = None
+        files = []
+
+        for commit in self._repo.walk(self._repo.head.target, pygit2.GIT_SORT_TIME):
+            if previous_commit is None:
+                previous_commit = commit
+                continue
+
+            deltas = commit.tree.diff_to_tree(previous_commit.tree).deltas
+            for d in deltas:
+                path = d.new_file.path
+
+                if d.status == delta_status and not d.is_binary and _include_file(path, subdir, recursive, file_ext):
+                    files.append(path)
+
+            if commit.commit_time < cutoff:
+                break
+
+            previous_commit = commit
+
+        return files
+
+    def _ensure_working_directory(self) -> None:
+        if self.config.working_directory is None:
+            self.config.working_directory = tempfile.mkdtemp()
+        elif self.config.create_working_directory and not os.path.exists(self.config.working_directory):
+            os.mkdir(self.config.working_directory)
+
+    def _ensure_repository(self) -> None:
+        repodir = pygit2.discover_repository(self.config.working_directory)
+        if repodir is None:
+            self._clone_repository()
+            return
+
+        self._repo = pygit2.Repository(repodir)
+
+        if self.config.branch is None:
+            self.config.branch = self._repo.head.shorthand
+        branch = self._repo.branches[self.config.branch]
+
+        if not branch.is_checked_out():
+            self._repo.checkout(branch)
+
+        remote = self._find_or_add_remote()
+        progress = remote.fetch()
+        if progress.received_objects == 0:
+            return
+
+        remote_branch = self._repo.branches[f'{remote.name}/{self.config.branch}']
+        branch.set_target(remote_branch.target)
+        self._repo.checkout(branch, strategy=pygit2.GIT_CHECKOUT_FORCE)
+
+    def _clone_repository(self):
+        kwargs = {}
+        if self.config.branch:
+            kwargs['checkout_branch'] = self.config.branch
+
+        self._repo = pygit2.clone_repository(self.config.repository_url, self.config.working_directory, **kwargs)
+
+    def _find_or_add_remote(self):
+        remote = None
+        for r in self._repo.remotes:
+            if r.url == self.config.repository_url:
+                remote = r
+                break
+
+        if remote is None:
+            remote = self._repo.remotes.create('added_by_vulnerablecode', self.config.repository_url)
+
+        return remote
+
+
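As a usage sketch under stated assumptions (MySource and the repository URL are hypothetical, and a concrete subclass would still have to turn files into Advisory objects), the intended flow is roughly:

# Hypothetical driver code, for illustration only.
source = MySource(
    batch_size=100,
    config={'repository_url': 'https://example.com/advisory-db.git'},
)
with source:  # __enter__() prepares the working directory and clones or fetches the repository
    for path in source.added_files(subdir='advisories', recursive=True, file_ext='yaml'):
        ...  # parse each returned file and build Advisory objects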
+def _include_file(
+    path: str,
+    subdir: Optional[str] = None,
+    recursive: bool = False,
+    file_ext: Optional[str] = None,
+) -> bool:
+    match = True
+
+    if subdir:
+        if not subdir.endswith(os.path.sep):
+            subdir = f'{subdir}{os.path.sep}'
+
+        match = match and path.startswith(subdir)
+
+    if not recursive:
+        # Exclude files nested deeper than the (optional) subdirectory.
+        match = match and (os.path.sep not in path[len(subdir or ''):])
+
+    if file_ext:
+        match = match and path.endswith(f'.{file_ext}')
+
+    return match
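For clarity, a few hypothetical checks of the filter's behaviour, assuming POSIX path separators (the file names are invented):

# _include_file() matches files directly inside subdir unless recursive is set.
assert _include_file('advisories/CVE-2020-0001.yaml', subdir='advisories', file_ext='yaml')
assert not _include_file('advisories/2020/CVE-2020-0001.yaml', subdir='advisories', file_ext='yaml')
assert _include_file('advisories/2020/CVE-2020-0001.yaml', subdir='advisories', recursive=True, file_ext='yaml')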