 # VulnerableCode is a free software code scanning tool from nexB Inc. and others.
 # Visit https://github.com/nexB/vulnerablecode/ for support and download.

+import dataclasses
+import os
+import shutil
+import tempfile
 from datetime import datetime
+from pathlib import Path
 from typing import Any
 from typing import ContextManager
+from typing import List
 from typing import Mapping
 from typing import Optional
 from typing import Sequence
-import dataclasses
+from typing import Set

+import pygit2
 from packageurl import PackageURL


 @dataclasses.dataclass
+class Advisory:
+    """
+    This data class expresses the contract between data sources and the import runner.
+    Data sources are expected to be usable as context managers and generators, yielding batches of Advisory sequences.
+
+    NB: There are two representations for package URLs that are commonly used by code consuming this data class;
+    PackageURL objects and strings. As a convention, the former is referred to in variable names, etc. as
+    "package_urls" and the latter as "purls".
+    """
+    summary: str
+    impacted_package_urls: Sequence[PackageURL]
+    resolved_package_urls: Sequence[PackageURL] = dataclasses.field(default_factory=list)
+    references: Sequence[str] = dataclasses.field(default_factory=list)
+    cve_id: Optional[str] = None
+
+    @property
+    def impacted_purls(self) -> Set[str]:
+        return {str(p) for p in self.impacted_package_urls}
+
+    @property
+    def resolved_purls(self) -> Set[str]:
+        return {str(p) for p in self.resolved_package_urls}
+
+
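To make the contract concrete, here is a minimal usage sketch (not part of the change itself); the package, reference URL and CVE id are invented for illustration:

# Hypothetical example: building an Advisory and reading back its purls.
from packageurl import PackageURL

advisory = Advisory(
    summary='Remote code execution in example-lib before 1.2.3',
    impacted_package_urls=[PackageURL(type='pypi', name='example-lib', version='1.2.2')],
    resolved_package_urls=[PackageURL(type='pypi', name='example-lib', version='1.2.3')],
    references=['https://example.com/advisories/EXAMPLE-2020-0001'],
    cve_id='CVE-2020-0001',
)
assert advisory.impacted_purls == {'pkg:pypi/example-lib@1.2.2'}
assert advisory.resolved_purls == {'pkg:pypi/example-lib@1.2.3'}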
+class InvalidConfigurationError(Exception):
+    pass
+
+
+@dataclasses.dataclass
+class DataSourceConfiguration:
+    batch_size: int
+
+
 class DataSource(ContextManager):
     """
     This class defines how importers consume advisories from a data source.

     It makes a distinction between newly added records since the last run and modified records. This allows the import
     logic to pick appropriate database operations.
     """
-    batch_size: int
-    cutoff_date: Optional[datetime] = None
-    config: Optional[Mapping[str, Any]] = dataclasses.field(default_factory=dict)
+
+    CONFIG_CLASS = DataSourceConfiguration
+
+    def __init__(
+        self,
+        batch_size: int,
+        last_run_date: Optional[datetime] = None,
+        cutoff_date: Optional[datetime] = None,
+        config: Optional[Mapping[str, Any]] = None,
+    ):
+        """
+        Create a DataSource instance.
+
+        :param batch_size: Maximum number of records to return from added_advisories() and updated_advisories()
+        :param last_run_date: Optional timestamp when this data source was last inspected
+        :param cutoff_date: Optional timestamp; records older than this will be ignored
+        :param config: Optional dictionary with subclass-specific configuration
+        """
+        config = config or {}
+        try:
+            self.config = self.__class__.CONFIG_CLASS(batch_size, **config)
+            # These really should be declared in DataSourceConfiguration above, but that would prevent DataSource
+            # subclasses from declaring mandatory parameters (i.e. positional arguments).
+            setattr(self.config, 'last_run_date', last_run_date)
+            setattr(self.config, 'cutoff_date', cutoff_date)
+        except Exception as e:
+            raise InvalidConfigurationError(str(e))
+
+        self.validate_configuration()

     def __enter__(self):
         """
         Subclasses acquire per-run resources, such as network connections, file downloads, etc. here.
         """
-        return self
+        pass

     def __exit__(self, exc_type, exc_val, exc_tb):
         """
         Subclasses release per-run resources acquired in __enter__() here.
         """
         pass

-    def added_advisories(self):
+    def validate_configuration(self) -> None:
+        """
+        Subclasses can perform more complex validation than what is handled by data classes and their type annotations.
+
+        This method is called in the constructor. It should raise InvalidConfigurationError with a human-readable
+        message.
+        """
+        pass
+
+    def added_advisories(self) -> List[Advisory]:
         """
         Subclasses yield batch_size sized batches of Advisory objects that have been added to the data source
         since self.cutoff_date.
         """
         raise StopIteration

-    def updated_advisories(self):
+    def updated_advisories(self) -> List[Advisory]:
         """
         Subclasses yield batch_size sized batches of Advisory objects that have been modified since
         self.cutoff_date.

         NOTE: Data sources that do not enable detection of changes to existing records vs added records must only
-        implement this method, not new_records(). The ImportRunner relies on this contract to decide between
+        implement this method, not added_advisories(). The ImportRunner relies on this contract to decide between
         insert and update operations.
         """
         raise StopIteration

+    def error(self, msg: str) -> None:
+        """
+        Helper method for raising InvalidConfigurationError with the class name in the message.
+        """
+        raise InvalidConfigurationError(f'{type(self).__name__}: {msg}')
+
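Since the base class only defines the contract, a hedged sketch of a concrete subclass may help; ExampleDataSource and its _fetch_records() helper are invented here. Per the NOTE in updated_advisories(), a source that cannot tell added from modified records implements only that method, yielding batch_size sized lists:

# Hypothetical subclass, for illustration only.
class ExampleDataSource(DataSource):

    def updated_advisories(self):
        batch = []
        for record in self._fetch_records():  # hypothetical helper returning dicts
            batch.append(Advisory(
                summary=record['summary'],
                impacted_package_urls=[PackageURL.from_string(p) for p in record['impacted']],
                cve_id=record.get('cve_id'),
            ))
            if len(batch) == self.config.batch_size:
                yield batch
                batch = []
        if batch:
            yield batch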

 @dataclasses.dataclass
-class Advisory:
-    """
-    This data class expresses the contract between data sources and the import runner.
-    Data sources are expected to be usable as context managers and generators, yielding batches of Advisory sequences.
+class GitDataSourceConfiguration(DataSourceConfiguration):
+    repository_url: str
+    branch: Optional[str] = None
+    create_working_directory: bool = True
+    remove_working_directory: bool = True
+    working_directory: Optional[str] = None

-    NB: There are two representations for package URLs that are commonly used by code consuming this data class;
-    PackageURL objects and strings. As a convention, the former is referred to in variable names, etc. as
-    "package_urls" and the latter as "purls".
-    """
-    summary: str
-    impacted_package_urls: Sequence[PackageURL]
-    resolved_package_urls: Sequence[PackageURL] = dataclasses.field(default_factory=list)
-    references: Sequence[str] = dataclasses.field(default_factory=list)
-    cve_id: Optional[str] = None

-    @property
-    def impacted_purls(self):
-        return {str(p) for p in self.impacted_package_urls}
+class GitDataSource(DataSource):
+    CONFIG_CLASS = GitDataSourceConfiguration

-    @property
-    def resolved_purls(self):
-        return {str(p) for p in self.resolved_package_urls}
+    def validate_configuration(self) -> None:
+
+        if not self.config.create_working_directory and self.config.working_directory is None:
+            self.error('"create_working_directory" is not set and "working_directory" is set to the default, which '
+                       'calls tempfile.mkdtemp()')
+
+        if not self.config.create_working_directory and not os.path.exists(self.config.working_directory):
+            self.error('"working_directory" does not contain an existing directory and "create_working_directory" is '
+                       'not set')
+
+        if not self.config.remove_working_directory and self.config.working_directory is None:
+            self.error('"remove_working_directory" is not set and "working_directory" is set to the default, which '
+                       'calls tempfile.mkdtemp()')
+
+    def __enter__(self):
+        self._ensure_working_directory()
+        self._ensure_repository()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.config.remove_working_directory:
+            shutil.rmtree(self.config.working_directory)
+
+    def added_advisories(self) -> List[Advisory]:
+        raise NotImplementedError
+
+    def updated_advisories(self) -> List[Advisory]:
+        raise NotImplementedError
+
+    # TODO Sort out cutoff_date vs last_run_date. The former means "no entries older than one year",
+    # TODO not "the importer was last run on".
+    def added_files(
+        self,
+        subdir: Optional[str] = None,
+        recursive: bool = False,
+        file_ext: Optional[str] = None,
+    ) -> List[str]:
+
+        if subdir is None:
+            working_dir = self.config.working_directory
+        else:
+            working_dir = os.path.join(self.config.working_directory, subdir)
+
+        path = Path(working_dir)
+
+        if self.config.cutoff_date is None:
+            # Without a cutoff date, report every existing file as added.
+            glob = '**/*' if recursive else '*'
+
+            if file_ext:
+                glob = f'{glob}.{file_ext}'
+
+            return [str(p.relative_to(working_dir)) for p in path.glob(glob) if p.is_file()]
+
+        return self._collect_files(pygit2.GIT_DELTA_ADDED, subdir, recursive, file_ext)
+
+    def updated_files(
+        self,
+        subdir: Optional[str] = None,
+        recursive: bool = False,
+        file_ext: Optional[str] = None,
+    ) -> List[str]:
+
+        if self.config.cutoff_date is None:
+            return []
+
+        return self._collect_files(pygit2.GIT_DELTA_MODIFIED, subdir, recursive, file_ext)
+
+    # TODO Just filtering on the two status values for "added" and "modified" is too simplistic.
+    # TODO This does not cover file renames, copies & deletions.
+    def _collect_files(
+        self,
+        delta_status: int,
+        subdir: Optional[str],
+        recursive: bool,
+        file_ext: Optional[str],
+    ) -> List[str]:
+
+        cutoff = 0 if self.config.cutoff_date is None else int(self.config.cutoff_date.timestamp())
+        previous_commit = None
+        files = []
+
+        for commit in self._repo.walk(self._repo.head.target, pygit2.GIT_SORT_TIME):
+            if previous_commit is None:
+                previous_commit = commit
+                continue
+
+            deltas = commit.tree.diff_to_tree(previous_commit.tree).deltas
+            for d in deltas:
+                path = d.new_file.path
+
+                if d.status == delta_status and not d.is_binary and _include_file(path, subdir, recursive, file_ext):
+                    files.append(path)
+
+            if commit.commit_time < cutoff:
+                break
+
+            previous_commit = commit
+
+        return files
+
+    def _ensure_working_directory(self) -> None:
+        if self.config.working_directory is None:
+            self.config.working_directory = tempfile.mkdtemp()
+        elif self.config.create_working_directory and not os.path.exists(self.config.working_directory):
+            os.mkdir(self.config.working_directory)
+
+    def _ensure_repository(self) -> None:
+        repodir = pygit2.discover_repository(self.config.working_directory)
+        if repodir is None:
+            self._clone_repository()
+            return
+
+        self._repo = pygit2.Repository(repodir)
+
+        if self.config.branch is None:
+            self.config.branch = self._repo.head.shorthand
+        branch = self._repo.branches[self.config.branch]
+
+        if not branch.is_checked_out():
+            self._repo.checkout(branch)
+
+        remote = self._find_or_add_remote()
+        progress = remote.fetch()
+        if progress.received_objects == 0:
+            return
+
+        remote_branch = self._repo.branches[f'{remote.name}/{self.config.branch}']
+        branch.set_target(remote_branch.target)
+        self._repo.checkout(branch, strategy=pygit2.GIT_CHECKOUT_FORCE)
+
+    def _clone_repository(self):
+        kwargs = {}
+        if self.config.branch:
+            kwargs['checkout_branch'] = self.config.branch
+
+        self._repo = pygit2.clone_repository(self.config.repository_url, self.config.working_directory, **kwargs)
+
+    def _find_or_add_remote(self):
+        remote = None
+        for r in self._repo.remotes:
+            if r.url == self.config.repository_url:
+                remote = r
+                break
+
+        if remote is None:
+            remote = self._repo.remotes.create('added_by_vulnerablecode', self.config.repository_url)
+
+        return remote
+
+
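As a usage sketch under stated assumptions (MySource and the repository URL are hypothetical, and a concrete subclass would still have to turn files into Advisory objects), the intended flow is roughly:

# Hypothetical driver code, for illustration only.
source = MySource(
    batch_size=100,
    config={'repository_url': 'https://example.com/advisory-db.git'},
)
with source:  # __enter__() prepares the working directory and clones or fetches the repository
    for path in source.added_files(subdir='advisories', recursive=True, file_ext='yaml'):
        ...  # parse each returned file and build Advisory objects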
+def _include_file(
+    path: str,
+    subdir: Optional[str] = None,
+    recursive: bool = False,
+    file_ext: Optional[str] = None,
+) -> bool:
+    match = True
+
+    if subdir:
+        if not subdir.endswith(os.path.sep):
+            subdir = f'{subdir}{os.path.sep}'
+
+        match = match and path.startswith(subdir)
+
+    if not recursive:
+        # Exclude files nested deeper than the (optional) subdirectory.
+        match = match and (os.path.sep not in path[len(subdir or ''):])
+
+    if file_ext:
+        match = match and path.endswith(f'.{file_ext}')
+
+    return match
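For clarity, a few hypothetical checks of the filter's behaviour, assuming POSIX path separators (the file names are invented):

# _include_file() matches files directly inside subdir unless recursive is set.
assert _include_file('advisories/CVE-2020-0001.yaml', subdir='advisories', file_ext='yaml')
assert not _include_file('advisories/2020/CVE-2020-0001.yaml', subdir='advisories', file_ext='yaml')
assert _include_file('advisories/2020/CVE-2020-0001.yaml', subdir='advisories', recursive=True, file_ext='yaml')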