Skip to content

Commit 6c4c5a1

Browse files
committed
WIP
Signed-off-by: Haiko Schol <[email protected]>
1 parent 7f1e8ab commit 6c4c5a1

File tree

86 files changed

+2908
-30
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

86 files changed

+2908
-30
lines changed

requirements.txt

+7-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
asgiref==3.2.7
12
attrs==19.3.0
23
beautifulsoup4==4.7.1
4+
cached-property==1.5.1
5+
cffi==1.14.0
36
dephell-specifier==0.2.1
47
dj-database-url==0.4.2
58
Django==3.0.3
@@ -15,20 +18,22 @@ pluggy==0.13.1
1518
psycopg2==2.8.4
1619
py==1.8.0
1720
pycodestyle==2.5.0
21+
pycparser==2.20
22+
pygit2==1.2.0
1823
pyparsing==2.4.5
1924
pytest==5.3.2
2025
pytest-dependency==0.4.0
2126
pytest-django==3.7.0
2227
pytest-mock==1.13.0
28+
pytoml==0.1.21
2329
pytz==2019.3
2430
PyYAML==5.3
2531
saneyaml==0.4
32+
schema==0.7.1
2633
six==1.13.0
2734
soupsieve==1.9.5
2835
sqlparse==0.3.0
2936
tqdm==4.41.1
3037
wcwidth==0.1.7
3138
whitenoise==5.0.1
3239
zipp==0.6.0
33-
pytoml==0.1.21
34-
schema==0.7.1

vulnerabilities/data_source.py

+270-27
Original file line numberDiff line numberDiff line change
@@ -20,80 +20,323 @@
2020
# VulnerableCode is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/nexB/vulnerablecode/ for support and download.
2222

23+
import dataclasses
24+
import os
25+
import shutil
26+
import tempfile
2327
from datetime import datetime
28+
from pathlib import Path
2429
from typing import Any
2530
from typing import ContextManager
31+
from typing import List
2632
from typing import Mapping
2733
from typing import Optional
2834
from typing import Sequence
29-
import dataclasses
35+
from typing import Set
3036

37+
import pygit2
3138
from packageurl import PackageURL
3239

3340

3441
@dataclasses.dataclass
class Advisory:
    """
    Contract between data sources and the import runner.

    Data sources are expected to be usable as context managers and generators, yielding batches of Advisory sequences.

    Naming convention used by consumers of this class: variables holding PackageURL objects are called
    "package_urls", while variables holding their string representations are called "purls".
    """
    summary: str
    impacted_package_urls: Sequence[PackageURL]
    resolved_package_urls: Sequence[PackageURL] = dataclasses.field(default_factory=list)
    references: Sequence[str] = dataclasses.field(default_factory=list)
    cve_id: Optional[str] = None

    @property
    def impacted_purls(self) -> Set[str]:
        """Return the impacted package URLs as a set of purl strings."""
        return set(map(str, self.impacted_package_urls))

    @property
    def resolved_purls(self) -> Set[str]:
        """Return the resolved package URLs as a set of purl strings."""
        return set(map(str, self.resolved_package_urls))
64+
65+
66+
class InvalidConfigurationError(Exception):
    # Raised when a DataSource is given configuration it cannot use; see DataSource.__init__()
    # and DataSource.error().
    pass
68+
69+
70+
@dataclasses.dataclass
class DataSourceConfiguration:
    """
    Base configuration shared by all data sources; subclasses add source-specific settings.
    """
    # Maximum number of records per batch returned by added_advisories()/updated_advisories().
    batch_size: int
73+
74+
3575
class DataSource(ContextManager):
    """
    This class defines how importers consume advisories from a data source.

    It makes a distinction between newly added records since the last run and modified records. This allows the import
    logic to pick appropriate database operations.
    """

    # Subclasses override this with their own DataSourceConfiguration subclass.
    CONFIG_CLASS = DataSourceConfiguration

    def __init__(
        self,
        batch_size: int,
        last_run_date: Optional[datetime] = None,
        cutoff_date: Optional[datetime] = None,
        config: Optional[Mapping[str, Any]] = None,
    ):
        """
        Create a DataSource instance.

        :param batch_size: Maximum number of records to return from added_advisories() and updated_advisories()
        :param last_run_date: Optional timestamp when this data source was last inspected
        :param cutoff_date: Optional timestamp, records older than this will be ignored
        :param config: Optional dictionary with subclass-specific configuration
        :raises InvalidConfigurationError: if CONFIG_CLASS rejects the configuration
        """
        config = config or {}
        try:
            self.config = self.__class__.CONFIG_CLASS(batch_size, **config)
            # These really should be declared in DataSourceConfiguration above but that would prevent DataSource
            # subclasses from declaring mandatory parameters (i.e. positional arguments)
            setattr(self.config, 'last_run_date', last_run_date)
            setattr(self.config, 'cutoff_date', cutoff_date)
        except Exception as e:
            # Chain the original exception (`from e`) so the root cause of a bad configuration
            # stays visible in the traceback instead of being reduced to its message only.
            raise InvalidConfigurationError(str(e)) from e

        self.validate_configuration()

    def __enter__(self):
        """
        Subclasses acquire per-run resources, such as network connections, file downloads, etc. here.
        """
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Subclasses release per-run resources acquired in __enter__() here.
        """
        pass

    def validate_configuration(self) -> None:
        """
        Subclasses can perform more complex validation than what is handled by data classes and their type annotations.

        This method is called in the constructor. It should raise InvalidConfigurationError with a human-readable
        message.
        """
        pass

    def added_advisories(self) -> List[Advisory]:
        """
        Subclasses yield batch_size sized batches of Advisory objects that have been added to the data source
        since self.cutoff_date.
        """
        raise StopIteration

    def updated_advisories(self) -> List[Advisory]:
        """
        Subclasses yield batch_size sized batches of Advisory objects that have been modified since
        self.cutoff_date.

        NOTE: Data sources that do not enable detection of changes to existing records vs added records must only
        implement this method, not added_advisories(). The ImportRunner relies on this contract to decide between
        insert and update operations.
        """
        raise StopIteration

    def error(self, msg: str) -> None:
        """
        Helper method for raising InvalidConfigurationError with the class name in the message.
        """
        raise InvalidConfigurationError(f'{type(self).__name__}: {msg}')
156+
76157

77158
@dataclasses.dataclass
class GitDataSourceConfiguration(DataSourceConfiguration):
    # URL of the git repository advisories are read from.
    repository_url: str
    # Branch to check out; when None, GitDataSource falls back to the repository's HEAD shorthand.
    branch: Optional[str] = None
    # When True, the working directory is created if missing (tempfile.mkdtemp() when unset).
    create_working_directory: bool = True
    # When True, the working directory is deleted in GitDataSource.__exit__().
    remove_working_directory: bool = True
    # Directory holding the checkout; None means "use a fresh temporary directory".
    working_directory: Optional[str] = None
82165

83-
NB: There are two representations for package URLs that are commonly used by code consuming this data class;
84-
PackageURL objects and strings. As a convention, the former is referred to in variable names, etc. as
85-
"package_urls" and the latter as "purls".
86-
"""
87-
summary: str
88-
impacted_package_urls: Sequence[PackageURL]
89-
resolved_package_urls: Sequence[PackageURL] = dataclasses.field(default_factory=list)
90-
references: Sequence[str] = dataclasses.field(default_factory=list)
91-
cve_id: Optional[str] = None
92166

93-
@property
94-
def impacted_purls(self):
95-
return {str(p) for p in self.impacted_package_urls}
167+
class GitDataSource(DataSource):
    """
    Base class for data sources that read advisories from a git repository checkout.
    """
    CONFIG_CLASS = GitDataSourceConfiguration

    def validate_configuration(self) -> None:
        """
        Reject configurations whose working-directory settings contradict each other.
        """
        if not self.config.create_working_directory and self.config.working_directory is None:
            self.error('"create_working_directory" is not set but "working_directory" is set to the default, which '
                       'calls tempfile.mkdtemp()')

        if not self.config.create_working_directory and not os.path.exists(self.config.working_directory):
            self.error('"working_directory" does not contain an existing directory and "create_working_directory" is '
                       'not set')

        if not self.config.remove_working_directory and self.config.working_directory is None:
            self.error('"remove_working_directory" is not set and "working_directory" is set to the default, which '
                       'calls tempfile.mkdtemp()')

    def __enter__(self):
        """Prepare the working directory and clone or update the repository."""
        self._ensure_working_directory()
        self._ensure_repository()

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Delete the working directory when the configuration asks for it."""
        if self.config.remove_working_directory:
            shutil.rmtree(self.config.working_directory)

    def added_advisories(self) -> List[Advisory]:
        raise NotImplementedError

    def updated_advisories(self) -> List[Advisory]:
        raise NotImplementedError

    # TODO Sort out cutoff_date vs last_run_date. The former is "no entries older than one year",
    # TODO not "the importer was last run on"
    def added_files(
        self,
        subdir: str = None,
        recursive: bool = False,
        file_ext: Optional[str] = None
    ) -> List[str]:
        """
        Return paths, relative to the working directory, of files added since the cutoff date.

        When no cutoff date is configured, every matching file currently in the checkout is
        considered "added".

        :param subdir: Only consider files below this subdirectory of the checkout
        :param recursive: Also consider files in nested directories
        :param file_ext: Only consider files with this extension (without the leading dot)
        """
        if subdir is None:
            working_dir = self.config.working_directory
        else:
            working_dir = os.path.join(self.config.working_directory, subdir)

        path = Path(working_dir)

        if self.config.cutoff_date is None:
            # No cutoff: list the current tree instead of walking git history.
            if recursive:
                glob = '**/*'
            else:
                glob = '*'

            if file_ext:
                glob = f'{glob}.{file_ext}'

            return [str(p.relative_to(working_dir)) for p in path.glob(glob) if p.is_file()]

        return self._collect_files(pygit2.GIT_DELTA_ADDED, subdir, recursive, file_ext)

    def updated_files(
        self,
        subdir: str = None,
        recursive: bool = False,
        file_ext: str = None
    ) -> List[str]:
        """
        Return paths, relative to the working directory, of files modified since the cutoff date.

        Without a cutoff date there is no previous state to compare against, so nothing counts
        as updated.
        """
        if self.config.cutoff_date is None:
            return []

        return self._collect_files(pygit2.GIT_DELTA_MODIFIED, subdir, recursive, file_ext)

    # TODO Just filtering on the two status values for "added" and "modified" is too simplistic.
    # TODO This does not cover file renames, copies & deletions.
    def _collect_files(
        self,
        delta_status: int,
        subdir: Optional[str],
        recursive: bool,
        file_ext: Optional[str],
    ) -> List[str]:
        """
        Walk the history from HEAD, collecting paths of non-binary deltas matching delta_status,
        and stop once a commit older than the cutoff date is reached.
        """
        cutoff = 0 if self.config.cutoff_date is None else int(self.config.cutoff_date.timestamp())
        previous_commit = None
        files = []

        for commit in self._repo.walk(self._repo.head.target, pygit2.GIT_SORT_TIME):
            # The first commit seen has nothing to diff against yet.
            if previous_commit is None:
                previous_commit = commit
                continue

            deltas = commit.tree.diff_to_tree(previous_commit.tree).deltas
            for d in deltas:
                path = d.new_file.path

                if d.status == delta_status and not d.is_binary and _include_file(path, subdir, recursive, file_ext):
                    files.append(path)

            if commit.commit_time < cutoff:
                break

            previous_commit = commit

        return files

    def _ensure_working_directory(self) -> None:
        """Create the working directory if needed; default to a fresh temporary directory."""
        if self.config.working_directory is None:
            self.config.working_directory = tempfile.mkdtemp()
        elif self.config.create_working_directory and not os.path.exists(self.config.working_directory):
            os.mkdir(self.config.working_directory)

    def _ensure_repository(self) -> None:
        """Clone the repository if none exists yet, otherwise fetch and update the branch."""
        repodir = pygit2.discover_repository(self.config.working_directory)
        if repodir is None:
            self._clone_repository()
            return

        self._repo = pygit2.Repository(repodir)

        if self.config.branch is None:
            self.config.branch = self._repo.head.shorthand
        branch = self._repo.branches[self.config.branch]

        if not branch.is_checked_out():
            self._repo.checkout(branch)

        remote = self._find_or_add_remote()
        progress = remote.fetch()
        if progress.received_objects == 0:
            # Nothing fetched; the checkout is already up to date.
            return

        remote_branch = self._repo.branches[f'{remote.name}/{self.config.branch}']
        branch.set_target(remote_branch.target)
        self._repo.checkout(branch, strategy=pygit2.GIT_CHECKOUT_FORCE)

    def _clone_repository(self) -> None:
        """Clone the configured repository, checking out the configured branch when one is set."""
        kwargs = {}
        # BUG FIX: the branch setting lives on self.config, not on self, so the original
        # getattr(self, 'branch', False) was always falsy and checkout_branch was never passed.
        if self.config.branch:
            kwargs['checkout_branch'] = self.config.branch

        self._repo = pygit2.clone_repository(self.config.repository_url, self.config.working_directory, **kwargs)

    def _find_or_add_remote(self):
        """Return the remote whose URL matches the configured repository, creating it if absent."""
        remote = None
        for r in self._repo.remotes:
            if r.url == self.config.repository_url:
                remote = r
                break

        if remote is None:
            remote = self._repo.remotes.create('added_by_vulnerablecode', self.config.repository_url)

        return remote
320+
321+
322+
def _include_file(
323+
path: str,
324+
subdir: Optional[str] = None,
325+
recursive: bool = False,
326+
file_ext: Optional[str] = None,
327+
) -> bool:
328+
match = True
329+
330+
if subdir:
331+
if not subdir.endswith(os.path.sep):
332+
subdir = f'{subdir}{os.path.sep}'
333+
334+
match = match and path.startswith(subdir)
335+
336+
if not recursive:
337+
match = match and (os.path.sep not in path[len(subdir or ''):])
338+
339+
if file_ext:
340+
match = match and path.endswith(f'.{file_ext}')
341+
342+
return match

vulnerabilities/models.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -124,4 +124,6 @@ def make_data_source(self, cutoff_date=None, batch_size=None) -> DataSource:
124124
cd = cutoff_date or self.last_run
125125
importers_module = importlib.import_module('vulnerabilities.importers')
126126
klass = getattr(importers_module, self.data_source)
127-
return klass(cutoff_date=cd, batch_size=batch_size, config=self.data_source_cfg)
127+
ds = klass(cutoff_date=cd, batch_size=batch_size, config=self.data_source_cfg)
128+
ds.apply_config()
129+
return ds

0 commit comments

Comments
 (0)