Skip to content

Commit 08ef849

Browse files
committed
WIP
Signed-off-by: Haiko Schol <[email protected]>
1 parent 3265803 commit 08ef849

15 files changed

+359
-198
lines changed

vulnerabilities/admin.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,11 @@
2525

2626
from vulnerabilities.models import (
2727
ImpactedPackage,
28+
Importer,
2829
Package,
29-
PackageReference,
3030
ResolvedPackage,
3131
Vulnerability,
32-
VulnerabilityReference
32+
VulnerabilityReference,
3333
)
3434

3535

@@ -58,6 +58,6 @@ class ResolvedPackageAdmin(admin.ModelAdmin):
5858
pass
5959

6060

61-
@admin.register(PackageReference)
62-
class PackageReferenceAdmin(admin.ModelAdmin):
61+
@admin.register(Importer)
62+
class ImporterAdmin(admin.ModelAdmin):
6363
pass

vulnerabilities/api.py

-16
Original file line numberDiff line numberDiff line change
@@ -22,29 +22,15 @@
2222
# Visit https://github.com/nexB/vulnerablecode/ for support and download.
2323

2424
from rest_framework import serializers
25-
from rest_framework import status
2625
from rest_framework import viewsets
27-
from rest_framework.response import Response
2826

2927
from packageurl import PackageURL
3028

3129
from vulnerabilities.models import Package
32-
from vulnerabilities.models import PackageReference
3330
from vulnerabilities.models import Vulnerability
3431
from vulnerabilities.models import VulnerabilityReference
3532

3633

37-
class PackageReferenceSerializer(serializers.ModelSerializer):
38-
class Meta:
39-
model = PackageReference
40-
fields = [
41-
'repository',
42-
'platform',
43-
'name',
44-
'version',
45-
]
46-
47-
4834
class VulnerabilityReferenceSerializer(serializers.ModelSerializer):
4935
class Meta:
5036
model = VulnerabilityReference
@@ -69,7 +55,6 @@ class Meta:
6955

7056
class PackageSerializer(serializers.ModelSerializer):
7157
vulnerabilities = VulnerabilitySerializer(many=True)
72-
references = PackageReferenceSerializer(source='packagereference_set', many=True)
7358

7459
class Meta:
7560
model = Package
@@ -78,7 +63,6 @@ class Meta:
7863
'version',
7964
'package_url',
8065
'vulnerabilities',
81-
'references',
8266
]
8367

8468

vulnerabilities/data_dump.py

-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323

2424
from vulnerabilities.models import ImpactedPackage
2525
from vulnerabilities.models import Package
26-
from vulnerabilities.models import PackageReference
2726
from vulnerabilities.models import ResolvedPackage
2827
from vulnerabilities.models import Vulnerability
2928
from vulnerabilities.models import VulnerabilityReference

vulnerabilities/data_source.py

+61-21
Original file line numberDiff line numberDiff line change
@@ -20,61 +20,101 @@
2020
# VulnerableCode is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/nexB/vulnerablecode/ for support and download.
2222

23-
from attr import dataclass
2423
from datetime import datetime
2524
from typing import Any
2625
from typing import Mapping
26+
from typing import Optional
2727
from typing import Sequence
28+
from typing import Set
29+
import dataclasses
2830

31+
from packageurl import PackageURL
2932

30-
@dataclass
33+
34+
@dataclasses.dataclass
3135
class DataSource:
3236
"""
3337
TODO
3438
"""
3539
batch_size: int
36-
cutoff_date: datetime
37-
config: Mapping[str, Any]
40+
cutoff_date: Optional[datetime] = None
41+
config: Optional[Mapping[str, Any]] = dataclasses.field(default_factory=dict)
3842

3943
def __enter__(self):
4044
"""
4145
Subclasses acquire per-run resources, such as network connections, file downloads, etc. here.
4246
"""
43-
pass
47+
return self
4448

4549
def __exit__(self, exc_type, exc_val, exc_tb):
4650
"""
4751
Subclasses release per-run resources acquired in __enter__() here.
4852
"""
4953
pass
5054

51-
def __next__(self):
55+
def new_records(self):
5256
"""
53-
Subclasses return batch_size sized batches of VulnerabilityInfo objects
57+
Subclasses return batch_size sized batches of VulnerabilityInfo objects that have been added to the data source
58+
since self.cutoff_date.
5459
"""
55-
pass
60+
raise StopIteration
61+
62+
def updated_records(self):
63+
"""
64+
Subclasses return batch_size sized batches of VulnerabilityInfo objects that have been modified since
65+
self.cutoff_date.
66+
67+
NOTE: Data sources that do not enable detection of changes to existing records vs added records must only
68+
implement this method, not new_records(). The ImportRunner relies on this contract to decide between
69+
insert and update operations.
70+
"""
71+
raise StopIteration
5672

5773

5874
# The following data classes express the contract between data sources and the import runner.
59-
# Data sources are expected to be usable as context managers and generators, yielding
60-
# batches of VulnerabilityInfo sequences.
75+
# Data sources are expected to be usable as context managers and generators, yielding batches of VulnerabilityInfo
76+
# sequences.
6177

62-
@dataclass
78+
@dataclasses.dataclass(frozen=True)
6379
class Package:
6480
name: str
65-
namespace: str
6681
type: str
6782
version: str
68-
qualifiers: str
69-
subpath: str
70-
references: Sequence[str]
83+
namespace: str = ''
84+
qualifiers: str = ''
85+
subpath: str = ''
86+
87+
@property
88+
def package_url(self) -> str:
89+
"""
90+
Return a compact Package URL "purl" string.
91+
"""
92+
purl = PackageURL(
93+
self.type, self.namespace, self.name,
94+
self.version, self.qualifiers, self.subpath
95+
)
96+
return str(purl)
7197

7298

73-
@dataclass
99+
@dataclasses.dataclass
74100
class VulnerabilityInfo:
75-
cve_id: str
76101
summary: str
77-
affected_packages: Sequence[Package]
78-
unaffected_packages: Sequence[Package]
79-
fixed_packages: Sequence[Package]
80-
references: Sequence[str]
102+
impacted_packages: Sequence[Package]
103+
resolved_packages: Sequence[Package] = dataclasses.field(default_factory=list)
104+
references: Sequence[str] = dataclasses.field(default_factory=list)
105+
cve_id: Optional[str] = None
106+
107+
_impacted_purls: Set[str] = dataclasses.field(init=False, compare=False, default=None)
108+
_resolved_purls: Set[str] = dataclasses.field(init=False, compare=False, default=None)
109+
110+
@property
111+
def impacted_package_urls(self):
112+
if self._impacted_purls is None:
113+
self._impacted_purls = {p.package_url for p in self.impacted_packages}
114+
return self._impacted_purls
115+
116+
@property
117+
def resolved_package_urls(self):
118+
if self._resolved_purls is None:
119+
self._resolved_purls = {p.package_url for p in self.resolved_packages}
120+
return self._resolved_purls

vulnerabilities/import_runner.py

+169-9
Original file line numberDiff line numberDiff line change
@@ -23,29 +23,189 @@
2323

2424
import datetime
2525
import logging
26+
from typing import List
27+
from typing import Mapping
28+
from typing import Sequence
29+
from typing import Set
30+
from typing import Tuple
2631

32+
from django.db import IntegrityError
33+
from django.db import transaction
34+
35+
from vulnerabilities import models
36+
from vulnerabilities.data_source import Package
37+
from vulnerabilities.data_source import VulnerabilityInfo
2738

2839
logger = logging.getLogger(__name__)
2940

3041

31-
# TODO This really should use asyncio for network and database, but sadly the Django ORM won't allow it.
3242
class ImportRunner:
43+
"""
44+
The ImportRunner is responsible for inserting and updating data about vulnerabilities and
45+
affected/unaffected/fixed packages in the database. The two main goals for the implementation are correctness and
46+
efficiency.
47+
48+
Correctness:
49+
- There must be no duplicates in the database (should be enforced by the schema).
50+
- No valid data from the data source must be skipped or truncated.
3351
52+
Efficiency:
53+
- Bulk inserts should be used whenever possible.
54+
- Checking whether a record already exists should be kept to a minimum
55+
(the data source should know this instead).
56+
- All update and select operations must use indexed columns.
57+
"""
3458
def __init__(self, importer, batch_size=None):
3559
self.importer = importer
3660
self.batch_size = batch_size
3761

38-
def run(self, cutoff_date=None):
62+
def run(self, cutoff_date=None) -> None:
63+
"""
64+
Create a data source for the given importer and store the data retrieved in the database.
65+
66+
NB: Data sources provide two kinds of records: vulnerabilities and packages. Vulnerabilities are potentially
67+
shared across many packages, from the same data source and from different data sources. For example, a
68+
vulnerability in the Linux kernel is mentioned by advisories from all Linux distributions that package this
69+
kernel version.
70+
"""
3971
logger.debug(f'Starting import for {self.importer.name}.')
4072
data_source = self.importer.make_data_source(cutoff_date=cutoff_date, batch_size=self.batch_size)
73+
4174
with data_source as ds:
42-
for batch in ds:
43-
# TODO
44-
# Check if any Vulnerability or Package from this batch already exists in the DB
45-
# If not: Bulk insert everything
46-
# If yes: Update existing ones and bulk insert the rest
47-
pass
75+
for batch in ds.new_records():
76+
impacted, resolved = _collect_packages_from_batch(batch)
77+
impacted, resolved = _bulk_insert_packages(impacted, resolved)
78+
79+
vulnerabilities = _insert_vulnerabilities_and_references(batch)
80+
81+
_bulk_insert_impacted_and_resolved_packages(batch, vulnerabilities, impacted, resolved)
4882

4983
self.importer.last_run = datetime.datetime.utcnow()
5084
self.importer.save()
51-
logger.debug(f'Successfully finished import for {self.importer.name}.')
85+
86+
logger.debug(f'Successfully finished import for {self.importer.name}.')
87+
88+
89+
def _bulk_insert_impacted_and_resolved_packages(
90+
batch: Sequence[VulnerabilityInfo],
91+
vulnerability_models: Set[models.Vulnerability],
92+
impacted_package_models: Mapping[str, models.Package],
93+
resolved_package_models: Mapping[str, models.Package],
94+
) -> None:
95+
96+
impacted_refs: List[models.ImpactedPackage] = []
97+
resolved_refs: List[models.ResolvedPackage] = []
98+
99+
for vuln_info in batch:
100+
vuln_model = _vuln_info_to_vuln_model(vuln_info, vulnerability_models)
101+
vulnerability_models.remove(vuln_model) # minor optimization
102+
103+
for impacted_package in vuln_info.impacted_packages:
104+
ip = models.ImpactedPackage(
105+
vulnerability=vuln_model,
106+
package=impacted_package_models[impacted_package.package_url]
107+
)
108+
impacted_refs.append(ip)
109+
110+
for resolved_package in vuln_info.resolved_packages:
111+
ip = models.ResolvedPackage(
112+
vulnerability=vuln_model,
113+
package=resolved_package_models[resolved_package.package_url]
114+
)
115+
resolved_refs.append(ip)
116+
117+
models.ImpactedPackage.objects.bulk_create(impacted_refs)
118+
models.ResolvedPackage.objects.bulk_create(resolved_refs)
119+
120+
121+
@transaction.atomic
122+
def _insert_vulnerabilities_and_references(batch: Sequence[VulnerabilityInfo]) -> Set[models.Vulnerability]:
123+
"""
124+
TODO Consider refactoring to use bulk_create() and avoid get_or_create() when possible.
125+
"""
126+
vulnerabilities = set()
127+
128+
for vuln_info in batch:
129+
vuln: models.Vulnerability
130+
131+
if vuln_info.cve_id:
132+
vuln, created = models.Vulnerability.objects.get_or_create(cve_id=vuln_info.cve_id)
133+
if created:
134+
vuln.summary = vuln_info.summary
135+
vuln.save()
136+
else:
137+
# FIXME Currently there is no way to check whether a vulnerability without a CVE ID already exists in the
138+
# FIXME database.
139+
vuln = models.Vulnerability.objects.create(summary=vuln_info.summary)
140+
141+
vulnerabilities.add(vuln)
142+
143+
for url in vuln_info.references:
144+
try:
145+
models.VulnerabilityReference.objects.create(vulnerability=vuln, url=url)
146+
except IntegrityError:
147+
# This vulnerability reference already exists, nothing to do.
148+
# TODO Find a more efficient way to do this rather than trying and ignoring any errors.
149+
pass
150+
151+
return vulnerabilities
152+
153+
154+
def _vuln_info_to_vuln_model(
155+
vuln_info: VulnerabilityInfo,
156+
vulnerability_models: Set[models.Vulnerability]
157+
) -> models.Vulnerability:
158+
159+
for v in vulnerability_models:
160+
if vuln_info.cve_id and vuln_info.cve_id == v.cve_id:
161+
return v
162+
163+
if vuln_info.summary == v.summary:
164+
return v
165+
166+
raise RuntimeError(f'No Vulnerability model object found for this VulnerabilityInfo: {vuln_info.summary}')
167+
168+
169+
def _collect_packages_from_batch(batch: Sequence[VulnerabilityInfo]) -> Tuple[Set[Package], Set[Package]]:
170+
impacted, resolved = set(), set()
171+
172+
for vuln_info in batch:
173+
impacted.update(vuln_info.impacted_packages)
174+
resolved.update(vuln_info.resolved_packages)
175+
176+
return impacted, resolved
177+
178+
179+
def _bulk_insert_packages(
180+
impacted: Set[Package],
181+
resolved: Set[Package]
182+
) -> Tuple[Mapping[str, Package], Mapping[str, Package]]:
183+
184+
pkg_models = [_to_package_model(p) for p in impacted.union(resolved)]
185+
pkg_models = models.Package.objects.bulk_create(pkg_models)
186+
187+
impacted_purls = {p.package_url for p in impacted}
188+
resolved_purls = {p.package_url for p in resolved}
189+
190+
impacted_models, resolved_models = {}, {}
191+
192+
for pkg_model in pkg_models:
193+
purl = pkg_model.package_url
194+
195+
if purl in impacted_purls:
196+
impacted_models[purl] = pkg_model
197+
elif purl in resolved_purls:
198+
resolved_models[purl] = pkg_model
199+
200+
return impacted_models, resolved_models
201+
202+
203+
def _to_package_model(pkg: Package) -> models.Package:
204+
return models.Package(
205+
name=pkg.name,
206+
type=pkg.type,
207+
version=pkg.version,
208+
namespace=pkg.namespace,
209+
qualifiers=pkg.qualifiers,
210+
subpath=pkg.subpath,
211+
)
File renamed without changes.

0 commit comments

Comments
 (0)