23
23
24
24
import datetime
25
25
import logging
26
+ from typing import List
27
+ from typing import Mapping
28
+ from typing import Sequence
29
+ from typing import Set
30
+ from typing import Tuple
26
31
32
+ from django .db import IntegrityError
33
+ from django .db import transaction
34
+
35
+ from vulnerabilities import models
36
+ from vulnerabilities .data_source import Package
37
+ from vulnerabilities .data_source import VulnerabilityInfo
27
38
28
39
logger = logging .getLogger (__name__ )
29
40
30
41
31
- # TODO This really should use asyncio for network and database, but sadly the Django ORM won't allow it.
32
42
class ImportRunner:
    """
    Stores the vulnerabilities and the affected/unaffected/fixed packages delivered by an importer's data source in
    the database, inserting new rows and updating existing ones. The implementation aims for correctness and
    efficiency.

    Correctness:
        - The database must not contain duplicates (the schema should enforce this).
        - Valid data coming from the data source must never be skipped or truncated.

    Efficiency:
        - Prefer bulk inserts wherever they are possible.
        - Keep checks for already existing records to a minimum
          (the data source is expected to know this instead).
        - Update and select operations must only touch indexed columns.
    """

    def __init__(self, importer, batch_size=None):
        # The importer provides the data source; batch_size is handed to it unchanged in run().
        self.importer, self.batch_size = importer, batch_size

    def run(self, cutoff_date=None) -> None:
        """
        Build the importer's data source and persist every batch of new records it yields.

        For each batch the packages are inserted first, then the vulnerabilities together with their references,
        and finally the rows linking vulnerabilities to their impacted and resolved packages. Afterwards the
        importer's `last_run` timestamp is updated and the importer is saved.

        NB: A data source yields two kinds of records: vulnerabilities and packages. One vulnerability may be
        shared by many packages, within a single data source as well as across data sources. A Linux kernel
        vulnerability, for instance, shows up in the advisories of every distribution shipping that kernel version.
        """
        logger.debug(f'Starting import for { self .importer .name } .')

        with self.importer.make_data_source(cutoff_date=cutoff_date, batch_size=self.batch_size) as source:
            for records in source.new_records():
                # Packages go in first so that the link rows created below can point at their models.
                impacted_models, resolved_models = _bulk_insert_packages(*_collect_packages_from_batch(records))

                vulnerability_models = _insert_vulnerabilities_and_references(records)

                _bulk_insert_impacted_and_resolved_packages(
                    records,
                    vulnerability_models,
                    impacted_models,
                    resolved_models,
                )

        finished_at = datetime.datetime.utcnow()
        self.importer.last_run = finished_at
        self.importer.save()

        logger.debug(f'Successfully finished import for { self .importer .name } .')
87
+
88
+
89
def _bulk_insert_impacted_and_resolved_packages(
        batch: Sequence[VulnerabilityInfo],
        vulnerability_models: Set[models.Vulnerability],
        impacted_package_models: Mapping[str, models.Package],
        resolved_package_models: Mapping[str, models.Package],
) -> None:
    """
    Create the rows that link each vulnerability of the batch to its impacted and resolved packages.

    For every VulnerabilityInfo the matching Vulnerability model is looked up in `vulnerability_models`. One
    ImpactedPackage / ResolvedPackage object is then prepared per referenced package and everything is written with
    one bulk insert per model class.

    :param batch: The records whose package relations should be stored.
    :param vulnerability_models: Vulnerability models belonging to the batch. This set is not modified.
    :param impacted_package_models: Package models of the impacted packages, keyed by package URL.
    :param resolved_package_models: Package models of the resolved packages, keyed by package URL.
    :raises RuntimeError: If no Vulnerability model matches one of the records.
    :raises KeyError: If a package URL of a record is missing from the corresponding mapping.
    """
    impacted_refs: List[models.ImpactedPackage] = []
    resolved_refs: List[models.ResolvedPackage] = []

    for vuln_info in batch:
        # The matched model deliberately stays in the set: several records of one batch can resolve to the same
        # model (e.g. two advisories for one CVE ID), so removing it after the first hit would make the next
        # lookup fail. It also keeps the caller's set untouched.
        vuln_model = _vuln_info_to_vuln_model(vuln_info, vulnerability_models)

        impacted_refs.extend(
            models.ImpactedPackage(
                vulnerability=vuln_model,
                package=impacted_package_models[impacted_package.package_url],
            )
            for impacted_package in vuln_info.impacted_packages
        )

        resolved_refs.extend(
            models.ResolvedPackage(
                vulnerability=vuln_model,
                package=resolved_package_models[resolved_package.package_url],
            )
            for resolved_package in vuln_info.resolved_packages
        )

    models.ImpactedPackage.objects.bulk_create(impacted_refs)
    models.ResolvedPackage.objects.bulk_create(resolved_refs)
119
+
120
+
121
@transaction.atomic
def _insert_vulnerabilities_and_references(batch: Sequence[VulnerabilityInfo]) -> Set[models.Vulnerability]:
    """
    Store the vulnerabilities of the batch together with their reference URLs, all within one transaction.

    Records with a CVE ID reuse the existing Vulnerability with that ID or create a new one carrying the record's
    summary. Records without a CVE ID always create a new Vulnerability. References that already exist are skipped.

    TODO Consider refactoring to use bulk_create() and avoid get_or_create() when possible.

    :param batch: The records to store.
    :return: The Vulnerability models that belong to the records of the batch.
    """
    vulnerabilities = set()

    for vuln_info in batch:
        vuln: models.Vulnerability

        if vuln_info.cve_id:
            # `defaults` is only applied when the row is created, so an existing summary is never overwritten.
            vuln, _ = models.Vulnerability.objects.get_or_create(
                cve_id=vuln_info.cve_id,
                defaults={'summary': vuln_info.summary},
            )
        else:
            # FIXME Currently there is no way to check whether a vulnerability without a CVE ID already exists in the
            # FIXME database.
            vuln = models.Vulnerability.objects.create(summary=vuln_info.summary)

        vulnerabilities.add(vuln)

        for url in vuln_info.references:
            try:
                # The nested atomic block creates a savepoint. Without it, a failed insert would leave the outer
                # transaction in a broken state and every following query in this function would fail.
                with transaction.atomic():
                    models.VulnerabilityReference.objects.create(vulnerability=vuln, url=url)
            except IntegrityError:
                # This vulnerability reference already exists, nothing to do.
                # TODO Find a more efficient way to do this rather than trying and ignoring any errors.
                pass

    return vulnerabilities
152
+
153
+
154
def _vuln_info_to_vuln_model(
        vuln_info: VulnerabilityInfo,
        vulnerability_models: Set[models.Vulnerability]
) -> models.Vulnerability:
    """
    Find the Vulnerability model that corresponds to the given VulnerabilityInfo.

    A matching CVE ID always wins. The summary is only compared when the record has no CVE ID or no model carries
    that CVE ID; otherwise a model with an identical summary but a different CVE ID could be returned by accident.

    :param vuln_info: The record to find the model for.
    :param vulnerability_models: The candidate models.
    :return: The first candidate with the same CVE ID, else the first candidate with the same summary.
    :raises RuntimeError: If no candidate matches.
    """
    if vuln_info.cve_id:
        for v in vulnerability_models:
            if v.cve_id == vuln_info.cve_id:
                return v

    for v in vulnerability_models:
        if v.summary == vuln_info.summary:
            return v

    raise RuntimeError(f'No Vulnerability model object found for this VulnerabilityInfo: { vuln_info .summary } ')
167
+
168
+
169
def _collect_packages_from_batch(batch: Sequence[VulnerabilityInfo]) -> Tuple[Set[Package], Set[Package]]:
    """
    Gather the distinct packages mentioned by the records of the batch.

    :param batch: The records to scan.
    :return: A pair of sets; the first holds all impacted packages, the second all resolved packages.
    """
    all_impacted = set()
    all_resolved = set()

    for info in batch:
        for pkg in info.impacted_packages:
            all_impacted.add(pkg)

        for pkg in info.resolved_packages:
            all_resolved.add(pkg)

    return all_impacted, all_resolved
177
+
178
+
179
def _bulk_insert_packages(
        impacted: Set[Package],
        resolved: Set[Package]
) -> Tuple[Mapping[str, models.Package], Mapping[str, models.Package]]:
    """
    Insert all given packages with a single bulk insert and return the created models keyed by package URL.

    A package that is impacted by one vulnerability and resolves another appears in both returned mappings, so
    callers can look it up in either role.

    NOTE(review): No check for already existing packages is made here; presumably the data source only yields
    packages that are new to the database - confirm.
    NOTE(review): Whether bulk_create() populates primary keys on the returned models depends on the database
    backend - confirm for the backends in use.

    :param impacted: The impacted packages of a batch.
    :param resolved: The resolved packages of a batch.
    :return: A pair of mappings (impacted, resolved) from package URL to the Package model.
    """
    pkg_models = [_to_package_model(p) for p in impacted.union(resolved)]
    pkg_models = models.Package.objects.bulk_create(pkg_models)

    impacted_purls = {p.package_url for p in impacted}
    resolved_purls = {p.package_url for p in resolved}

    impacted_models, resolved_models = {}, {}

    for pkg_model in pkg_models:
        purl = pkg_model.package_url

        # Two independent checks: the same package may be present in both roles.
        if purl in impacted_purls:
            impacted_models[purl] = pkg_model
        if purl in resolved_purls:
            resolved_models[purl] = pkg_model

    return impacted_models, resolved_models
201
+
202
+
203
def _to_package_model(pkg: Package) -> models.Package:
    """
    Construct a models.Package from the fields of a data source Package.

    Only the object is constructed here; this function makes no call that stores it.
    """
    copied_fields = ('name', 'type', 'version', 'namespace', 'qualifiers', 'subpath')
    return models.Package(**{field: getattr(pkg, field) for field in copied_fields})
0 commit comments