Skip to content

Commit a69c563

Browse files
JonoYangtdruez
andauthored
Implement package assembly in scancode.io #444 (#485)
* Implement package assembly in scancode.io #447 Signed-off-by: Jono Yang <[email protected]> * Minor formatting changes for consistency #447 Signed-off-by: Thomas Druez <[email protected]> * Create DiscoveredPackages before other models #447 Signed-off-by: Jono Yang <[email protected]> * Revert "Create DiscoveredPackages before other models #447" This reverts commit c9b8bed. Sorting Packages, Dependencies, and Resources from DatafileHandler.assemble() will never work. The code needs to be changed in scancode-toolkit. Signed-off-by: Jono Yang <[email protected]> * Update migration #444 Signed-off-by: Jono Yang <[email protected]> * Return package_uids in for_packages #444 * This is so we are consistent with scancode-toolkit JSON output * Update expected test results Signed-off-by: Jono Yang <[email protected]> * Add test for assemble_packages #444 Signed-off-by: Jono Yang <[email protected]> * Update has_package_data filter logic #444 Signed-off-by: Jono Yang <[email protected]> * Create directory Resources in docker pipeline #485 * Update test expectations Signed-off-by: Jono Yang <[email protected]> * Bump scancode-toolkit and commoncode #485 Signed-off-by: Jono Yang <[email protected]> * Add test for pypi wheel #485 Signed-off-by: Jono Yang <[email protected]> Signed-off-by: Jono Yang <[email protected]> Signed-off-by: Thomas Druez <[email protected]> Co-authored-by: Thomas Druez <[email protected]>
1 parent 9bdd94f commit a69c563

27 files changed

+16301
-4751
lines changed

CHANGELOG.rst

+16
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,22 @@ v31.0.0 (next)
9393
"PACKAGES", "RESOURCES", "DEPENDENCIES", and "ERRORS" names.
9494
https://github.com/nexB/scancode.io/issues/511
9595

96+
- Update application Package scanning step to reflect the updates in
97+
scancode-toolkit package scanning.
98+
99+
- Package data detected from a file are now stored on the
100+
CodebaseResource.package_data field.
101+
- A second processing step is now done after scanning for Package data, where
102+
Package Resources are determined and DiscoveredPackages are created.
103+
104+
https://github.com/nexB/scancode.io/issues/444
105+
106+
- ``CodebaseResource.for_packages`` now returns a list of
107+
``DiscoveredPackage.package_uid`` or ``DiscoveredPackage.package_url`` if
108+
``DiscoveredPackage.package_uid`` is not present. This is done to reflect
109+
how scancode-toolkit's JSON output returns ``package_uid``s in the
110+
``for_packages`` field for Resources.
111+
96112
v30.2.0 (2021-12-17)
97113
--------------------
98114

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Generated by Django 4.0.6 on 2022-08-09 18:18
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Add the ``package_data`` JSON field to the ``codebaseresource`` model."""

    # Must run after the migration that altered ``CodebaseResource.name``.
    dependencies = [("scanpipe", "0020_alter_codebaseresource_name")]

    operations = [
        migrations.AddField(
            model_name="codebaseresource",
            name="package_data",
            field=models.JSONField(
                blank=True,
                default=list,
                help_text="List of Package data detected from this CodebaseResource",
            ),
        ),
    ]

scanpipe/models.py

+66-2
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
import django_rq
5959
import redis
6060
import requests
61+
from commoncode.fileutils import parent_directory
6162
from commoncode.hash import multi_checksums
6263
from packageurl import PackageURL
6364
from packageurl import normalize_qualifiers
@@ -883,6 +884,14 @@ def error_count(self):
883884
"""
884885
return self.projecterrors.count()
885886

887+
@cached_property
def has_single_resource(self):
    """
    Return True when exactly one CodebaseResource is associated to this
    project, False otherwise.
    """
    resource_count = self.codebaseresources.count()
    return resource_count == 1
894+
886895

887896
class ProjectRelatedQuerySet(models.QuerySet):
888897
def project(self, project):
@@ -1279,6 +1288,9 @@ def has_licenses(self):
12791288
def has_no_licenses(self):
12801289
return self.filter(licenses=[])
12811290

1291+
def has_package_data(self):
    """
    Return the resources of this QuerySet whose `package_data` is not an
    empty list.
    """
    empty_package_data = Q(package_data=[])
    return self.filter(~empty_package_data)
1293+
12821294
def licenses_categories(self, categories):
12831295
return self.json_list_contains(
12841296
field_name="licenses",
@@ -1496,6 +1508,11 @@ class Compliance(models.TextChoices):
14961508
"provided policies."
14971509
),
14981510
)
1511+
package_data = models.JSONField(
1512+
default=list,
1513+
blank=True,
1514+
help_text=_("List of Package data detected from this CodebaseResource"),
1515+
)
14991516

15001517
objects = CodebaseResourceQuerySet.as_manager()
15011518

@@ -1519,11 +1536,14 @@ def from_db(cls, db, field_names, values):
15191536

15201537
return new
15211538

1522-
def save(self, *args, **kwargs):
1539+
def save(self, codebase=None, *args, **kwargs):
15231540
"""
15241541
Saves the current resource instance.
15251542
Injects policies—if the feature is enabled—when the `licenses` field value is
15261543
changed.
1544+
1545+
`codebase` is not used in this context but required for compatibility
1546+
with the commoncode.resource.Codebase class API.
15271547
"""
15281548
if scanpipe_app.policies_enabled:
15291549
loaded_licenses = getattr(self, "loaded_licenses", [])
@@ -1613,6 +1633,47 @@ def unique_license_expressions(self):
16131633
"""
16141634
return sorted(set(self.license_expressions))
16151635

1636+
def parent_path(self):
    """
    Return the path of the parent directory of this CodebaseResource, as
    computed by `parent_directory` from this resource `path` without a
    trailing separator.
    """
    resource_path = self.path
    return parent_directory(resource_path, with_trail=False)
1641+
1642+
def has_parent(self):
    """
    Return True if a CodebaseResource exists in this resource's project at
    this resource's parent path, False otherwise.
    """
    path_of_parent = self.parent_path()
    if path_of_parent:
        candidates = self.project.codebaseresources.filter(path=path_of_parent)
        return candidates.exists()
    # An empty parent path means there is nothing to look up.
    return False
1653+
1654+
def parent(self, codebase=None):
    """
    Return the parent CodebaseResource object for this CodebaseResource, or
    None when this resource has no parent path or when no CodebaseResource
    exists at that path in the project.

    `codebase` is not used in this context but required for compatibility
    with the commoncode.resource.Codebase class API.
    """
    parent_path = self.parent_path()
    if not parent_path:
        # Normalize any falsy parent path (such as an empty string) to None,
        # as documented, rather than leaking the falsy value to callers.
        return None

    # `first()` returns None when no resource exists at the parent path,
    # where `get()` would raise DoesNotExist and contradict the "or None"
    # contract of this method.
    return self.project.codebaseresources.filter(path=parent_path).first()
1664+
1665+
def siblings(self, codebase=None):
    """
    Return the children of this Resource's parent, or an empty list when
    this Resource has no parent.

    `codebase` is not used in this context but required for compatibility
    with the commoncode.resource.Codebase class API.
    """
    if not self.has_parent():
        return []

    parent_resource = self.parent(codebase)
    return parent_resource.children(codebase)
1676+
16161677
def descendants(self):
16171678
"""
16181679
Returns a QuerySet of descendant CodebaseResource objects using a
@@ -1729,7 +1790,10 @@ def for_packages(self):
17291790
"""
17301791
Returns the list of all discovered packages associated to this resource.
17311792
"""
1732-
return [str(package) for package in self.discovered_packages.all()]
1793+
return [
1794+
package.package_uid if package.package_uid else str(package)
1795+
for package in self.discovered_packages.all()
1796+
]
17331797

17341798

17351799
class DiscoveredPackageQuerySet(PackageURLQuerySetMixin, ProjectRelatedQuerySet):

scanpipe/pipes/docker.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ def create_codebase_resources(project, image):
173173
for layer_index, layer in enumerate(image.layers, start=1):
174174
layer_tag = get_layer_tag(image.image_id, layer.layer_id, layer_index)
175175

176-
for resource in layer.get_resources():
176+
for resource in layer.get_resources(with_dir=True):
177177
pipes.make_codebase_resource(
178178
project=project,
179179
location=resource.location,

scanpipe/pipes/scancode.py

+71-7
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
from commoncode import fileutils
3737
from commoncode.resource import VirtualCodebase
3838
from extractcode import api as extractcode_api
39+
from packagedcode import get_package_handler
40+
from packagedcode import models as packagedcode_models
3941
from scancode import ScancodeError
4042
from scancode import Scanner
4143
from scancode import api as scancode_api
@@ -230,10 +232,9 @@ def save_scan_package_results(codebase_resource, scan_results, scan_errors):
230232
Saves the resource scan package results in the database.
231233
Creates project errors if any occurred during the scan.
232234
"""
233-
packages = scan_results.get("package_data", [])
234-
if packages:
235-
for package_data in packages:
236-
codebase_resource.create_and_add_package(package_data)
235+
package_data = scan_results.get("package_data", [])
236+
if package_data:
237+
codebase_resource.package_data = package_data
237238
codebase_resource.status = "application-package"
238239
codebase_resource.save()
239240

@@ -310,18 +311,81 @@ def scan_for_files(project):
310311

311312
def scan_for_application_packages(project):
    """
    Runs a package scan on files without a status for a `project`, then creates
    DiscoveredPackage instances from the detected package data.

    Multiprocessing is enabled by default on this pipe, the number of processes
    can be controlled through the SCANCODEIO_PROCESSES setting.
    """
    # First step: scan the resources without a status and save the detected
    # Package data on the CodebaseResource it was detected from.
    unscanned_resources = project.codebaseresources.no_status()
    _scan_and_save(
        resource_qs=unscanned_resources,
        scan_func=scan_for_package_data,
        save_func=save_scan_package_results,
    )

    # Second step: go through the CodebaseResources with Package data and
    # handle them using the proper Package handler from packagedcode.
    assemble_packages(project=project)
333+
334+
335+
def add_to_package(package_uid, resource, project):
    """
    Relate the DiscoveredPackage of `project` identified by `package_uid` to
    `resource`, unless `package_uid` is empty or the relation already exists.
    """
    if not package_uid:
        return

    already_related = resource.discovered_packages.filter(
        package_uid=package_uid
    ).exists()
    if already_related:
        return

    discovered_package = project.discoveredpackages.get(package_uid=package_uid)
    resource.discovered_packages.add(discovered_package)
346+
347+
348+
def assemble_packages(project):
    """
    Create or update DiscoveredPackage instances for `project` from the parsed
    package data stored on the CodebaseResources of `project`.

    Each package data mapping is handed to its packagedcode handler, and the
    items yielded by the handler `assemble` call are processed by type:
    Package items are saved, Dependency items are currently ignored, and
    CodebaseResource items are recorded so they are not processed again.
    """
    logger.info(f"Project {project} assemble_packages:")
    processed_paths = set()

    resources_with_package_data = project.codebaseresources.has_package_data()
    for resource in resources_with_package_data:
        # Skip resources already yielded by a previous handler assembly.
        if resource.path in processed_paths:
            continue

        logger.info(f" Processing: {resource.path}")
        for mapping in resource.package_data:
            parsed_data = packagedcode_models.PackageData.from_dict(mapping=mapping)
            logger.info(f" Package data: {parsed_data.purl}")

            handler = get_package_handler(parsed_data)
            logger.info(f" Selected package handler: {handler.__name__}")

            assembled_items = handler.assemble(
                package_data=parsed_data,
                resource=resource,
                codebase=project,
                package_adder=add_to_package,
            )

            for item in assembled_items:
                logger.info(f" Processing item: {item}")
                if isinstance(item, packagedcode_models.Package):
                    pipes.update_or_create_package(project, item.to_dict())
                elif isinstance(item, packagedcode_models.Dependency):
                    # We will handle Dependencies when we properly implement
                    # the DiscoveredDependency model
                    continue
                elif isinstance(item, CodebaseResource):
                    processed_paths.add(item.path)
                else:
                    logger.info(f"Unknown Package assembly item type: {item!r}")
388+
325389

326390
def run_scancode(location, output_file, options, raise_on_error=False):
327391
"""

0 commit comments

Comments
 (0)