Skip to content

Implement package assembly in scancode.io #485

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Aug 25, 2022
16 changes: 16 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,22 @@ v31.0.0 (next)
"PACKAGES", "RESOURCES", "DEPENDENCIES", and "ERRORS" names.
https://github.com/nexB/scancode.io/issues/511

- Update application Package scanning step to reflect the updates in
scancode-toolkit package scanning.

- Package data detected from a file are now stored on the
CodebaseResource.package_data field.
- A second processing step is now done after scanning for Package data, where
Package Resources are determined and DiscoveredPackages are created.

https://github.com/nexB/scancode.io/issues/444

- ``CodebaseResource.for_packages`` now returns a list of
``DiscoveredPackage.package_uid`` or ``DiscoveredPackage.package_url`` if
``DiscoveredPackage.package_uid`` is not present. This is done to reflect
how scancode-toolkit's JSON output returns ``package_uid``s in the
``for_packages`` field for Resources.

v30.2.0 (2021-12-17)
--------------------

Expand Down
18 changes: 18 additions & 0 deletions scanpipe/migrations/0021_codebaseresource_package_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 4.0.6 on 2022-08-09 18:18

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the ``package_data`` JSON field to the ``codebaseresource`` model."""

    # Must be applied after the migration that altered CodebaseResource.name.
    dependencies = [
        ("scanpipe", "0020_alter_codebaseresource_name"),
    ]

    operations = [
        migrations.AddField(
            model_name="codebaseresource",
            name="package_data",
            field=models.JSONField(
                blank=True,
                default=list,
                help_text="List of Package data detected from this CodebaseResource",
            ),
        ),
    ]
68 changes: 66 additions & 2 deletions scanpipe/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
import django_rq
import redis
import requests
from commoncode.fileutils import parent_directory
from commoncode.hash import multi_checksums
from packageurl import PackageURL
from packageurl import normalize_qualifiers
Expand Down Expand Up @@ -883,6 +884,14 @@ def error_count(self):
"""
return self.projecterrors.count()

@cached_property
def has_single_resource(self):
    """
    Tell whether exactly one CodebaseResource is associated to this project.

    Return True when the related `codebaseresources` count equals 1, False
    otherwise. As a cached property, the value is computed once per instance.
    """
    resource_count = self.codebaseresources.count()
    return resource_count == 1


class ProjectRelatedQuerySet(models.QuerySet):
def project(self, project):
Expand Down Expand Up @@ -1279,6 +1288,9 @@ def has_licenses(self):
def has_no_licenses(self):
return self.filter(licenses=[])

def has_package_data(self):
    """
    Return the resources whose `package_data` is not an empty list.
    """
    empty_package_data = Q(package_data=[])
    return self.filter(~empty_package_data)

def licenses_categories(self, categories):
return self.json_list_contains(
field_name="licenses",
Expand Down Expand Up @@ -1496,6 +1508,11 @@ class Compliance(models.TextChoices):
"provided policies."
),
)
# Package data detected on this resource, stored as a JSON list.
# Defaults to an empty list and may be left blank.
package_data = models.JSONField(
default=list,
blank=True,
help_text=_("List of Package data detected from this CodebaseResource"),
)

objects = CodebaseResourceQuerySet.as_manager()

Expand All @@ -1519,11 +1536,14 @@ def from_db(cls, db, field_names, values):

return new

def save(self, *args, **kwargs):
def save(self, codebase=None, *args, **kwargs):
"""
Saves the current resource instance.
Injects policies—if the feature is enabled—when the `licenses` field value is
changed.

`codebase` is not used in this context but required for compatibility
with the commoncode.resource.Codebase class API.
"""
if scanpipe_app.policies_enabled:
loaded_licenses = getattr(self, "loaded_licenses", [])
Expand Down Expand Up @@ -1613,6 +1633,47 @@ def unique_license_expressions(self):
"""
return sorted(set(self.license_expressions))

def parent_path(self):
    """
    Return the parent path of this CodebaseResource.

    The value is computed from `self.path` by commoncode's
    `parent_directory` called with `with_trail=False`.
    """
    containing_directory = parent_directory(self.path, with_trail=False)
    return containing_directory

def has_parent(self):
    """
    Tell whether a parent CodebaseResource exists for this CodebaseResource.

    Return False when there is no parent path. Otherwise return whether a
    resource with that path exists among the resources of the same project.
    """
    parent_path = self.parent_path()
    if not parent_path:
        return False

    candidates = self.project.codebaseresources.filter(path=parent_path)
    return candidates.exists()

def parent(self, codebase=None):
    """
    Return the parent CodebaseResource object for this CodebaseResource or
    None.

    None is returned when this resource has no parent path. Otherwise the
    parent is fetched with `get(path=parent_path)` from the resources of the
    same project, so any exception raised by that lookup propagates to the
    caller.

    `codebase` is not used in this context but required for compatibility
    with the commoncode.resource.Codebase class API.
    """
    parent_path = self.parent_path()
    if not parent_path:
        # Return None explicitly rather than leaking a falsy parent path
        # (such as an empty string) to the caller.
        return None
    return self.project.codebaseresources.get(path=parent_path)

def siblings(self, codebase=None):
    """
    Return the children of this Resource's parent, or an empty list when
    this Resource has no parent.

    `codebase` is not used in this context but required for compatibility
    with the commoncode.resource.Codebase class API; it is passed through to
    the `parent` and `children` calls.
    """
    if not self.has_parent():
        return []

    parent_resource = self.parent(codebase)
    return parent_resource.children(codebase)

def descendants(self):
"""
Returns a QuerySet of descendant CodebaseResource objects using a
Expand Down Expand Up @@ -1729,7 +1790,10 @@ def for_packages(self):
"""
Returns the list of all discovered packages associated to this resource.
"""
return [str(package) for package in self.discovered_packages.all()]
return [
package.package_uid if package.package_uid else str(package)
for package in self.discovered_packages.all()
]


class DiscoveredPackageQuerySet(PackageURLQuerySetMixin, ProjectRelatedQuerySet):
Expand Down
2 changes: 1 addition & 1 deletion scanpipe/pipes/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def create_codebase_resources(project, image):
for layer_index, layer in enumerate(image.layers, start=1):
layer_tag = get_layer_tag(image.image_id, layer.layer_id, layer_index)

for resource in layer.get_resources():
for resource in layer.get_resources(with_dir=True):
pipes.make_codebase_resource(
project=project,
location=resource.location,
Expand Down
78 changes: 71 additions & 7 deletions scanpipe/pipes/scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
from commoncode import fileutils
from commoncode.resource import VirtualCodebase
from extractcode import api as extractcode_api
from packagedcode import get_package_handler
from packagedcode import models as packagedcode_models
from scancode import ScancodeError
from scancode import Scanner
from scancode import api as scancode_api
Expand Down Expand Up @@ -230,10 +232,9 @@ def save_scan_package_results(codebase_resource, scan_results, scan_errors):
Saves the resource scan package results in the database.
Creates project errors if any occurred during the scan.
"""
packages = scan_results.get("package_data", [])
if packages:
for package_data in packages:
codebase_resource.create_and_add_package(package_data)
package_data = scan_results.get("package_data", [])
if package_data:
codebase_resource.package_data = package_data
codebase_resource.status = "application-package"
codebase_resource.save()

Expand Down Expand Up @@ -310,18 +311,81 @@ def scan_for_files(project):

def scan_for_application_packages(project):
    """
    Run a package scan on the files without a status for `project`, then
    create DiscoveredPackage instances from the detected package data.

    Multiprocessing is enabled by default on this pipe, the number of processes
    can be controlled through the SCANCODEIO_PROCESSES setting.
    """
    # First pass: scan each resource without a status and store the detected
    # Package data on the CodebaseResource it came from.
    _scan_and_save(
        resource_qs=project.codebaseresources.no_status(),
        scan_func=scan_for_package_data,
        save_func=save_scan_package_results,
    )

    # Second pass: hand the stored Package data over to the matching
    # packagedcode handlers to assemble the packages.
    assemble_packages(project=project)


def add_to_package(package_uid, resource, project):
    """
    Attach the DiscoveredPackage of `project` identified by `package_uid` to
    `resource`.

    Nothing is done when `package_uid` is empty or when `resource` is already
    related to a package with this `package_uid`.
    """
    if not package_uid:
        return

    already_linked = resource.discovered_packages.filter(package_uid=package_uid)
    if already_linked.exists():
        return

    discovered_package = project.discoveredpackages.get(package_uid=package_uid)
    resource.discovered_packages.add(discovered_package)


def assemble_packages(project):
    """
    Assemble packages for `project` from the parsed package data present in
    the CodebaseResources of `project`.

    Each resource returned by `has_package_data()` is processed, unless its
    path was already recorded while assembling an earlier resource. Every
    package data mapping is loaded as a packagedcode `PackageData`, its handler
    is looked up, and the items returned by `handler.assemble()` are
    dispatched by type:

    - `Package`: created or updated with `pipes.update_or_create_package`.
    - `Dependency`: ignored for now.
    - `CodebaseResource`: its path is recorded so it is not processed again.
    - any other type: reported in the log.
    """
    logger.info(f"Project {project} assemble_packages:")
    handled_paths = set()

    for codebase_resource in project.codebaseresources.has_package_data():
        if codebase_resource.path in handled_paths:
            continue

        logger.info(f" Processing: {codebase_resource.path}")
        for mapping in codebase_resource.package_data:
            parsed = packagedcode_models.PackageData.from_dict(mapping=mapping)
            logger.info(f" Package data: {parsed.purl}")

            package_handler = get_package_handler(parsed)
            logger.info(f" Selected package handler: {package_handler.__name__}")

            # The project stands in for the codebase, and `add_to_package` is
            # the callback the handler uses to relate resources to packages.
            assembled_items = package_handler.assemble(
                package_data=parsed,
                resource=codebase_resource,
                codebase=project,
                package_adder=add_to_package,
            )

            for assembled in assembled_items:
                logger.info(f" Processing item: {assembled}")

                if isinstance(assembled, packagedcode_models.Package):
                    pipes.update_or_create_package(project, assembled.to_dict())
                    continue

                if isinstance(assembled, packagedcode_models.Dependency):
                    # We will handle Dependencies when we properly implement
                    # the DiscoveredDependency model
                    continue

                if isinstance(assembled, CodebaseResource):
                    handled_paths.add(assembled.path)
                    continue

                logger.info(f"Unknown Package assembly item type: {assembled!r}")


def run_scancode(location, output_file, options, raise_on_error=False):
"""
Expand Down
Loading