Skip to content

Update DatafileHandler default methods #3042

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Aug 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,13 @@ Important API changes:
under the ``venv`` subdirectory. You must be aware of this if you use ScanCode
from a git clone

- ``DatafileHandler.assemble()``, ``DatafileHandler.assemble_from_many()``, and
the other ``.assemble()`` methods from the other Package handlers from
packagedcode, have been updated to yield Package items before Dependency or
Resource items. This is particularly important in the case where we are calling
the ``assemble()`` method outside of the scancode-toolkit context, where we
need to ensure that a Package exists before we associate a Resource or
Dependency to it.

Copyright detection:
~~~~~~~~~~~~~~~~~~~~
Expand Down
19 changes: 9 additions & 10 deletions src/packagedcode/alpine.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,6 @@ def assemble(cls, package_data, resource, codebase, package_adder):

package.license_expression = cls.compute_normalized_license(package)


dependent_packages = package_data.dependencies
if dependent_packages:
yield from models.Dependency.from_dependent_packages(
dependent_packages=dependent_packages,
datafile_path=resource.path,
datasource_id=package_data.datasource_id,
package_uid=package_uid,
)

root_path = Path(root_resource.path)
# a file ref extends from the root of the filesystem
file_references_by_path = {
Expand Down Expand Up @@ -118,6 +108,15 @@ def assemble(cls, package_data, resource, codebase, package_adder):
yield package
yield from resources

dependent_packages = package_data.dependencies
if dependent_packages:
yield from models.Dependency.from_dependent_packages(
dependent_packages=dependent_packages,
datafile_path=resource.path,
datasource_id=package_data.datasource_id,
package_uid=package_uid,
)


class AlpineApkbuildHandler(models.DatafileHandler):
datasource_id = 'alpine_apkbuild'
Expand Down
28 changes: 18 additions & 10 deletions src/packagedcode/debian.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,14 +241,18 @@ def assemble(cls, package_data, resource, codebase, package_adder):
package_file_references.extend(package_data.file_references)
package_uid = package.package_uid

dependencies = []
dependent_packages = package_data.dependencies
if dependent_packages:
yield from models.Dependency.from_dependent_packages(
dependent_packages=dependent_packages,
datafile_path=resource.path,
datasource_id=package_data.datasource_id,
package_uid=package_uid,
deps = list(
models.Dependency.from_dependent_packages(
dependent_packages=dependent_packages,
datafile_path=resource.path,
datasource_id=package_data.datasource_id,
package_uid=package_uid,
)
)
dependencies.extend(deps)

# Multi-Arch can be: "foreign", "same", "allowed", "all", "optional" or
# empty/non-present. See https://wiki.debian.org/Multiarch/HOWTO
Expand Down Expand Up @@ -312,12 +316,15 @@ def assemble(cls, package_data, resource, codebase, package_adder):
# yield possible dependencies
dependent_packages = package_data.dependencies
if dependent_packages:
yield from models.Dependency.from_dependent_packages(
dependent_packages=dependent_packages,
datafile_path=res.path,
datasource_id=package_data.datasource_id,
package_uid=package_uid,
deps = list(
models.Dependency.from_dependent_packages(
dependent_packages=dependent_packages,
datafile_path=res.path,
datasource_id=package_data.datasource_id,
package_uid=package_uid,
)
)
dependencies.extend(deps)

resources.append(res)

Expand Down Expand Up @@ -353,6 +360,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):

yield package
yield from resources
yield from dependencies


class DebianDistrolessInstalledDatabaseHandler(models.DatafileHandler):
Expand Down
52 changes: 39 additions & 13 deletions src/packagedcode/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,12 @@ def assemble(cls, package_data, resource, codebase, package_adder=add_to_package
not be further processed,
- a Dependency to add to top-level dependencies

Package items must be yielded before Dependency or Resource items. This
is to ensure that a Package is created before we associate a Resource or
Dependency to a Package. This is particularly important in the case where
we are calling the `assemble()` method outside of the scancode-toolkit
context.

The approach is to find and process all the neighboring related datafiles
to this datafile at once.

Expand All @@ -938,14 +944,14 @@ def assemble(cls, package_data, resource, codebase, package_adder=add_to_package
if not package.license_expression:
package.license_expression = cls.compute_normalized_license(package)

yield package

cls.assign_package_to_resources(
package=package,
resource=resource,
codebase=codebase,
package_adder=package_adder,
)

yield package
else:
# we have no package, so deps are not for a specific package uid
package_uid = None
Expand Down Expand Up @@ -1038,6 +1044,13 @@ def assemble_from_many(cls, pkgdata_resources, codebase, package_adder=add_to_pa
This is a convenience method that subclasses can reuse when overriding
`assemble()`

Like in ``DatafileHandler.assemble()``, Package items must be yielded
before Dependency or Resource items. This is to ensure that a Package is
created before we associate a Resource or Dependency to a Package. This
is particularly important in the case where we are calling the
``assemble()`` method outside of the scancode-toolkit context, as
``assemble()`` can call ``assemble_from_many()``.

NOTE: ATTENTION!: this may not work well for datafiles that yield
multiple PackageData for unrelated Packages
"""
Expand All @@ -1047,6 +1060,12 @@ def assemble_from_many(cls, pkgdata_resources, codebase, package_adder=add_to_pa

# process each package in sequence. The first item creates a package and
# the other only update
# We are saving the Packages, Dependencies, and Resources in lists until
# after we go through `pkgdata_resources` for all Package data, then we
# yield Packages, then Dependencies, then Resources.
dependencies = []
resources = []
resources_from_package = []
for package_data, resource in pkgdata_resources:
if not base_resource:
base_resource = resource
Expand All @@ -1059,8 +1078,6 @@ def assemble_from_many(cls, pkgdata_resources, codebase, package_adder=add_to_pa
datafile_path=resource.path,
)
package_uid = package.package_uid
if package_uid:
package_adder(package_uid, resource, codebase)
else:
# FIXME: What if the package_data is NOT for the same package as package?
# FIXME: What if the update did not do anything? (it does return True or False)
Expand All @@ -1069,31 +1086,40 @@ def assemble_from_many(cls, pkgdata_resources, codebase, package_adder=add_to_pa
package_data=package_data,
datafile_path=resource.path,
)
if package_uid:
package_adder(package_uid, resource, codebase)

if package_uid:
resources_from_package.append((package_uid, resource,))

# in all cases yield possible dependencies
dependent_packages = package_data.dependencies
if dependent_packages:
yield from Dependency.from_dependent_packages(
p_deps = Dependency.from_dependent_packages(
dependent_packages=dependent_packages,
datafile_path=resource.path,
datasource_id=package_data.datasource_id,
package_uid=package_uid,
)
dependencies.extend(list(p_deps))

# we yield this as we do not want this further processed
yield resource

# the whole parent subtree of the base_resource is for this package
if package_uid:
for res in base_resource.walk(codebase):
package_adder(package_uid, res, codebase)
resources.append(resource)

# Yield Packages, Dependencies, and Resources
if package:
if not package.license_expression:
package.license_expression = cls.compute_normalized_license(package)
yield package
yield from dependencies
yield from resources

# Associate Package to Resources once they have been yielded
for package_uid, resource in resources_from_package:
package_adder(package_uid, resource, codebase)

# the whole parent subtree of the base_resource is for this package
if package_uid:
for res in base_resource.walk(codebase):
package_adder(package_uid, res, codebase)

@classmethod
def assemble_from_many_datafiles(
Expand Down