From 5ca30b14c5bcef8023663e9cec5d9a11361eb2c5 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 4 Aug 2022 13:58:20 -0700 Subject: [PATCH 1/3] Update DatafileHandler default methods * Yield Packages and Dependencies before associating Packages to Resources Signed-off-by: Jono Yang --- src/packagedcode/models.py | 39 +++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/src/packagedcode/models.py b/src/packagedcode/models.py index 8c8b1d4631c..dc4dc8b810e 100644 --- a/src/packagedcode/models.py +++ b/src/packagedcode/models.py @@ -938,14 +938,14 @@ def assemble(cls, package_data, resource, codebase, package_adder=add_to_package if not package.license_expression: package.license_expression = cls.compute_normalized_license(package) + yield package + cls.assign_package_to_resources( package=package, resource=resource, codebase=codebase, package_adder=package_adder, ) - - yield package else: # we have no package, so deps are not for a specific package uid package_uid = None @@ -1047,6 +1047,12 @@ def assemble_from_many(cls, pkgdata_resources, codebase, package_adder=add_to_pa # process each package in sequence. The first item creates a package and # the other only update + # We are saving the Packages, Dependencies, and Resources in lists until + # after we go through `pkgdata_resources` for all Package data, then we + # yield Packages, then Dependencies, then Resources. + dependencies = [] + resources = [] + resources_from_package = [] for package_data, resource in pkgdata_resources: if not base_resource: base_resource = resource @@ -1059,8 +1065,6 @@ def assemble_from_many(cls, pkgdata_resources, codebase, package_adder=add_to_pa datafile_path=resource.path, ) package_uid = package.package_uid - if package_uid: - package_adder(package_uid, resource, codebase) else: # FIXME: What is the package_data is NOT for the same package as package? # FIXME: What if the update did not do anything? 
(it does return True or False) @@ -1069,31 +1073,40 @@ def assemble_from_many(cls, pkgdata_resources, codebase, package_adder=add_to_pa package_data=package_data, datafile_path=resource.path, ) - if package_uid: - package_adder(package_uid, resource, codebase) + + if package_uid: + resources_from_package.append((package_uid, resource,)) # in all cases yield possible dependencies dependent_packages = package_data.dependencies if dependent_packages: - yield from Dependency.from_dependent_packages( + p_deps = Dependency.from_dependent_packages( dependent_packages=dependent_packages, datafile_path=resource.path, datasource_id=package_data.datasource_id, package_uid=package_uid, ) + dependencies.extend(list(p_deps)) # we yield this as we do not want this further processed - yield resource - - # the whole parent subtree of the base_resource is for this package - if package_uid: - for res in base_resource.walk(codebase): - package_adder(package_uid, res, codebase) + resources.append(resource) + # Yield Packages, Dependencies, and Resources if package: if not package.license_expression: package.license_expression = cls.compute_normalized_license(package) yield package + yield from dependencies + yield from resources + + # Associate Package to Resources once they have been yielded + for package_uid, resource in resources_from_package: + package_adder(package_uid, resource, codebase) + + # the whole parent subtree of the base_resource is for this package + if package_uid: + for res in base_resource.walk(codebase): + package_adder(package_uid, res, codebase) @classmethod def assemble_from_many_datafiles( From dc69bc6c868c14082d526ea46951e04b59be6433 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 11 Aug 2022 11:34:40 -0700 Subject: [PATCH 2/3] Update doc and comments Signed-off-by: Jono Yang --- CHANGELOG.rst | 7 +++++++ src/packagedcode/models.py | 13 +++++++++++++ 2 files changed, 20 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 18500e422da..15a4fe64f07 
100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -60,6 +60,13 @@ Important API changes: under the ``venv`` subdirectory. You mus be aware of this if you use ScanCode from a git clone +- ``DatafileHandler.assemble()``, ``DatafileHandler.assemble_from_many()``, and + the other ``.assemble()`` methods from the other Package handlers from + packagedcode, have been updated to yield Package items before Dependency or + Resource items. This is particularly important in the case where we are calling + the ``assemble()`` method outside of the scancode-toolkit context, where we + need to ensure that a Package exists before we associate a Resource or + Dependency to it. Copyright detection: ~~~~~~~~~~~~~~~~~~~~ diff --git a/src/packagedcode/models.py b/src/packagedcode/models.py index dc4dc8b810e..f63e109a6c2 100644 --- a/src/packagedcode/models.py +++ b/src/packagedcode/models.py @@ -914,6 +914,12 @@ def assemble(cls, package_data, resource, codebase, package_adder=add_to_package not be further processed, - a Dependency to add to top-level dependencies + Package items must be yielded before Dependency or Resource items. This + is to ensure that a Package is created before we associate a Resource or + Dependency to a Package. This is particularly important in the case where + we are calling the `assemble()` method outside of the scancode-toolkit + context. + The approach is to find and process all the neighboring related datafiles to this datafile at once. @@ -1038,6 +1044,13 @@ def assemble_from_many(cls, pkgdata_resources, codebase, package_adder=add_to_pa This is a convenience method that subclasses can reuse when overriding `assemble()` + Like in ``DatafileHandler.assemble()``, Package items must be yielded + before Dependency or Resource items. This is to ensure that a Package is + created before we associate a Resource or Dependency to a Package. 
This + is particularly important in the case where we are calling the + ``assemble()`` method outside of the scancode-toolkit context, as + ``assemble()`` can call ``assemble_from_many()``. + NOTE: ATTENTION!: this may not work well for datafile that yield multiple PackageData for unrelated Packages """ From 78547bee5c9f955a850ab75315e332c7eae56d1f Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 11 Aug 2022 12:47:57 -0700 Subject: [PATCH 3/3] Update assemble methods * Update assemble methods for alpine and debian to yield Packages first before other items Signed-off-by: Jono Yang --- src/packagedcode/alpine.py | 19 +++++++++---------- src/packagedcode/debian.py | 28 ++++++++++++++++++---------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/src/packagedcode/alpine.py b/src/packagedcode/alpine.py index 9dca74f3b9e..b7238556a18 100644 --- a/src/packagedcode/alpine.py +++ b/src/packagedcode/alpine.py @@ -81,16 +81,6 @@ def assemble(cls, package_data, resource, codebase, package_adder): package.license_expression = cls.compute_normalized_license(package) - - dependent_packages = package_data.dependencies - if dependent_packages: - yield from models.Dependency.from_dependent_packages( - dependent_packages=dependent_packages, - datafile_path=resource.path, - datasource_id=package_data.datasource_id, - package_uid=package_uid, - ) - root_path = Path(root_resource.path) # a file ref extends from the root of the filesystem file_references_by_path = { @@ -118,6 +108,15 @@ def assemble(cls, package_data, resource, codebase, package_adder): yield package yield from resources + dependent_packages = package_data.dependencies + if dependent_packages: + yield from models.Dependency.from_dependent_packages( + dependent_packages=dependent_packages, + datafile_path=resource.path, + datasource_id=package_data.datasource_id, + package_uid=package_uid, + ) + class AlpineApkbuildHandler(models.DatafileHandler): datasource_id = 'alpine_apkbuild' diff --git 
a/src/packagedcode/debian.py b/src/packagedcode/debian.py index 8a65bb72b2c..d8959c52acd 100644 --- a/src/packagedcode/debian.py +++ b/src/packagedcode/debian.py @@ -241,14 +241,18 @@ def assemble(cls, package_data, resource, codebase, package_adder): package_file_references.extend(package_data.file_references) package_uid = package.package_uid + dependencies = [] dependent_packages = package_data.dependencies if dependent_packages: - yield from models.Dependency.from_dependent_packages( - dependent_packages=dependent_packages, - datafile_path=resource.path, - datasource_id=package_data.datasource_id, - package_uid=package_uid, + deps = list( + models.Dependency.from_dependent_packages( + dependent_packages=dependent_packages, + datafile_path=resource.path, + datasource_id=package_data.datasource_id, + package_uid=package_uid, + ) ) + dependencies.extend(deps) # Multi-Arch can be: "foreign", "same", "allowed", "all", "optional" or # empty/non-present. See https://wiki.debian.org/Multiarch/HOWTO @@ -312,12 +316,15 @@ def assemble(cls, package_data, resource, codebase, package_adder): # yield possible dependencies dependent_packages = package_data.dependencies if dependent_packages: - yield from models.Dependency.from_dependent_packages( - dependent_packages=dependent_packages, - datafile_path=res.path, - datasource_id=package_data.datasource_id, - package_uid=package_uid, + deps = list( + models.Dependency.from_dependent_packages( + dependent_packages=dependent_packages, + datafile_path=res.path, + datasource_id=package_data.datasource_id, + package_uid=package_uid, + ) ) + dependencies.extend(deps) resources.append(res) @@ -353,6 +360,7 @@ def assemble(cls, package_data, resource, codebase, package_adder): yield package yield from resources + yield from dependencies class DebianDistrolessInstalledDatabaseHandler(models.DatafileHandler):