Skip to content

Commit 8963d7f

Browse files
committed
Update scan_for_application_packages #436 #444 #447
* Update scan_for_application_packages to save detected Package data to the CodebaseResource it is from, then iterate through the CodebaseResources with Package data and use the proper Package handler to process the Package data * Create DiscoveredDependency model * Add package_data JSON field to CodebaseResource Signed-off-by: Jono Yang <[email protected]>
1 parent 6163473 commit 8963d7f

File tree

4 files changed

+297
-5
lines changed

4 files changed

+297
-5
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Generated by Django 4.0.6 on 2022-07-20 23:58
2+
3+
from django.db import migrations, models
4+
import django.db.models.deletion
5+
import scanpipe.models
6+
7+
8+
class Migration(migrations.Migration):
    """
    Schema migration that adds the ``package_data`` JSON field to the
    ``codebaseresource`` model and creates the ``DiscoveredDependency`` model.
    """

    # Must be applied on top of the migration that introduced the resource tag.
    dependencies = [
        ("scanpipe", "0018_codebaseresource_tag"),
    ]

    operations = [
        # Storage for the raw package data detected on a CodebaseResource.
        migrations.AddField(
            model_name="codebaseresource",
            name="package_data",
            field=models.JSONField(
                blank=True,
                default=dict,
                help_text="List of Package data detected from this CodebaseResource",
            ),
        ),
        # Table holding the dependencies discovered for a project.
        migrations.CreateModel(
            name="DiscoveredDependency",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "purl",
                    models.CharField(
                        help_text="The Package URL of this dependency.",
                        max_length=1024,
                    ),
                ),
                (
                    "extracted_requirement",
                    models.CharField(
                        help_text="The version requirements of this dependency.",
                        max_length=32,
                    ),
                ),
                (
                    "scope",
                    models.CharField(
                        help_text="The scope of this dependency, how it is used in a project.",
                        max_length=32,
                    ),
                ),
                ("is_runtime", models.BooleanField(default=False)),
                ("is_optional", models.BooleanField(default=False)),
                ("is_resolved", models.BooleanField(default=False)),
                (
                    "dependency_uid",
                    models.CharField(
                        help_text="The unique identifier of this dependency.",
                        max_length=1024,
                    ),
                ),
                (
                    "for_package_uid",
                    models.CharField(
                        help_text="The unique identifier of the package this dependency is for.",
                        max_length=1024,
                    ),
                ),
                (
                    "datafile_path",
                    models.CharField(
                        blank=True,
                        help_text="The relative path to the datafile where this dependency was detected from.",
                        max_length=1024,
                    ),
                ),
                (
                    "datasource_id",
                    models.CharField(
                        help_text="The identifier for the datafile handler used to obtain this dependency.",
                        max_length=64,
                    ),
                ),
                (
                    "project",
                    models.ForeignKey(
                        editable=False,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="%(class)ss",
                        to="scanpipe.project",
                    ),
                ),
            ],
            options={
                "abstract": False,
            },
            bases=(models.Model, scanpipe.models.SaveProjectErrorMixin),
        ),
    ]

scanpipe/models.py

+154
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
import django_rq
5555
import redis
5656
import requests
57+
from commoncode.fileutils import parent_directory
5758
from commoncode.hash import multi_checksums
5859
from packageurl import PackageURL
5960
from packageurl import normalize_qualifiers
@@ -1466,6 +1467,12 @@ class Compliance(models.TextChoices):
14661467
),
14671468
)
14681469

1470+
package_data = models.JSONField(
1471+
default=dict,
1472+
blank=True,
1473+
help_text=_("List of Package data detected from this CodebaseResource"),
1474+
)
1475+
14691476
objects = CodebaseResourceQuerySet.as_manager()
14701477

14711478
class Meta:
@@ -1582,6 +1589,56 @@ def unique_license_expressions(self):
15821589
"""
15831590
return sorted(set(self.license_expressions))
15841591

1592+
def parent_path(self):
    """
    Return the parent directory path of this CodebaseResource, computed from
    its ``path`` with ``parent_directory`` and without a trailing separator
    (``with_trail=False``).

    NOTE(review): callers such as ``has_parent`` treat a falsy return value
    as "no parent"; confirm what ``parent_directory`` returns for a
    top-level path.
    """
    own_path = self.path
    return parent_directory(own_path, with_trail=False)
1597+
1598+
def has_parent(self):
    """
    Tell whether a parent CodebaseResource exists for this CodebaseResource.

    Return False when this resource has no parent path, otherwise return
    whether the project holds a CodebaseResource stored at that parent path.
    """
    candidate_path = self.parent_path()
    if not candidate_path:
        return False

    # Existence check only: the parent row itself is not fetched here.
    project_resources = self.project.codebaseresources
    return project_resources.filter(path=candidate_path).exists()
1609+
1610+
def parent(self, codebase=None):
    """
    Return the parent CodebaseResource object for this CodebaseResource or
    None.

    None is returned both when this resource has no parent path and when no
    CodebaseResource is stored in the project at that parent path.

    `codebase` is not used in this context but required for compatibility
    with the commoncode.resource.Codebase class API.
    """
    parent_path = self.parent_path()
    if not parent_path:
        # Return an explicit None rather than leaking the falsy path value
        # (for instance an empty string) to the caller.
        return None

    # `first()` yields None for a missing row where `get()` would raise
    # DoesNotExist, which matches the documented "or None" contract.
    return self.project.codebaseresources.filter(path=parent_path).first()
1620+
1621+
def has_siblings(self, codebase=None):
    """
    Return True if this CodebaseResource has siblings.

    The answer is the result of ``has_children()`` on the parent resource,
    or the falsy ``has_parent()`` result when there is no parent.

    `codebase` is not used in this context but required for compatibility
    with the commoncode.resource.Codebase class API.
    """
    parent_exists = self.has_parent()
    if not parent_exists:
        return parent_exists

    container = self.parent(codebase)
    return container.has_children()
1629+
1630+
def siblings(self, codebase=None):
    """
    Return the children of this Resource's parent, or an empty list when
    this Resource has no parent.

    `codebase` is not used in this context but required for compatibility
    with the commoncode.resource.Codebase class API.
    """
    if not self.has_parent():
        return []

    container = self.parent(codebase)
    return container.children(codebase)
1641+
15851642
def descendants(self):
15861643
"""
15871644
Returns a QuerySet of descendant CodebaseResource objects using a
@@ -1847,6 +1904,103 @@ def update_from_data(self, package_data, override=False):
18471904
return updated_fields
18481905

18491906

1907+
class DiscoveredDependency(
    ProjectRelatedModel,
    SaveProjectErrorMixin,
):
    """
    A project's Discovered Dependencies are records of the dependencies used by
    system and application packages discovered in the code under analysis.
    """

    purl = models.CharField(
        max_length=1024,
        help_text=_(
            "The Package URL of this dependency."
        ),
    )
    extracted_requirement = models.CharField(
        max_length=32,
        help_text=_(
            "The version requirements of this dependency."
        ),
    )
    scope = models.CharField(
        max_length=32,
        help_text=_(
            "The scope of this dependency, how it is used in a project."
        ),
    )

    is_runtime = models.BooleanField(default=False)
    is_optional = models.BooleanField(default=False)
    is_resolved = models.BooleanField(default=False)

    dependency_uid = models.CharField(
        max_length=1024,
        help_text=_(
            "The unique identifier of this dependency."
        ),
    )
    for_package_uid = models.CharField(
        max_length=1024,
        help_text=_(
            "The unique identifier of the package this dependency is for."
        ),
    )
    datafile_path = models.CharField(
        max_length=1024,
        blank=True,
        help_text=_(
            "The relative path to the datafile where this dependency was detected from."
        ),
    )
    datasource_id = models.CharField(
        max_length=64,
        help_text=_(
            "The identifier for the datafile handler used to obtain this dependency."
        )
    )

    @classmethod
    def create_from_data(cls, project, dependency_data):
        """
        Create, save and return a DiscoveredDependency for a `project` from
        the `dependency_data` mapping.

        The "resolved_package" entry, when present, is ignored as it is not
        passed to the model constructor. The provided `dependency_data`
        mapping is left unmodified.
        """
        # Build a filtered copy instead of popping from the caller's mapping.
        cleaned_data = {
            field_name: value
            for field_name, value in dependency_data.items()
            if field_name != "resolved_package"
        }
        discovered_dependency = cls(project=project, **cleaned_data)
        discovered_dependency.save()
        return discovered_dependency

    def update_from_data(self, dependency_data):
        """
        Update this discovered dependency instance with the provided
        `dependency_data` and return the list of updated field names.

        Only fields of this model that currently hold an empty value are set,
        and only from non-empty values. The `save()` is called only if at
        least one field was modified.
        """
        # Use the fields of this model, not the ones of DiscoveredPackage, as
        # the allowed names: most dependency fields do not exist on packages.
        model_fields = {field.name for field in self._meta.get_fields()}
        updated_fields = []

        for field_name, value in dependency_data.items():
            skip_reasons = [
                not value,
                field_name not in model_fields,
            ]
            if any(skip_reasons):
                continue

            current_value = getattr(self, field_name, None)
            if not current_value:
                setattr(self, field_name, value)
                updated_fields.append(field_name)
            elif current_value != value:
                pass  # TODO: handle this case

        if updated_fields:
            self.save()

        return updated_fields
2002+
2003+
18502004
class WebhookSubscription(UUIDPKModel, ProjectRelatedModel):
18512005
target_url = models.URLField(_("Target URL"), max_length=1024)
18522006
sent = models.BooleanField(default=False)

scanpipe/pipes/__init__.py

+24
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from django.db.models import Count
3131

3232
from scanpipe.models import CodebaseResource
33+
from scanpipe.models import DiscoveredDependency
3334
from scanpipe.models import DiscoveredPackage
3435
from scanpipe.pipes import scancode
3536

@@ -104,6 +105,29 @@ def update_or_create_package(project, package_data, codebase_resource=None):
104105
return package
105106

106107

108+
def update_or_create_dependencies(project, dependency_data):
    """
    Return the DiscoveredDependency of `project` matching `dependency_data`,
    updating it when it already exists and creating it otherwise.

    The lookup key is made of the `project` together with the
    "dependency_uid" and "for_package_uid" values of the `dependency_data`
    mapping.
    """
    lookup = {
        "project": project,
        "dependency_uid": dependency_data.get("dependency_uid"),
        "for_package_uid": dependency_data.get("for_package_uid"),
    }

    try:
        existing = DiscoveredDependency.objects.get(**lookup)
    except DiscoveredDependency.DoesNotExist:
        existing = None

    if not existing:
        return DiscoveredDependency.create_from_data(project, dependency_data)

    existing.update_from_data(dependency_data)
    return existing
129+
130+
107131
def analyze_scanned_files(project):
108132
"""
109133
Sets the status for CodebaseResource to unknown or no license.

scanpipe/pipes/scancode.py

+78-5
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,16 @@
3535

3636
from commoncode import fileutils
3737
from commoncode.resource import VirtualCodebase
38+
from packagedcode import get_package_handler
39+
from packagedcode import models as packagedcode_models
3840
from extractcode import api as extractcode_api
3941
from scancode import ScancodeError
4042
from scancode import Scanner
4143
from scancode import api as scancode_api
4244
from scancode import cli as scancode_cli
4345

4446
from scanpipe import pipes
47+
from scanpipe.pipes.codebase import ProjectCodebase
4548
from scanpipe.models import CodebaseResource
4649

4750
logger = logging.getLogger("scanpipe.pipes")
@@ -230,10 +233,9 @@ def save_scan_package_results(codebase_resource, scan_results, scan_errors):
230233
Saves the resource scan package results in the database.
231234
Creates project errors if any occurred during the scan.
232235
"""
233-
packages = scan_results.get("package_data", [])
234-
if packages:
235-
for package_data in packages:
236-
codebase_resource.create_and_add_package(package_data)
236+
package_data = scan_results.get("package_data", [])
237+
if package_data:
238+
codebase_resource.package_data = package_data
237239
codebase_resource.status = "application-package"
238240
codebase_resource.save()
239241

@@ -310,18 +312,89 @@ def scan_for_files(project):
310312

311313
def scan_for_application_packages(project):
    """
    Run a package scan on the files of a `project` that have no status, then
    create DiscoveredPackage and DiscoveredDependency instances from the
    detected package data.

    Multiprocessing is enabled by default on this pipe, the number of processes can be
    controlled through the SCANCODEIO_PROCESSES setting.
    """
    # Step 1: scan each resource and store the detected Package data on the
    # CodebaseResource it was found in.
    _scan_and_save(
        resource_qs=project.codebaseresources.no_status(),
        scan_func=scan_for_package_data,
        save_func=save_scan_package_results,
    )

    # Step 2: walk the CodebaseResources holding Package data and process
    # them with the matching Package handler from packagedcode.
    assemble_packages(project=project)
335+
336+
337+
def _log_assemble_packages_step(project, message):
    """Log one `assemble_packages` progress `message` for `project`."""
    logger.info(
        f"project: {project}:\n"
        "function: assemble_packages\n"
        f"{message}\n"
    )


def assemble_packages(project):
    """
    Create instances of DiscoveredPackage and DiscoveredDependency for `project`
    from the parsed package data present in the CodebaseResources of `project`.

    Each package data entry stored on a resource is turned into a packagedcode
    PackageData object and assembled by its package handler. The assembled
    Package and Dependency items are saved through the `pipes` helpers, and
    the CodebaseResource items are recorded so that they are not processed
    again when reached later in the iteration.
    """
    seen_resource_paths = set()
    package_data_resources_qs = project.codebaseresources.filter(
        package_data__isnull=False
    )
    for resource in package_data_resources_qs:
        if resource.path in seen_resource_paths:
            continue

        # The `package_data` field is never NULL, its default is an empty
        # value, so the filter above also returns the resources where no
        # package was detected: skip those explicitly.
        if not resource.package_data:
            continue

        _log_assemble_packages_step(
            project, f"Processing: CodebaseResource {resource.path}"
        )

        for package_mapping in resource.package_data:
            package_data = packagedcode_models.PackageData.from_dict(
                mapping=package_mapping
            )
            _log_assemble_packages_step(
                project, f"Processing: PackageData {package_data.purl}"
            )

            handler = get_package_handler(package_data)
            _log_assemble_packages_step(
                project, f"Selected: Package handler {handler}"
            )

            items = handler.assemble(
                package_data=package_data,
                resource=resource,
                codebase=None,
            )

            for item in items:
                _log_assemble_packages_step(project, f"Processing: item {item}")
                if isinstance(item, packagedcode_models.Package):
                    pipes.update_or_create_package(project, item.to_dict())
                elif isinstance(item, packagedcode_models.Dependency):
                    pipes.update_or_create_dependencies(project, item.to_dict())
                elif isinstance(item, CodebaseResource):
                    # Already handled as part of this assembly.
                    seen_resource_paths.add(item.path)
                else:
                    _log_assemble_packages_step(
                        project,
                        f"Unknown Package assembly item type: {item!r}",
                    )
397+
325398

326399
def run_scancode(location, output_file, options, raise_on_error=False):
327400
"""

0 commit comments

Comments
 (0)