Skip to content

Commit b221e03

Browse files
sethmlarson and hugovk authored
gh-113257: Automatically generate pip SBOM metadata from wheel (#113295)
Co-authored-by: Hugo van Kemenade <[email protected]>
1 parent 11ee912 commit b221e03

File tree

2 files changed

+107
-11
lines changed

2 files changed

+107
-11
lines changed

Misc/sbom.spdx.json

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Tools/build/generate_sbom.py

Lines changed: 106 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
"""Tool for generating Software Bill of Materials (SBOM) for Python's dependencies"""
2-
2+
import os
33
import re
44
import hashlib
55
import json
66
import glob
77
import pathlib
88
import subprocess
9+
import sys
910
import typing
11+
from urllib.request import urlopen
12+
13+
CPYTHON_ROOT_DIR = pathlib.Path(__file__).parent.parent.parent
1014

1115
# Before adding a new entry to this list, double check that
1216
# the license expression is a valid SPDX license expression:
@@ -43,15 +47,14 @@ class PackageFiles(typing.NamedTuple):
4347
# values to 'exclude' if we create new files within tracked
4448
# directories that aren't sourced from third-party packages.
4549
PACKAGE_TO_FILES = {
50+
# NOTE: pip's entry in this structure is automatically generated in
51+
# the 'discover_pip_sbom_package()' function below.
4652
"mpdecimal": PackageFiles(
4753
include=["Modules/_decimal/libmpdec/**"]
4854
),
4955
"expat": PackageFiles(
5056
include=["Modules/expat/**"]
5157
),
52-
"pip": PackageFiles(
53-
include=["Lib/ensurepip/_bundled/pip-23.3.2-py3-none-any.whl"]
54-
),
5558
"macholib": PackageFiles(
5659
include=["Lib/ctypes/macholib/**"],
5760
exclude=[
@@ -106,13 +109,106 @@ def filter_gitignored_paths(paths: list[str]) -> list[str]:
106109
return sorted([line.split()[-1] for line in git_check_ignore_lines if line.startswith("::")])
107110

108111

112+
def discover_pip_sbom_package(sbom_data: dict[str, typing.Any]) -> None:
    """Regenerate pip's SBOM package entry from the wheel bundled with ensurepip.

    pip is a part of a packaging ecosystem (Python, surprise!) so it's actually
    automatable to discover the metadata we need like the version and checksums
    so let's do that on behalf of our friends at the PyPA.

    Args:
        sbom_data: The parsed SBOM document. Its "packages" list is replaced
            with a copy that drops any existing "pip" entry and ends with a
            freshly generated one.

    Side effects:
        * Sets ``PACKAGE_TO_FILES["pip"]`` to the discovered wheel path.
        * Requests ``https://pypi.org/pypi/pip/<version>/json`` over the network.
        * Prints a message and exits with status 1 when there is not exactly
          one ``pip-*`` file in ``Lib/ensurepip/_bundled``, when the request
          fails, when PyPI lists no artifact with the same filename, or when
          the SHA-256 on PyPI differs from the local wheel's.
    """
    ensurepip_bundled_dir = CPYTHON_ROOT_DIR / "Lib/ensurepip/_bundled"

    # Find the hopefully one pip wheel in the bundled directory.
    pip_wheels = [
        wheel_filename
        for wheel_filename in os.listdir(ensurepip_bundled_dir)
        if wheel_filename.startswith("pip-")
    ]
    if len(pip_wheels) != 1:
        print("Zero or multiple pip wheels detected in 'Lib/ensurepip/_bundled'")
        sys.exit(1)
    pip_wheel_filename = pip_wheels[0]

    # Add the wheel filename to the list of files so the SBOM file
    # and relationship generator can work its magic on the wheel too.
    # (The dict is mutated in place, so no 'global' declaration is needed.)
    PACKAGE_TO_FILES["pip"] = PackageFiles(
        include=[f"Lib/ensurepip/_bundled/{pip_wheel_filename}"]
    )

    # Wheel filename format puts the version right after the project name.
    pip_version = pip_wheel_filename.split("-")[1]
    pip_checksum_sha256 = hashlib.sha256(
        (ensurepip_bundled_dir / pip_wheel_filename).read_bytes()
    ).hexdigest()

    # Get pip's download location from PyPI. Check that the checksum is correct too.
    try:
        # Use the response as a context manager so the connection is closed
        # even if reading fails part-way through.
        with urlopen(f"https://pypi.org/pypi/pip/{pip_version}/json") as response:
            raw_text = response.read()
        # json.JSONDecodeError subclasses ValueError, so a malformed body is
        # reported by the handler below as well.
        pip_release_metadata = json.loads(raw_text)
        url: dict[str, typing.Any]

        # Look for a matching artifact filename and then check
        # its remote checksum to the local one.
        for url in pip_release_metadata["urls"]:
            if url["filename"] == pip_wheel_filename:
                break
        else:
            raise ValueError(f"No matching filename on PyPI for '{pip_wheel_filename}'")
        if url["digests"]["sha256"] != pip_checksum_sha256:
            raise ValueError("Local pip checksum doesn't match artifact on PyPI")

        # Successfully found the download URL for the matching artifact.
        pip_download_url = url["url"]

    except (OSError, ValueError) as e:
        print(f"Couldn't fetch pip's metadata from PyPI: {e}")
        sys.exit(1)

    # Remove pip from the existing SBOM packages if it's there
    # and then overwrite its entry with our own generated one.
    sbom_data["packages"] = [
        sbom_package
        for sbom_package in sbom_data["packages"]
        if sbom_package["name"] != "pip"
    ]
    sbom_data["packages"].append(
        {
            "SPDXID": spdx_id("SPDXRef-PACKAGE-pip"),
            "name": "pip",
            "versionInfo": pip_version,
            "originator": "Organization: Python Packaging Authority",
            "licenseConcluded": "MIT",
            "downloadLocation": pip_download_url,
            "checksums": [
                {"algorithm": "SHA256", "checksumValue": pip_checksum_sha256}
            ],
            "externalRefs": [
                {
                    "referenceCategory": "SECURITY",
                    "referenceLocator": f"cpe:2.3:a:pypa:pip:{pip_version}:*:*:*:*:*:*:*",
                    "referenceType": "cpe23Type",
                },
                {
                    "referenceCategory": "PACKAGE_MANAGER",
                    "referenceLocator": f"pkg:pypi/pip@{pip_version}",
                    "referenceType": "purl",
                },
            ],
            "primaryPackagePurpose": "SOURCE",
        }
    )
199+
200+
109201
def main() -> None:
110-
root_dir = pathlib.Path(__file__).parent.parent.parent
111-
sbom_path = root_dir / "Misc/sbom.spdx.json"
202+
sbom_path = CPYTHON_ROOT_DIR / "Misc/sbom.spdx.json"
112203
sbom_data = json.loads(sbom_path.read_bytes())
113204

114-
# Make a bunch of assertions about the SBOM data to ensure it's consistent.
205+
# Insert pip's SBOM metadata from the wheel.
206+
discover_pip_sbom_package(sbom_data)
207+
208+
# Ensure all packages in this tool are represented also in the SBOM file.
115209
assert {package["name"] for package in sbom_data["packages"]} == set(PACKAGE_TO_FILES)
210+
211+
# Make a bunch of assertions about the SBOM data to ensure it's consistent.
116212
for package in sbom_data["packages"]:
117213

118214
# Properties and ID must be properly formed.
@@ -138,17 +234,17 @@ def main() -> None:
138234
for include in sorted(files.include):
139235

140236
# Find all the paths and then filter them through .gitignore.
141-
paths = glob.glob(include, root_dir=root_dir, recursive=True)
237+
paths = glob.glob(include, root_dir=CPYTHON_ROOT_DIR, recursive=True)
142238
paths = filter_gitignored_paths(paths)
143239
assert paths, include # Make sure that every value returns something!
144240

145241
for path in paths:
146242
# Skip directories and excluded files
147-
if not (root_dir / path).is_file() or path in exclude:
243+
if not (CPYTHON_ROOT_DIR / path).is_file() or path in exclude:
148244
continue
149245

150246
# SPDX requires SHA1 to be used for files, but we provide SHA256 too.
151-
data = (root_dir / path).read_bytes()
247+
data = (CPYTHON_ROOT_DIR / path).read_bytes()
152248
checksum_sha1 = hashlib.sha1(data).hexdigest()
153249
checksum_sha256 = hashlib.sha256(data).hexdigest()
154250

0 commit comments

Comments
 (0)