1
1
"""Tool for generating Software Bill of Materials (SBOM) for Python's dependencies"""
2
-
2
+ import os
3
3
import re
4
4
import hashlib
5
5
import json
6
6
import glob
7
7
import pathlib
8
8
import subprocess
9
9
import typing
10
+ from urllib .request import urlopen
11
+
12
+ CPYTHON_ROOT_DIR = pathlib .Path (__file__ ).parent .parent .parent
10
13
11
14
# Before adding a new entry to this list, double check that
12
15
# the license expression is a valid SPDX license expression:
@@ -43,15 +46,14 @@ class PackageFiles(typing.NamedTuple):
43
46
# values to 'exclude' if we create new files within tracked
44
47
# directories that aren't sourced from third-party packages.
45
48
PACKAGE_TO_FILES = {
49
+ # NOTE: pip's entry in this structure is automatically generated in
50
+ # the 'discover_pip_sbom_package()' function below.
46
51
"mpdecimal" : PackageFiles (
47
52
include = ["Modules/_decimal/libmpdec/**" ]
48
53
),
49
54
"expat" : PackageFiles (
50
55
include = ["Modules/expat/**" ]
51
56
),
52
- "pip" : PackageFiles (
53
- include = ["Lib/ensurepip/_bundled/pip-23.3.2-py3-none-any.whl" ]
54
- ),
55
57
"macholib" : PackageFiles (
56
58
include = ["Lib/ctypes/macholib/**" ],
57
59
exclude = [
@@ -106,13 +108,106 @@ def filter_gitignored_paths(paths: list[str]) -> list[str]:
106
108
return sorted ([line .split ()[- 1 ] for line in git_check_ignore_lines if line .startswith ("::" )])
107
109
108
110
111
def discover_pip_sbom_package(sbom_data: dict[str, typing.Any]) -> None:
    """Auto-generate pip's SBOM entry from the bundled wheel.

    pip is a part of a packaging ecosystem (Python, surprise!) so it's actually
    automatable to discover the metadata we need like the version and checksums
    so let's do that on behalf of our friends at the PyPA.

    Side effects: registers pip's wheel path in the module-level
    PACKAGE_TO_FILES mapping and replaces pip's entry (if any) in
    ``sbom_data["packages"]``.  Exits the process with status 1 if the
    bundled wheel can't be found unambiguously or PyPI metadata doesn't
    match the local wheel.
    """
    ensurepip_bundled_dir = CPYTHON_ROOT_DIR / "Lib/ensurepip/_bundled"

    # Find the hopefully one pip wheel in the bundled directory.
    pip_wheels = [
        wheel_filename
        for wheel_filename in os.listdir(ensurepip_bundled_dir)
        if wheel_filename.startswith("pip-")
    ]
    if len(pip_wheels) != 1:
        print("Zero or multiple pip wheels detected in 'Lib/ensurepip/_bundled'")
        exit(1)
    pip_wheel_filename = pip_wheels[0]

    # Add the wheel filename to the list of files so the SBOM file
    # and relationship generator can work its magic on the wheel too.
    # NOTE: item assignment, so no 'global' statement is needed.
    PACKAGE_TO_FILES["pip"] = PackageFiles(
        include=[f"Lib/ensurepip/_bundled/{pip_wheel_filename}"]
    )

    # Wheel filename format puts the version right after the project name.
    pip_version = pip_wheel_filename.split("-")[1]
    pip_checksum_sha256 = hashlib.sha256(
        (ensurepip_bundled_dir / pip_wheel_filename).read_bytes()
    ).hexdigest()

    # Get pip's download location from PyPI. Check that the checksum is correct too.
    try:
        # Use a context manager so the HTTP response is always closed
        # (the original call leaked the response object).
        with urlopen(f"https://pypi.org/pypi/pip/{pip_version}/json") as response:
            pip_release_metadata = json.loads(response.read())
        url: dict[str, typing.Any]

        # Look for a matching artifact filename and then check
        # its remote checksum to the local one.
        for url in pip_release_metadata["urls"]:
            if url["filename"] == pip_wheel_filename:
                break
        else:
            raise ValueError(f"No matching filename on PyPI for '{pip_wheel_filename}'")
        if url["digests"]["sha256"] != pip_checksum_sha256:
            # Was an 'f'-string with no placeholders; plain literal is correct.
            raise ValueError("Local pip checksum doesn't match artifact on PyPI")

        # Successfully found the download URL for the matching artifact.
        pip_download_url = url["url"]

    except (OSError, ValueError) as e:
        print(f"Couldn't fetch pip's metadata from PyPI: {e}")
        exit(1)

    # Remove pip from the existing SBOM packages if it's there
    # and then overwrite its entry with our own generated one.
    sbom_data["packages"] = [
        sbom_package
        for sbom_package in sbom_data["packages"]
        if sbom_package["name"] != "pip"
    ]
    sbom_data["packages"].append(
        {
            "SPDXID": spdx_id("SPDXRef-PACKAGE-pip"),
            "name": "pip",
            "versionInfo": pip_version,
            "originator": "Organization: Python Packaging Authority",
            "licenseConcluded": "MIT",
            "downloadLocation": pip_download_url,
            "checksums": [
                {"algorithm": "sha256", "checksumValue": pip_checksum_sha256}
            ],
            "externalRefs": [
                {
                    "referenceCategory": "SECURITY",
                    "referenceLocator": f"cpe:2.3:a:pypa:pip:{pip_version}:*:*:*:*:*:*:*",
                    "referenceType": "cpe23Type",
                },
                {
                    "referenceCategory": "PACKAGE_MANAGER",
                    "referenceLocator": f"pkg:pypi/pip@{pip_version}",
                    "referenceType": "purl",
                },
            ],
            "primaryPackagePurpose": "SOURCE",
        }
    )
198
+
199
+
109
200
def main () -> None :
110
- root_dir = pathlib .Path (__file__ ).parent .parent .parent
111
- sbom_path = root_dir / "Misc/sbom.spdx.json"
201
+ sbom_path = CPYTHON_ROOT_DIR / "Misc/sbom.spdx.json"
112
202
sbom_data = json .loads (sbom_path .read_bytes ())
113
203
114
- # Make a bunch of assertions about the SBOM data to ensure it's consistent.
204
+ # Insert pip's SBOM metadata from the wheel.
205
+ discover_pip_sbom_package (sbom_data )
206
+
207
+ # Ensure all packages in this tool are represented also in the SBOM file.
115
208
assert {package ["name" ] for package in sbom_data ["packages" ]} == set (PACKAGE_TO_FILES )
209
+
210
+ # Make a bunch of assertions about the SBOM data to ensure it's consistent.
116
211
for package in sbom_data ["packages" ]:
117
212
118
213
# Properties and ID must be properly formed.
@@ -138,17 +233,17 @@ def main() -> None:
138
233
for include in sorted (files .include ):
139
234
140
235
# Find all the paths and then filter them through .gitignore.
141
- paths = glob .glob (include , root_dir = root_dir , recursive = True )
236
+ paths = glob .glob (include , root_dir = CPYTHON_ROOT_DIR , recursive = True )
142
237
paths = filter_gitignored_paths (paths )
143
238
assert paths , include # Make sure that every value returns something!
144
239
145
240
for path in paths :
146
241
# Skip directories and excluded files
147
- if not (root_dir / path ).is_file () or path in exclude :
242
+ if not (CPYTHON_ROOT_DIR / path ).is_file () or path in exclude :
148
243
continue
149
244
150
245
# SPDX requires SHA1 to be used for files, but we provide SHA256 too.
151
- data = (root_dir / path ).read_bytes ()
246
+ data = (CPYTHON_ROOT_DIR / path ).read_bytes ()
152
247
checksum_sha1 = hashlib .sha1 (data ).hexdigest ()
153
248
checksum_sha256 = hashlib .sha256 (data ).hexdigest ()
154
249
0 commit comments