@@ -1,12 +1,16 @@
 """Tool for generating Software Bill of Materials (SBOM) for Python's dependencies"""
-
+import os
 import re
 import hashlib
 import json
 import glob
 import pathlib
 import subprocess
+import sys
 import typing
+from urllib.request import urlopen
+
+CPYTHON_ROOT_DIR = pathlib.Path(__file__).parent.parent.parent
 
 # Before adding a new entry to this list, double check that
 # the license expression is a valid SPDX license expression:
@@ -43,15 +47,14 @@ class PackageFiles(typing.NamedTuple):
 # values to 'exclude' if we create new files within tracked
 # directories that aren't sourced from third-party packages.
 PACKAGE_TO_FILES = {
+    # NOTE: pip's entry in this structure is automatically generated in
+    # the 'discover_pip_sbom_package()' function below.
     "mpdecimal": PackageFiles(
         include=["Modules/_decimal/libmpdec/**"]
     ),
     "expat": PackageFiles(
         include=["Modules/expat/**"]
     ),
-    "pip": PackageFiles(
-        include=["Lib/ensurepip/_bundled/pip-23.3.2-py3-none-any.whl"]
-    ),
     "macholib": PackageFiles(
         include=["Lib/ctypes/macholib/**"],
         exclude=[
@@ -106,13 +109,106 @@ def filter_gitignored_paths(paths: list[str]) -> list[str]:
     return sorted([line.split()[-1] for line in git_check_ignore_lines if line.startswith("::")])
 
 
+def discover_pip_sbom_package(sbom_data: dict[str, typing.Any]) -> None:
+    """pip is a part of a packaging ecosystem (Python, surprise!) so it's actually
+    automatable to discover the metadata we need like the version and checksums
+    so let's do that on behalf of our friends at the PyPA.
+    """
+    global PACKAGE_TO_FILES
+
+    ensurepip_bundled_dir = CPYTHON_ROOT_DIR / "Lib/ensurepip/_bundled"
+    pip_wheels = []
+
+    # Find the hopefully one pip wheel in the bundled directory.
+    for wheel_filename in os.listdir(ensurepip_bundled_dir):
+        if wheel_filename.startswith("pip-"):
+            pip_wheels.append(wheel_filename)
+    if len(pip_wheels) != 1:
+        print("Zero or multiple pip wheels detected in 'Lib/ensurepip/_bundled'")
+        sys.exit(1)
+    pip_wheel_filename = pip_wheels[0]
+
+    # Add the wheel filename to the list of files so the SBOM file
+    # and relationship generator can work its magic on the wheel too.
+    PACKAGE_TO_FILES["pip"] = PackageFiles(
+        include=[f"Lib/ensurepip/_bundled/{pip_wheel_filename}"]
+    )
+
+    # Wheel filename format puts the version right after the project name.
+    pip_version = pip_wheel_filename.split("-")[1]
+    pip_checksum_sha256 = hashlib.sha256(
+        (ensurepip_bundled_dir / pip_wheel_filename).read_bytes()
+    ).hexdigest()
+
+    # Get pip's download location from PyPI. Check that the checksum is correct too.
+    try:
+        raw_text = urlopen(f"https://pypi.org/pypi/pip/{pip_version}/json").read()
+        pip_release_metadata = json.loads(raw_text)
+        url: dict[str, typing.Any]
+
+        # Look for a matching artifact filename and then check
+        # its remote checksum to the local one.
+        for url in pip_release_metadata["urls"]:
+            if url["filename"] == pip_wheel_filename:
+                break
+        else:
+            raise ValueError(f"No matching filename on PyPI for '{pip_wheel_filename}'")
+        if url["digests"]["sha256"] != pip_checksum_sha256:
+            raise ValueError(f"Local pip checksum doesn't match artifact on PyPI")
+
+        # Successfully found the download URL for the matching artifact.
+        pip_download_url = url["url"]
+
+    except (OSError, ValueError) as e:
+        print(f"Couldn't fetch pip's metadata from PyPI: {e}")
+        sys.exit(1)
+
+    # Remove pip from the existing SBOM packages if it's there
+    # and then overwrite its entry with our own generated one.
+    sbom_data["packages"] = [
+        sbom_package
+        for sbom_package in sbom_data["packages"]
+        if sbom_package["name"] != "pip"
+    ]
+    sbom_data["packages"].append(
+        {
+            "SPDXID": spdx_id("SPDXRef-PACKAGE-pip"),
+            "name": "pip",
+            "versionInfo": pip_version,
+            "originator": "Organization: Python Packaging Authority",
+            "licenseConcluded": "MIT",
+            "downloadLocation": pip_download_url,
+            "checksums": [
+                {"algorithm": "SHA256", "checksumValue": pip_checksum_sha256}
+            ],
+            "externalRefs": [
+                {
+                    "referenceCategory": "SECURITY",
+                    "referenceLocator": f"cpe:2.3:a:pypa:pip:{pip_version}:*:*:*:*:*:*:*",
+                    "referenceType": "cpe23Type",
+                },
+                {
+                    "referenceCategory": "PACKAGE_MANAGER",
+                    "referenceLocator": f"pkg:pypi/pip@{pip_version}",
+                    "referenceType": "purl",
+                },
+            ],
+            "primaryPackagePurpose": "SOURCE",
+        }
+    )
+
+
 def main() -> None:
-    root_dir = pathlib.Path(__file__).parent.parent.parent
-    sbom_path = root_dir / "Misc/sbom.spdx.json"
+    sbom_path = CPYTHON_ROOT_DIR / "Misc/sbom.spdx.json"
     sbom_data = json.loads(sbom_path.read_bytes())
 
-    # Make a bunch of assertions about the SBOM data to ensure it's consistent.
+    # Insert pip's SBOM metadata from the wheel.
+    discover_pip_sbom_package(sbom_data)
+
+    # Ensure all packages in this tool are represented also in the SBOM file.
     assert {package["name"] for package in sbom_data["packages"]} == set(PACKAGE_TO_FILES)
+
+    # Make a bunch of assertions about the SBOM data to ensure it's consistent.
     for package in sbom_data["packages"]:
 
         # Properties and ID must be properly formed.
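
The lookup that discover_pip_sbom_package() performs against PyPI boils down to the pattern sketched below: hash the locally bundled wheel, fetch the release metadata from PyPI's JSON API, find the artifact whose filename matches, and compare digests. This is a standalone illustration rather than part of the commit; the wheel path and version shown are hypothetical.

import hashlib
import json
import pathlib
from urllib.request import urlopen

# Hypothetical wheel; the real tool discovers it under Lib/ensurepip/_bundled.
wheel_path = pathlib.Path("pip-23.3.2-py3-none-any.whl")
pip_version = wheel_path.name.split("-")[1]  # wheel filenames put the version second
local_sha256 = hashlib.sha256(wheel_path.read_bytes()).hexdigest()

# PyPI's JSON API lists each release artifact under "urls" with its
# "filename", "digests", and download "url".
metadata = json.loads(urlopen(f"https://pypi.org/pypi/pip/{pip_version}/json").read())
artifact = next(u for u in metadata["urls"] if u["filename"] == wheel_path.name)

if artifact["digests"]["sha256"] != local_sha256:
    raise ValueError("Local pip wheel doesn't match the artifact published on PyPI")
download_url = artifact["url"]

Verifying against PyPI ties the SBOM's downloadLocation and checksum to the exact artifact that ships in the tree, rather than trusting a hard-coded version string.
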
@@ -138,17 +234,17 @@ def main() -> None:
         for include in sorted(files.include):
 
             # Find all the paths and then filter them through .gitignore.
-            paths = glob.glob(include, root_dir=root_dir, recursive=True)
+            paths = glob.glob(include, root_dir=CPYTHON_ROOT_DIR, recursive=True)
             paths = filter_gitignored_paths(paths)
             assert paths, include  # Make sure that every value returns something!
 
             for path in paths:
                 # Skip directories and excluded files
-                if not (root_dir / path).is_file() or path in exclude:
+                if not (CPYTHON_ROOT_DIR / path).is_file() or path in exclude:
                     continue
 
                 # SPDX requires SHA1 to be used for files, but we provide SHA256 too.
-                data = (root_dir / path).read_bytes()
+                data = (CPYTHON_ROOT_DIR / path).read_bytes()
                 checksum_sha1 = hashlib.sha1(data).hexdigest()
                 checksum_sha256 = hashlib.sha256(data).hexdigest()
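
The filter_gitignored_paths() call used above leans on git check-ignore: with --verbose --non-matching, paths that are not ignored come back on lines beginning with "::", which is the marker the return statement shown earlier keys on. A minimal sketch of that pattern follows; the helper name and exact flags are assumptions, since the function body is not part of this diff.

import subprocess

def keep_non_gitignored(paths: list[str], repo_root: str) -> list[str]:
    # 'git check-ignore --verbose --non-matching' reports paths that match no
    # ignore pattern on lines beginning with "::"; exit code 1 only means
    # that none of the given paths were ignored, so it is not an error here.
    proc = subprocess.run(
        ["git", "check-ignore", "--verbose", "--non-matching", *paths],
        cwd=repo_root,
        check=False,
        stdout=subprocess.PIPE,
        text=True,
    )
    return sorted(
        line.split()[-1]
        for line in proc.stdout.splitlines()
        if line.startswith("::")
    )
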