Skip to content

Commit 6e41a3d

Browse files
authored
feat: check PyPI registry when deps.dev fails to find a source repository (#982)
This PR adds a fallback option for PyPI PURLs in cases where deps.dev does not report their repositories. Instead, the PyPI registry is used to find the appropriate repository URL. Signed-off-by: Ben Selwyn-Smith <[email protected]>
1 parent 678d953 commit 6e41a3d

34 files changed

+421
-206
lines changed

src/macaron/__main__.py

+2
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,8 @@ def perform_action(action_args: argparse.Namespace) -> None:
276276
try:
277277
for git_service in GIT_SERVICES:
278278
git_service.load_defaults()
279+
for package_registry in PACKAGE_REGISTRIES:
280+
package_registry.load_defaults()
279281
except ConfigurationError as error:
280282
logger.error(error)
281283
sys.exit(os.EX_USAGE)

src/macaron/json_tools.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
22
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
33

44
"""This module provides utility functions for JSON data."""
@@ -53,5 +53,5 @@ def json_extract(entry: dict | list, keys: Sequence[str | int], type_: type[T])
5353
if isinstance(entry, type_):
5454
return entry
5555

56-
logger.debug("Found value of incorrect type: %s instead of %s.", type(entry), type(type_))
56+
logger.debug("Found value of incorrect type: %s instead of %s.", type(entry), type_)
5757
return None

src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
9595
The result and related information collected during the analysis.
9696
"""
9797
maintainers_join_date: list[datetime] | None = self._get_maintainers_join_date(
98-
pypi_package_json.pypi_registry, pypi_package_json.component.name
98+
pypi_package_json.pypi_registry, pypi_package_json.component_name
9999
)
100100
latest_release_date: datetime | None = self._get_latest_release_date(pypi_package_json)
101101
detail_info: dict[str, JsonType] = {

src/macaron/malware_analyzer/pypi_heuristics/metadata/source_code_repo.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,6 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
4141
The result and related information collected during the analysis.
4242
"""
4343
# If a sourcecode repo exists, then this will have already been validated
44-
if not pypi_package_json.component.repository:
44+
if not pypi_package_json.has_repository:
4545
return HeuristicResult.FAIL, {}
4646
return HeuristicResult.PASS, {}

src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
6161
logger.debug(error_msg)
6262
raise HeuristicAnalyzerValueError(error_msg)
6363

64-
version = pypi_package_json.component.version
64+
version = pypi_package_json.component_version
6565
if version is None: # check latest release version
6666
version = pypi_package_json.get_latest_version()
6767

src/macaron/repo_finder/repo_finder.py

+49-2
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
from macaron.config.defaults import defaults
4444
from macaron.config.global_config import global_config
4545
from macaron.errors import CloneError, RepoCheckOutError
46-
from macaron.repo_finder import to_domain_from_known_purl_types
46+
from macaron.repo_finder import repo_finder_pypi, to_domain_from_known_purl_types
4747
from macaron.repo_finder.commit_finder import find_commit, match_tags
4848
from macaron.repo_finder.repo_finder_base import BaseRepoFinder
4949
from macaron.repo_finder.repo_finder_deps_dev import DepsDevRepoFinder
@@ -66,11 +66,16 @@
6666
list_remote_references,
6767
resolve_local_path,
6868
)
69+
from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo
6970

7071
logger: logging.Logger = logging.getLogger(__name__)
7172

7273

73-
def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, RepoFinderInfo]:
74+
def find_repo(
75+
purl: PackageURL,
76+
check_latest_version: bool = True,
77+
package_registries_info: list[PackageRegistryInfo] | None = None,
78+
) -> tuple[str, RepoFinderInfo]:
7479
"""Retrieve the repository URL that matches the given PURL.
7580
7681
Parameters
@@ -79,6 +84,9 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str,
7984
The parsed PURL to convert to the repository path.
8085
check_latest_version: bool
8186
A flag that determines whether the latest version of the PURL is also checked.
87+
package_registries_info: list[PackageRegistryInfo] | None
88+
The list of package registry information if available.
89+
If no package registries are loaded, this can be set to None.
8290
8391
Returns
8492
-------
@@ -103,6 +111,9 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str,
103111
logger.debug("Analyzing %s with Repo Finder: %s", purl, type(repo_finder))
104112
found_repo, outcome = repo_finder.find_repo(purl)
105113

114+
if not found_repo:
115+
found_repo, outcome = find_repo_alternative(purl, outcome, package_registries_info)
116+
106117
if check_latest_version and not defaults.getboolean("repofinder", "try_latest_purl", fallback=True):
107118
check_latest_version = False
108119

@@ -117,13 +128,49 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str,
117128
return "", RepoFinderInfo.NO_NEWER_VERSION
118129

119130
found_repo, outcome = DepsDevRepoFinder().find_repo(latest_version_purl)
131+
if found_repo:
132+
return found_repo, outcome
133+
134+
if not found_repo:
135+
found_repo, outcome = find_repo_alternative(latest_version_purl, outcome, package_registries_info)
136+
120137
if not found_repo:
121138
logger.debug("Could not find repo from latest version of PURL: %s", latest_version_purl)
122139
return "", RepoFinderInfo.LATEST_VERSION_INVALID
123140

124141
return found_repo, outcome
125142

126143

144+
def find_repo_alternative(
    purl: PackageURL, outcome: RepoFinderInfo, package_registries_info: list[PackageRegistryInfo] | None = None
) -> tuple[str, RepoFinderInfo]:
    """Fall back to PURL type specific lookups once the standard repo finders have come up empty.

    Parameters
    ----------
    purl : PackageURL
        The parsed PURL whose repository is being searched for.
    outcome: RepoFinderInfo
        The outcome of the earlier attempt. It is returned unchanged when no type specific method
        applies to the PURL.
    package_registries_info: list[PackageRegistryInfo] | None
        The list of package registry information if available.
        If no package registries are loaded, this can be set to None.

    Returns
    -------
    tuple[str, RepoFinderInfo] :
        The repository URL for the passed package (empty if not found), and the outcome to report.
    """
    repo_url, result = "", outcome

    # Only PyPI PURLs have a type specific fallback here; every other type keeps the outcome passed in.
    if purl.type == "pypi":
        repo_url, result = repo_finder_pypi.find_repo(purl, package_registries_info)

    if repo_url:
        return repo_url, result

    logger.debug("Could not find repository using type specific (%s) methods for PURL: %s", purl.type, purl)
    return repo_url, result
172+
173+
127174
def to_repo_path(purl: PackageURL, available_domains: list[str]) -> str | None:
128175
"""Return the repository path from the PURL string.
129176

src/macaron/repo_finder/repo_finder_enums.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,18 @@ class RepoFinderInfo(Enum):
5757
#: Reported if deps.dev returns data that does not contain the desired SCM URL. E.g. The repository URL.
5858
DDEV_NO_URLS = "deps.dev no URLs"
5959

60+
#: Reported if there was an error with the request sent to the PyPI registry.
61+
PYPI_HTTP_ERROR = "PyPI HTTP error"
62+
63+
#: Reported if there was an error parsing the JSON returned by the PyPI registry.
64+
PYPI_JSON_ERROR = "PyPI JSON error"
65+
66+
#: Reported if there were no matching URLs in the JSON returned by the PyPI registry.
67+
PYPI_NO_URLS = "PyPI no matching URLs"
68+
69+
#: Reported if the PyPI registry is disabled or not present in the list of package registries.
70+
PYPI_NO_REGISTRY = "PyPI registry disabled or absent"
71+
6072
#: Reported if the provided PURL did not produce a result, and a more recent version could not be found.
6173
NO_NEWER_VERSION = "No newer version than provided which failed"
6274

@@ -70,7 +82,10 @@ class RepoFinderInfo(Enum):
7082
FOUND_FROM_PARENT = "Found from parent"
7183

7284
#: Reported when a repository is found from a more recent version than was provided by the user.
73-
FOUND_FROM_LATEST = "Found form latest"
85+
FOUND_FROM_LATEST = "Found from latest"
86+
87+
#: Reported when a repository could only be found by checking the PyPI registry JSON.
88+
FOUND_FROM_PYPI = "Found from PyPI"
7489

7590
#: Default value. Reported if the Repo Finder was not called. E.g. Because the repository URL was already present.
7691
NOT_USED = "Not used"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""This module contains the logic for finding repositories of PyPI projects."""
5+
import logging
6+
7+
from packageurl import PackageURL
8+
9+
from macaron.repo_finder.repo_finder_enums import RepoFinderInfo
10+
from macaron.repo_finder.repo_validator import find_valid_repository_url
11+
from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, PyPIRegistry
12+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
13+
from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo
14+
15+
logger: logging.Logger = logging.getLogger(__name__)
16+
17+
18+
def find_repo(
    purl: PackageURL, package_registries_info: list[PackageRegistryInfo] | None = None
) -> tuple[str, RepoFinderInfo]:
    """Look up the source repository of a PyPI package using the project links in its PyPI JSON.

    A matching ``PyPIPackageJsonAsset`` already stored in the registry info metadata is reused when
    present. Otherwise a new asset is created, downloaded, and (when registry info is available)
    stored in that metadata for later use.

    Parameters
    ----------
    purl : PackageURL
        The parsed PURL to convert to the repository path.
    package_registries_info: list[PackageRegistryInfo] | None
        The list of package registry information if available.
        If no package registries are loaded, this can be set to None.

    Returns
    -------
    tuple[str, RepoFinderInfo] :
        The repository URL for the passed package (empty if not found), and the outcome to report.
    """
    # Pick the first registry info entry that holds the PyPI registry and uses the pip or poetry build tool.
    registry_info = None
    for candidate in package_registries_info or []:
        if isinstance(candidate.package_registry, PyPIRegistry) and candidate.build_tool_name in {"poetry", "pip"}:
            registry_info = candidate
            break

    # Prefer the registry attached to the selected info entry; otherwise search the global registry list.
    pypi_registry: PyPIRegistry | None = None
    if registry_info and isinstance(registry_info.package_registry, PyPIRegistry):
        pypi_registry = registry_info.package_registry
    else:
        for registry in PACKAGE_REGISTRIES:
            if isinstance(registry, PyPIRegistry):
                pypi_registry = registry
                break

    if not pypi_registry:
        logger.debug("PyPI package registry not available.")
        return "", RepoFinderInfo.PYPI_NO_REGISTRY

    # Reuse an asset recorded earlier for exactly this package name and version, if there is one.
    cached_asset = None
    if registry_info:
        cached_asset = next(
            (
                asset
                for asset in registry_info.metadata
                if isinstance(asset, PyPIPackageJsonAsset)
                and asset.component_name == purl.name
                and asset.component_version == purl.version
            ),
            None,
        )
    from_metadata = cached_asset is not None
    pypi_asset = cached_asset or PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {})

    # Only hit the registry when the asset does not already carry the package JSON.
    if not (pypi_asset.package_json or pypi_asset.download(dest="")):
        return "", RepoFinderInfo.PYPI_HTTP_ERROR

    if registry_info and not from_metadata:
        # Save the asset for later use.
        registry_info.metadata.append(pypi_asset)

    project_links = pypi_asset.get_project_links()
    if not project_links:
        return "", RepoFinderInfo.PYPI_JSON_ERROR

    # Look for the repository URL among the project links.
    repo_url = find_valid_repository_url(project_links.values())
    if not repo_url:
        return "", RepoFinderInfo.PYPI_NO_URLS

    logger.debug("Found repository URL from PyPI: %s", repo_url)
    pypi_asset.has_repository = True
    return repo_url, RepoFinderInfo.FOUND_FROM_PYPI

src/macaron/repo_finder/repo_validator.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
22
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
33

44
"""This module exists to validate URLs in terms of their use as a repository that can be analyzed."""

0 commit comments

Comments
 (0)