Skip to content

Commit f9a60f4

Browse files
AmineAmine
Amine
authored and
Amine
committed
feat(security): Add package name typosquatting detection
Implement typosquatting detection for package names during analysis. Compares package names against a list of popular packages using the Jaro-Winkler similarity algorithm. Packages exceeding a defined threshold of similarity to a popular package are flagged. Signed-off-by: Amine <[email protected]>
1 parent c32d340 commit f9a60f4

File tree

6 files changed

+5276
-0
lines changed

6 files changed

+5276
-0
lines changed

src/macaron/__main__.py

+11
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,16 @@ def main(argv: list[str] | None = None) -> None:
367367
help="The directory where Macaron looks for already cloned repositories.",
368368
)
369369

370+
main_parser.add_argument(
371+
"-pp",
372+
"--popular-packages-path",
373+
required=False,
374+
type=str,
375+
default=None,
376+
help="The path to the popular packages file used for typosquatting detection.",
377+
dest="popular_packages_path",
378+
)
379+
370380
# Add sub parsers for each action.
371381
sub_parser = main_parser.add_subparsers(dest="action", help="Run macaron <action> --help for help")
372382

@@ -579,6 +589,7 @@ def main(argv: list[str] | None = None) -> None:
579589
build_log_path=os.path.join(args.output_dir, "build_log"),
580590
debug_level=log_level,
581591
local_repos_path=args.local_repos_path,
592+
popular_packages_path=args.popular_packages_path,
582593
resources_path=os.path.join(macaron.MACARON_PATH, "resources"),
583594
)
584595

src/macaron/config/global_config.py

+5
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ class GlobalConfig:
4949
#: The path to the local .m2 Maven repository. This attribute is None if there is no available .m2 directory.
5050
local_maven_repo: str | None = None
5151

52+
#: The path to the popular packages file.
53+
popular_packages_path: str | None = None
54+
5255
def load(
5356
self,
5457
macaron_path: str,
@@ -57,6 +60,7 @@ def load(
5760
debug_level: int,
5861
local_repos_path: str,
5962
resources_path: str,
63+
popular_packages_path: str,
6064
) -> None:
6165
"""Initiate the GlobalConfig object.
6266
@@ -81,6 +85,7 @@ def load(
8185
self.debug_level = debug_level
8286
self.local_repos_path = local_repos_path
8387
self.resources_path = resources_path
88+
self.popular_packages_path = popular_packages_path
8489

8590
def load_expectation_files(self, exp_path: str) -> None:
8691
"""

src/macaron/malware_analyzer/pypi_heuristics/heuristics.py

+3
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ class Heuristics(str, Enum):
3737
#: Indicates that the package has an unusually large version number for a single release.
3838
ANOMALOUS_VERSION = "anomalous_version"
3939

40+
#: Indicates that the package name is similar to a popular package.
41+
TYPOSQUATTING_PRESENCE = "typosquatting_presence"
42+
4043

4144
class HeuristicResult(str, Enum):
4245
"""Result type indicating the outcome of a heuristic."""
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""Analyzer checks if there is typosquatting presence in the package name."""
5+
import logging
6+
import os
7+
8+
from macaron.config.global_config import global_config
9+
from macaron.json_tools import JsonType
10+
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
11+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
12+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
13+
14+
logger = logging.getLogger(__name__)
15+
16+
17+
class TyposquattingPresenceAnalyzer(BaseHeuristicAnalyzer):
18+
"""Check whether the PyPI package has typosquatting presence."""
19+
20+
def __init__(self) -> None:
    """Set up the heuristic metadata, scoring parameters and keyboard model."""
    super().__init__(
        name="typosquatting_presence_analyzer",
        heuristic=Heuristics.TYPOSQUATTING_PRESENCE,
        depends_on=None,
    )

    # Similarity ratio at or above which a package name is reported.
    self.distance_ratio_threshold = 0.95
    # Penalty for a mismatched character pair that is not keyboard-adjacent.
    self.cost = 1
    # Weight of the common-prefix bonus applied on top of the Jaro score.
    self.scaling = 0.15
    # Penalty for a mismatched character pair that is keyboard-adjacent.
    self.keyboard = 0.8

    # Each key maps to a (row, column) coordinate. Rows are listed top to
    # bottom, with the digit row numbered -1 so the first letter row is 0.
    key_rows = ("1234567890-", "qwertyuiop", "asdfghjkl", "zxcvbnm")
    self.keyboard_layout = {
        key: (row, column)
        for row, keys in enumerate(key_rows, start=-1)
        for column, key in enumerate(keys)
    }

    # A path supplied through the global configuration takes precedence over
    # the list bundled in the resources directory.
    bundled_path = os.path.join(global_config.resources_path, "popular_packages.txt")
    configured_path = global_config.popular_packages_path
    self.popular_packages_path = bundled_path if configured_path is None else configured_path
71+
72+
def are_neighbors(self, char1: str, char2: str) -> bool:
    """Tell whether two characters sit next to each other in the keyboard grid.

    Adjacency is evaluated on the (row, column) coordinates stored in
    ``self.keyboard_layout``: two keys are neighbors when both their row
    difference and their column difference are at most one.

    Parameters
    ----------
    char1 : str
        The first character.
    char2 : str
        The second character.

    Returns
    -------
    bool
        True if both characters are in the layout and are neighbors,
        False otherwise.
    """
    first = self.keyboard_layout.get(char1)
    second = self.keyboard_layout.get(char2)
    if first and second:
        first_row, first_col = first[0], first[1]
        second_row, second_col = second[0], second[1]
        return abs(first_row - second_row) <= 1 and abs(first_col - second_col) <= 1
    # At least one character is not part of the layout.
    return False
92+
93+
def substitution_func(self, char1: str, char2: str) -> float:
    """Return the penalty for treating one character as the other.

    Parameters
    ----------
    char1 : str
        The first character.
    char2 : str
        The second character.

    Returns
    -------
    float
        0.0 for identical characters, ``self.keyboard`` when that value is
        non-zero and the characters are keyboard neighbors, and
        ``self.cost`` in every other case.
    """
    if char1 == char2:
        return 0.0

    neighbor_penalty = self.keyboard
    # A falsy keyboard penalty disables the adjacency lookup entirely.
    if not neighbor_penalty:
        return self.cost
    if self.are_neighbors(char1, char2):
        return neighbor_penalty
    return self.cost
114+
115+
def jaro_distance(self, package_name: str, popular_package_name: str) -> float:
    """Compute a Jaro-style similarity score between two package names.

    Despite the method name, higher values mean more similar names:
    identical names score 1.0 and names with no matching characters score
    0.0. Out-of-order matched characters are penalized through
    ``self.substitution_func`` rather than with a fixed unit cost.

    Parameters
    ----------
    package_name : str
        The name of the package being analyzed.
    popular_package_name : str
        The name of a popular package to compare against.

    Returns
    -------
    float
        The similarity score between the two package names.
    """
    if package_name == popular_package_name:
        return 1.0

    name_len = len(package_name)
    popular_len = len(popular_package_name)
    if not name_len or not popular_len:
        return 0.0

    # Equal characters only count as a match when their positions differ by
    # at most this many places.
    window = max(name_len, popular_len) // 2 - 1

    name_flags = [False] * name_len
    popular_flags = [False] * popular_len
    match_count = 0

    # Pass 1: pair every character with the first unused equal character
    # inside the window of the other name.
    for pos, char in enumerate(package_name):
        lower = max(0, pos - window)
        upper = min(pos + window + 1, popular_len)
        for candidate in range(lower, upper):
            if popular_flags[candidate] or char != popular_package_name[candidate]:
                continue
            name_flags[pos] = True
            popular_flags[candidate] = True
            match_count += 1
            break

    if not match_count:
        return 0.0

    # Pass 2: walk the matched characters of both names in order and add up
    # the penalty of every pair that disagrees.
    penalty = 0.0
    cursor = 0
    for pos, char in enumerate(package_name):
        if not name_flags[pos]:
            continue
        while not popular_flags[cursor]:
            cursor += 1
        other = popular_package_name[cursor]
        if char != other:
            penalty += self.substitution_func(char, other)
        cursor += 1

    # Each out-of-order pair is seen from both sides, so halve the total.
    penalty /= 2.0

    return (match_count / name_len + match_count / popular_len + (match_count - penalty) / match_count) / 3.0
173+
174+
def ratio(self, package_name: str, popular_package_name: str) -> float:
    """Compute the Jaro-Winkler style similarity ratio of two package names.

    The base score from ``self.jaro_distance`` is raised towards 1.0 in
    proportion to the number of leading characters the names share, counting
    at most four characters and weighting each by ``self.scaling``.

    Parameters
    ----------
    package_name : str
        The name of the package being analyzed.
    popular_package_name : str
        The name of a popular package to compare against.

    Returns
    -------
    float
        The base similarity plus the common-prefix bonus.
    """
    prefix_weight = self.scaling
    similarity = self.jaro_distance(package_name, popular_package_name)

    # Count identical leading characters; zip stops at the shorter slice,
    # and slicing caps the comparison at four characters.
    shared_prefix = 0
    for own_char, popular_char in zip(package_name[:4], popular_package_name[:4]):
        if own_char != popular_char:
            break
        shared_prefix += 1

    return similarity + shared_prefix * prefix_weight * (1 - similarity)
201+
202+
def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
    """Analyze the package name for similarity to a popular package.

    The popular packages file is read as UTF-8 with one package name per
    line; surrounding whitespace is stripped and blank lines are ignored.
    A package whose name appears in that list passes immediately. Otherwise
    the name is compared against every popular name, in file order, and the
    first one whose similarity ratio reaches ``self.distance_ratio_threshold``
    causes a failure.

    Parameters
    ----------
    pypi_package_json: PyPIPackageJsonAsset
        The PyPI package JSON asset object.

    Returns
    -------
    tuple[HeuristicResult, dict[str, JsonType]]:
        The result and related information collected during the analysis.
        ``SKIP`` is returned with an ``error`` entry when the popular
        packages file is missing, unreadable or not valid UTF-8.
    """
    package_name = pypi_package_json.component_name
    if not self.popular_packages_path or not os.path.exists(self.popular_packages_path):
        err_msg = f"Popular packages file not found or path not configured: {self.popular_packages_path}"
        logger.warning("%s. Skipping typosquatting check.", err_msg)
        return HeuristicResult.SKIP, {"error": err_msg}

    try:
        with open(self.popular_packages_path, encoding="utf-8") as file:
            raw_lines = file.read().splitlines()
    except (OSError, UnicodeDecodeError) as e:
        # A mis-encoded file is treated like any other unreadable file so a
        # bad list skips this heuristic instead of aborting the analysis.
        err_msg = f"Could not read popular packages file {self.popular_packages_path}: {e}"
        logger.error(err_msg)
        return HeuristicResult.SKIP, {"error": err_msg}

    # Trailing whitespace would defeat the exact-match test below, and blank
    # lines can never be a meaningful comparison target.
    popular_packages = [name for line in raw_lines if (name := line.strip())]

    # A package that is itself popular must never be reported, even when a
    # similarly named popular package is listed earlier in the file, so the
    # membership test runs before any similarity scoring.
    # NOTE(review): names are compared verbatim; PyPI treats names that differ
    # only by case or by "-", "_" and "." as equal. Confirm whether
    # component_name is already normalized by the caller.
    if package_name in popular_packages:
        return HeuristicResult.PASS, {"package_name": package_name}

    for popular_package in popular_packages:
        distance_ratio = self.ratio(package_name, popular_package)
        if distance_ratio >= self.distance_ratio_threshold:
            logger.info(
                "Potential typosquatting detected: '%s' is similar to popular package '%s' (ratio: %.3f)",
                package_name,
                popular_package,
                distance_ratio,
            )
            return HeuristicResult.FAIL, {
                "package_name": package_name,
                "popular_package": popular_package,
                "similarity_ratio": distance_ratio,
            }

    return HeuristicResult.PASS, {"package_name": package_name}

0 commit comments

Comments
 (0)