-
Notifications
You must be signed in to change notification settings - Fork 505
Support url-safe base64 secrets #245
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
e10b9a3
de2cbd8
d61baab
e1fa566
0c9e97e
ab78151
3d0dc36
2cfea37
0115efd
b402f51
ece342b
488334f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,32 +3,13 @@ | |
This abstraction allows for development of later ML work, or further | ||
heuristical determinations (e.g. word filter, entropy comparator). | ||
""" | ||
import re | ||
import string | ||
|
||
from detect_secrets.util import is_python_2 | ||
|
||
|
||
def is_false_positive(secret, automaton): | ||
""" | ||
:type secret: str | ||
|
||
:type automaton: ahocorasick.Automaton|None | ||
:param automaton: optional automaton for ignoring certain words. | ||
|
||
:rtype: bool | ||
Returns True if any false positive heuristic function returns True. | ||
""" | ||
return any( | ||
func(secret, automaton) | ||
for func in | ||
( | ||
_is_found_with_aho_corasick, | ||
_is_sequential_string, | ||
) | ||
) | ||
|
||
|
||
def _is_found_with_aho_corasick(secret, automaton): | ||
def is_found_with_aho_corasick(secret, automaton): | ||
""" | ||
:type secret: str | ||
|
||
|
@@ -53,7 +34,7 @@ def _is_found_with_aho_corasick(secret, automaton): | |
return False | ||
|
||
|
||
def _is_sequential_string(secret, *args): | ||
def is_sequential_string(secret, *args): | ||
""" | ||
:type secret: str | ||
|
||
|
@@ -97,3 +78,100 @@ def _is_sequential_string(secret, *args): | |
return True | ||
|
||
return False | ||
|
||
|
||
# This pattern only matches UUIDs written with lowercase hex digits, so
# callers must lowercase the input before searching.
_UUID_REGEX = re.compile(r'[a-f0-9]{8}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{12}')


def is_potential_uuid(secret, *args):
    """
    Determines if a potential secret contains any UUIDs.

    :type secret: str

    :param args: ignored; accepted so this function can be called with the
        same ``(secret, automaton)`` arguments as the other heuristics.

    :rtype: bool
    Returns True if the string has a UUID, False otherwise.
    """
    # Using a regex to find strings that look like false-positives
    # will find us more false-positives than if we just tried to validate
    # the input string as a UUID (for example, if the string has a prefix
    # or suffix).
    #
    # `search` stops at the first match, so we avoid building a list of
    # every match only to check whether it is non-empty.
    return _UUID_REGEX.search(secret.lower()) is not None
|
||
|
||
DEFAULT_FALSE_POSITIVE_HEURISTICS = [ | ||
is_found_with_aho_corasick, | ||
is_sequential_string, | ||
] | ||
|
||
|
||
# NOTE: this doesn't handle multiple key-values on a line properly. | ||
# NOTE: words that end in "id" will be treated as ids | ||
_ID_DETECTOR_REGEX = re.compile(r'[iI][dD][^A-Za-z0-9]') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We might be able to do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess it depends on whether we want to ignore keys like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a good point, we do have a lot of python biases. |
||
|
||
|
||
def is_likely_id_string(secret, line):
    """
    Determines if the text preceding the secret on its line looks like an
    id key (e.g. ``user_id=``), which suggests the value is an identifier
    rather than a secret.

    :type secret: str

    :type line: str
    :param line: Line context for the plaintext secret

    :rtype: bool
    Returns True if the secret could be an id, False otherwise.
    """
    # A single `find` replaces the previous `in` check followed by `index`,
    # which scanned the line twice.
    secret_index = line.find(secret)
    if secret_index == -1:
        return False

    # Only the part of the line before the first occurrence of the secret is
    # searched. Returning an explicit bool (rather than the list of matches)
    # honours the documented return type.
    return _ID_DETECTOR_REGEX.search(line, 0, secret_index) is not None
|
||
|
||
DEFAULT_FALSE_POSITIVE_WITH_LINE_CONTEXT_HEURISTICS = [ | ||
is_likely_id_string, | ||
OiCMudkips marked this conversation as resolved.
Show resolved
Hide resolved
|
||
] | ||
|
||
|
||
def is_false_positive(secret, automaton, functions=DEFAULT_FALSE_POSITIVE_HEURISTICS):
    """
    Runs each heuristic against the secret, stopping at the first one that
    flags it.

    :type secret: str

    :type automaton: ahocorasick.Automaton|None
    :param automaton: optional automaton for ignoring certain words.

    :type functions: Iterable[Callable]
    :param functions: list of heuristics to use; each one is called as
        ``heuristic(secret, automaton)``.

    :rtype: bool
    Returns True if any false positive heuristic function returns True.
    """
    for heuristic in functions:
        if heuristic(secret, automaton):
            return True

    return False
|
||
|
||
def is_false_positive_with_line_context(
    secret,
    line,
    functions=DEFAULT_FALSE_POSITIVE_WITH_LINE_CONTEXT_HEURISTICS,
):
    """
    Runs each line-aware heuristic against the secret, stopping at the first
    one that flags it.

    :type secret: str

    :type line: str
    :param line: plaintext line on which secret was found

    :type functions: Iterable[Callable]
    :param functions: list of heuristics to use; each one is called as
        ``heuristic(secret, line)``.

    :rtype: bool
    Returns True if any false-positive heuristic which considers the whole file line
    returns true.
    """
    for heuristic in functions:
        if heuristic(secret, line):
            return True

    return False
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,9 @@ | |
from .common.filetype import determine_file_type | ||
from .common.filetype import FileType | ||
from .common.filters import is_false_positive | ||
from .common.filters import is_false_positive_with_line_context | ||
from .common.filters import is_potential_uuid | ||
from .common.filters import DEFAULT_FALSE_POSITIVE_HEURISTICS | ||
from .common.ini_file_parser import IniFileParser | ||
from .common.yaml_file_parser import YamlFileParser | ||
from detect_secrets.core.potential_secret import PotentialSecret | ||
|
@@ -83,14 +86,40 @@ def calculate_shannon_entropy(self, data): | |
|
||
return entropy | ||
|
||
@staticmethod
def _filter_false_positives_with_line_ctx(potential_secrets, line):
    """
    Drops every entry whose secret is flagged as a false positive once the
    surrounding line is taken into account.

    :param potential_secrets: mapping whose keys expose a ``secret_value``
        attribute.

    :type line: str
    :param line: line context passed to the line-aware heuristics.

    :rtype: dict
    Returns a new dict holding only the entries that were not flagged.
    """
    kept = {}
    for candidate, value in potential_secrets.items():
        if is_false_positive_with_line_context(candidate.secret_value, line):
            continue

        kept[candidate] = value

    return kept
|
||
def analyze_line(self, string, line_num, filename):
    """
    Analyzes a line via the parent class, then removes results that the
    line-context heuristics flag as false positives.

    The full `string` doubles as the line context for the filter.
    """
    unfiltered = super(HighEntropyStringsPlugin, self).analyze_line(
        string, line_num, filename,
    )

    return self._filter_false_positives_with_line_ctx(unfiltered, string)
|
||
def analyze_string_content(self, string, line_num, filename): | ||
"""Searches string for custom pattern, and captures all high entropy strings that | ||
match self.regex, with a limit defined as self.entropy_limit. | ||
""" | ||
output = {} | ||
|
||
for result in self.secret_generator(string): | ||
if is_false_positive(result, self.automaton): | ||
# py2+py3 compatible way of copying a list | ||
functions = list(DEFAULT_FALSE_POSITIVE_HEURISTICS) | ||
functions.append(is_potential_uuid) | ||
|
||
if is_false_positive(result, self.automaton, functions=functions): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What are your thoughts on passing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was actually thinking about moving i.e. in code
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That sounds great to me 🎈 I'm only unsure of the |
||
continue | ||
|
||
secret = PotentialSecret(self.secret_type, filename, result, line_num) | ||
|
@@ -114,7 +143,7 @@ def adhoc_scan(self, string): | |
# Since it's an individual string, it's just bad UX to require quotes | ||
# around the expected secret. | ||
with self.non_quoted_string_regex(): | ||
results = self.analyze_string( | ||
results = self.analyze_line( | ||
string, | ||
line_num=0, | ||
filename='does_not_matter', | ||
|
@@ -152,23 +181,27 @@ def _analyze_ini_file(self, add_header=False): | |
:returns: same format as super().analyze() | ||
""" | ||
def wrapped(file, filename): | ||
potential_secrets = {} | ||
output = {} | ||
|
||
with self.non_quoted_string_regex(): | ||
for value, lineno in IniFileParser( | ||
for key, value, lineno in IniFileParser( | ||
file, | ||
add_header, | ||
exclude_lines_regex=self.exclude_lines_regex, | ||
).iterator(): | ||
potential_secrets.update( | ||
self.analyze_string( | ||
value, | ||
lineno, | ||
filename, | ||
), | ||
potential_secrets = self.analyze_string_content( | ||
value, | ||
lineno, | ||
filename, | ||
) | ||
line = u'{key}={value}'.format(key=key, value=value) | ||
potential_secrets = self._filter_false_positives_with_line_ctx( | ||
potential_secrets, | ||
line, | ||
) | ||
output.update(potential_secrets) | ||
|
||
return potential_secrets | ||
return output | ||
|
||
return wrapped | ||
|
||
|
@@ -217,7 +250,7 @@ def _analyze_yaml_file(self, file, filename): | |
else item['__value__'] | ||
) | ||
|
||
secrets = self.analyze_string( | ||
secrets = self.analyze_string_content( | ||
string_to_scan, | ||
item['__line__'], | ||
filename, | ||
|
@@ -226,6 +259,15 @@ def _analyze_yaml_file(self, file, filename): | |
if item['__is_binary__']: | ||
secrets = self._encode_yaml_binary_secrets(secrets) | ||
|
||
dumped_key_value = yaml.dump({ | ||
item['__original_key__']: item['__value__'], | ||
}).replace('\n', '') | ||
|
||
secrets = self._filter_false_positives_with_line_ctx( | ||
secrets, | ||
dumped_key_value, | ||
) | ||
|
||
potential_secrets.update(secrets) | ||
|
||
return potential_secrets | ||
|
@@ -339,8 +381,15 @@ class Base64HighEntropyString(HighEntropyStringsPlugin): | |
secret_type = 'Base64 High Entropy String' | ||
|
||
def __init__(self, base64_limit, exclude_lines_regex=None, automaton=None, **kwargs): | ||
charset = ( | ||
string.ascii_letters | ||
+ string.digits | ||
+ '+/' # Regular base64 | ||
+ '\\-_' # Url-safe base64 | ||
+ '=' # Padding | ||
) | ||
super(Base64HighEntropyString, self).__init__( | ||
charset=string.ascii_letters + string.digits + '+/=', | ||
charset=charset, | ||
limit=base64_limit, | ||
exclude_lines_regex=exclude_lines_regex, | ||
automaton=automaton, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
credentials: | ||
some_value_here: not_a_secret | ||
some_value_here: not_secret | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Was this necessary b/c the entropy calculation with the new chars alerted on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I don't think we need to be too concerned though because we now have the wordlist filtering. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm more concerned that we'll have large diffs in baselines when people update detect-secrets. This isn't as concerning as changing a secret type like we did in #26, (where all old secrets were removed and re-added), but it is a little, especially if it reduces TPs to some extent. (We'll see what the data says though, I can't really say how it'll affect signal.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why will we have large diffs? A lot of new secrets? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If all the e.g. |
||
other_value_here: 1234567890a | ||
CanonicalUserGetSkippedByExcludeLines: 1234567890ab | ||
nested: | ||
|
@@ -11,5 +11,9 @@ list_of_keys: | |
- 234567890a | ||
|
||
test_agent::allowlisted_api_key: 'ToCynx5Se4e2PtoZxEhW7lUJcOX15c54' # pragma: allowlist secret | ||
|
||
high_entropy_binary_secret: !!binary MjNjcnh1IDJieXJpdXYyeXJpaTJidnl1MnI4OXkyb3UwMg== | ||
|
||
# this should be ignored as a potential id | ||
allowlisted_id: 'ToCynx5Se4e2PtoZxEhW7lUJcOX15c54' | ||
|
||
uuid_should_be_ignored: '203db13e-70c7-462b-9a3d-bf32640cb0be' |
Uh oh!
There was an error while loading. Please reload this page.