Skip to content

Commit 11b8768

Browse files
authored
Merge pull request #50 from LouisTrezzini/master
YAML files don't support inline whitelisting
2 parents 867cbaf + 8842526 commit 11b8768

File tree

8 files changed

+322
-266
lines changed

8 files changed

+322
-266
lines changed

detect_secrets/plugins/base.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ def __init__(self, **kwargs):
1212
if not self.secret_type:
1313
raise ValueError('Plugins need to declare a secret_type.')
1414

15-
def analyze(self, file, filename): # pragma: no cover
15+
def analyze(self, file, filename):
1616
"""
1717
:param file: The File object itself.
1818
:param filename: string; filename of File object, used for creating
@@ -29,7 +29,7 @@ def analyze(self, file, filename): # pragma: no cover
2929
return potential_secrets
3030

3131
@abstractmethod
32-
def analyze_string(self, string, line_num, filename): # pragma: no cover
32+
def analyze_string(self, string, line_num, filename):
3333
"""
3434
:param string: string; the line to analyze
3535
:param line_num: integer; line number that is currently being analyzed
@@ -38,17 +38,17 @@ def analyze_string(self, string, line_num, filename): # pragma: no cover
3838
3939
NOTE: line_num and filename are used for PotentialSecret creation only.
4040
"""
41-
pass
41+
raise NotImplementedError
4242

4343
@abstractmethod
44-
def secret_generator(self, string): # pragma: no cover
44+
def secret_generator(self, string):
4545
"""Flags secrets in a given string, and yields the raw secret value.
4646
Used in self.analyze_string for PotentialSecret creation.
4747
4848
:type string: str
4949
:param string: the secret to scan
5050
"""
51-
pass
51+
raise NotImplementedError
5252

5353
@property
5454
def __dict__(self):
+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
import re
2+
3+
# TODO: Update for not just python comments?
4+
# Matches an inline whitelist marker such as `# pragma: whitelist secret`.
# The space after `#` and the space after `pragma:` are each optional, and
# `whitelist` may be joined to `secret` by either a space or a hyphen.
WHITELIST_REGEX = re.compile(r'# ?pragma: ?whitelist[ -]secret')
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
import configparser
2+
import re
3+
4+
5+
class IniFileParser(object):
    """Pair every value of an ini-style config file with its line number.

    ``configparser`` performs the actual parsing, but it does not expose
    line numbers. To recover them, the stripped lines of the file are kept
    in ``self.lines`` and consumed incrementally as each key is located,
    while ``self.line_offset`` counts how many lines were already consumed.
    """

    def __init__(self, file):
        """
        :type file: file object
        :param file: open, seekable text file. It is read twice: once by
            configparser, and once more to keep a copy of its lines.
        """
        self.parser = configparser.ConfigParser()
        # Use option names verbatim, so that they can be matched against
        # the text of the file's lines later on.
        self.parser.optionxform = str
        self.parser.read_file(file)

        # Hacky way to keep track of line location
        file.seek(0)
        self.lines = [line.strip() for line in file.readlines()]
        self.line_offset = 0

    def iterator(self):
        """Yields (value, line_number) tuples for every value in the file.

        :rtype: generator(tuple(str, int))
        :raises: configparser.Error, if the file has no explicit section.
        """
        if not self.parser.sections():
            # To prevent cases where it's not an ini file, but the parser
            # helpfully attempts to parse everything to a DEFAULT section,
            # when not explicitly provided.
            raise configparser.Error

        # NOTE(review): items(section) also returns the keys inherited from
        # the default section, so those keys look like they are visited once
        # per section -- confirm whether that is intended.
        for section_name, _ in self.parser.items():
            # raw=True is required: values are matched against the literal
            # text of the file, so they must not be interpolated. With
            # interpolation, a value containing a bare `%` raises an
            # InterpolationSyntaxError, and an expanded `%(name)s` reference
            # never matches its own line (which discards the remaining
            # lines, and with them the line numbers of all later keys).
            for key, values in self.parser.items(section_name, raw=True):
                for value, offset in self._get_value_and_line_offset(
                    key,
                    values,
                ):
                    yield value, offset

    def _get_value_and_line_offset(self, key, values):
        """Returns every value of `key`, with the line number it is on.

        This consumes self.lines: once all values are found, the lines up
        to (but excluding) the next meaningful line are dropped, and
        self.line_offset is advanced accordingly. If the end of the lines
        is reached instead, self.lines is emptied.

        :type key: str
        :param key: key, in config file.

        :type values: str
        :param values: values for key, in config file. This is plural,
            because you can have multiple values per key. Eg.

            >>> key =
            ...     value1
            ...     value2

        :rtype: list(tuple)
        :returns: (value, 1-indexed line number) pairs
        """
        values_list = self._construct_values_list(values)
        if not values_list:
            return []

        current_value_list_index = 0
        output = []
        lines_modified = False

        # The first value shares its line with the key.
        first_line_regex = re.compile(r'^\s*{}[ :=]+{}'.format(
            re.escape(key),
            re.escape(values_list[current_value_list_index]),
        ))
        comment_regex = re.compile(r'\s*[;#]')
        for index, line in enumerate(self.lines):
            if current_value_list_index == 0:
                # Still looking for the `key = value` line itself.
                if first_line_regex.match(line):
                    output.append((
                        values_list[current_value_list_index],
                        self.line_offset + index + 1,
                    ))

                    current_value_list_index += 1

                continue

            # Check ignored lines before checking values, because
            # you can write comments *after* the value.

            # Ignore blank lines
            if not line.strip():
                continue

            # Ignore comments
            if comment_regex.match(line):
                continue

            if current_value_list_index == len(values_list):
                # All values are accounted for, and this line belongs to
                # whatever comes next: drop everything before it.
                if index == 0:
                    index = 1  # don't want to count the same line again

                self.line_offset += index
                self.lines = self.lines[index:]
                lines_modified = True

                break
            else:
                # Continuation line, holding the next value of this key.
                output.append((
                    values_list[current_value_list_index],
                    self.line_offset + index + 1,
                ))

                current_value_list_index += 1

        if not lines_modified:
            # No more lines left, if loop was not explicitly left.
            self.lines = []

        return output

    @staticmethod
    def _construct_values_list(values):
        """
        This values_list is a strange construction, because of ini format.
        We need to extract the values with the following supported format:

            >>> key = value0
            ...     value1
            ...
            ...     # comment line here
            ...     value2

        given that normally, either value0 is supplied, or (value1, value2),
        but still allowing for all three at once.

        Furthermore, with the configparser, we will get a list of values,
        and intermediate blank lines, but no comments. This means that we
        can't merely use the count of values' items to heuristically "skip
        ahead" lines, because we still have to manually parse through this.

        Therefore, we construct the values_list in the following fashion:
            1. Keep the first value (in the example, this is `value0`),
               even when it is empty.
            2. For all other values, ignore blank lines.
        Then, we can parse through, and look for values only.

        :type values: str
        :rtype: list(str)
        """
        values_list = values.splitlines()
        return values_list[:1] + [value for value in values_list[1:] if value]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import yaml
2+
3+
from detect_secrets.plugins.core.constants import WHITELIST_REGEX
4+
5+
6+
class YamlFileParser(object):
    """
    Yaml values do not have to be quoted, so the quoted-string regex used
    for HighEntropyStrings cannot find them; dropping the quoting
    requirement instead would raise the false positive rate, since any long
    string would have high entropy.

    So, rather than using a regex, the parsing of the yaml file is
    intercepted, to identify string values. This assumes:

        1. Secrets are strings
        2. Secrets are not keys

    The entropy of those string values can then be calculated.

    To remember which line a value came from, every string value of a
    mapping is replaced with a dictionary of meta-tags:

        >>> {
            'key': {
                '__value__': value,
                '__line__': <line_number>,
            }
        }

    so that the line number can be looked up quickly, when auditing at a
    later stage.

    This parsing method is inspired by https://stackoverflow.com/a/13319530.
    """

    def __init__(self, file):
        """
        :type file: file object
        :param file: open yaml file; it is read in full.
        """
        self.content = file.read()

        loader = yaml.SafeLoader(self.content)
        # Hook into node composition, to record line numbers.
        loader.compose_node = self._compose_node_shim
        self.loader = loader

    def json(self):
        """Returns the single document that the shimmed loader constructs."""
        return self.loader.get_single_data()

    def _compose_node_shim(self, parent, index):
        """Composes a node as usual, and remembers the line it started on."""
        # This has to be read *before* the node is composed.
        starting_line = self.loader.line

        node = yaml.composer.Composer.compose_node(self.loader, parent, index)
        node.__line__ = starting_line + 1

        # TODO: Not sure if need to do :seq
        if not node.tag.endswith(':map'):
            return node

        return self._tag_dict_values(node)

    def _tag_dict_values(self, map_node):
        """Wraps every string value of a mapping with its meta-tags.

        :type map_node: yaml.nodes.MappingNode
        :param map_node: It looks like map_node.value contains a list of
            pair tuples, corresponding to key,value pairs.

        :rtype: yaml.nodes.MappingNode
        """
        make_pair = self._create_key_value_pair_for_mapping_node_value

        tagged_pairs = []
        for key, value in map_node.value:
            if value.tag.endswith(':str'):
                replacement = yaml.nodes.MappingNode(
                    tag=map_node.tag,
                    value=[
                        make_pair(
                            '__value__',
                            value.value,
                            'tag:yaml.org,2002:str',
                        ),
                        make_pair(
                            '__line__',
                            str(value.__line__),
                            'tag:yaml.org,2002:int',
                        ),
                    ],
                )
            else:
                # Only string values are augmented.
                replacement = value

            tagged_pairs.append((key, replacement))

        return yaml.nodes.MappingNode(
            tag=map_node.tag,
            value=tagged_pairs,
            start_mark=map_node.start_mark,
            end_mark=map_node.end_mark,
            flow_style=map_node.flow_style,
        )

    @staticmethod
    def _create_key_value_pair_for_mapping_node_value(key, value, tag):
        """Returns a (key node, value node) pair, for a MappingNode's value.

        :type key: str
        :type value: str
        :type tag: str
        :param tag: yaml tag for the value node; the key node is always
            tagged as a string.
        """
        key_node = yaml.nodes.ScalarNode(
            tag='tag:yaml.org,2002:str',
            value=key,
        )
        value_node = yaml.nodes.ScalarNode(
            tag=tag,
            value=value,
        )
        return (key_node, value_node)

    def get_ignored_lines(self):
        """
        Return a set of integers that refer to line numbers that were
        whitelisted by the user and should be ignored.

        We need to parse the file separately from PyYAML parsing because
        the parser drops the comments (at least up to version 3.13):
        https://github.com/yaml/pyyaml/blob/a2d481b8dbd2b352cb001f07091ccf669227290f/lib3/yaml/scanner.py#L749

        :return: set
        """
        numbered_lines = enumerate(self.content.split('\n'), 1)
        return {
            line_number
            for line_number, line in numbered_lines
            if WHITELIST_REGEX.search(line)
        }

0 commit comments

Comments
 (0)