Yelp · OiCMudkips · Oct 24, 2019 · Sep 24, 2019 · Sep 24, 2019 · Sep 26, 2019
diff --git a/detect_secrets/core/secrets_collection.py b/detect_secrets/core/secrets_collection.py
@@ -358,7 +358,7 @@ def _extract_secrets_from_patch(self, f, plugin, filename):
             for line in chunk.target_lines():
                 if line.is_added:
                     output.update(
-                        plugin.analyze_string(
+                        plugin.analyze_line(
                             line.value,
                             line.target_line_no,
                             filename,

diff --git a/detect_secrets/plugins/base.py b/detect_secrets/plugins/base.py
@@ -97,7 +97,7 @@ def analyze(self, file, filename):
         potential_secrets = {}
         file_lines = tuple(file.readlines())
         for line_num, line in enumerate(file_lines, start=1):
-            results = self.analyze_string(line, line_num, filename)
+            results = self.analyze_line(line, line_num, filename)
             if not self.should_verify:
                 potential_secrets.update(results)
                 continue
@@ -121,7 +121,7 @@ def analyze(self, file, filename):
 
         return potential_secrets
 
-    def analyze_string(self, string, line_num, filename):
+    def analyze_line(self, string, line_num, filename):
         """
         :param string:    string; the line to analyze
         :param line_num:  integer; line number that is currently being analyzed
@@ -163,7 +163,7 @@ def analyze_string_content(self, string, line_num, filename):
     @abstractmethod
     def secret_generator(self, string, *args, **kwargs):
         """Flags secrets in a given string, and yields the raw secret value.
-        Used in self.analyze_string for PotentialSecret creation.
+        Used in self.analyze_line for PotentialSecret creation.
 
         :type string: str
         :param string: the secret to scan
@@ -178,7 +178,7 @@ def adhoc_scan(self, string):
         check what different plugins say regarding a single line/secret. This
         supports that.
 
-        This is very similar to self.analyze_string, but allows the flexibility
+        This is very similar to self.analyze_line, but allows the flexibility
         for subclasses to add any other notable info (rather than just a
         PotentialSecret type). e.g. HighEntropyStrings adds their Shannon
         entropy in which they made their decision.
@@ -191,7 +191,7 @@ def adhoc_scan(self, string):
             <classname>: <returned-value>
         """
         # TODO: Handle multiple secrets on single line.
-        results = self.analyze_string(
+        results = self.analyze_line(
             string,
             line_num=0,
             filename='does_not_matter',

diff --git a/detect_secrets/plugins/common/filters.py b/detect_secrets/plugins/common/filters.py
@@ -3,32 +3,13 @@
 This abstraction allows for development of later ML work, or further
 heuristical determinations (e.g. word filter, entropy comparator).
 """
+import re
 import string
 
 from detect_secrets.util import is_python_2
 
 
-def is_false_positive(secret, automaton):
-    """
-    :type secret: str
-
-    :type automaton: ahocorasick.Automaton|None
-    :param automaton: optional automaton for ignoring certain words.
-
-    :rtype: bool
-    Returns True if any false positive heuristic function returns True.
-    """
-    return any(
-        func(secret, automaton)
-        for func in
-        (
-            _is_found_with_aho_corasick,
-            _is_sequential_string,
-        )
-    )
-
-
-def _is_found_with_aho_corasick(secret, automaton):
+def is_found_with_aho_corasick(secret, automaton):
     """
     :type secret: str
 
@@ -53,7 +34,7 @@ def _is_found_with_aho_corasick(secret, automaton):
         return False
 
 
-def _is_sequential_string(secret, *args):
+def is_sequential_string(secret, *args):
     """
     :type secret: str
 
@@ -97,3 +78,100 @@ def _is_sequential_string(secret, *args):
             return True
 
     return False
+
+
+# This only finds UUIDs which only have lowercase characters.
+_UUID_REGEX = re.compile(r'[a-f0-9]{8}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{12}')
+
+
+def is_potential_uuid(secret, *args):
+    """
+    Checks whether a UUID appears anywhere inside a potential secret.
+
+    :type secret: str
+
+    :rtype: bool
+    Returns True when the lowercased secret contains a UUID, False if not.
+    """
+    # A regex search is deliberately used instead of strictly parsing the
+    # whole input as a UUID: it also catches UUIDs that carry a prefix or
+    # suffix, so more false positives get filtered out.
+    lowered = secret.lower()
+    uuid_match = _UUID_REGEX.search(lowered)
+    return uuid_match is not None
+
+
+DEFAULT_FALSE_POSITIVE_HEURISTICS = [
+    is_found_with_aho_corasick,
+    is_sequential_string,
+]
+
+
+# NOTE: this doesn't handle multiple key-values on a line properly.
+# NOTE: words that end in "id" will be treated as ids
+_ID_DETECTOR_REGEX = re.compile(r'[iI][dD][^A-Za-z0-9]')
+
+
+def is_likely_id_string(secret, line):
+    """Determines if the text preceding the secret on its line marks an id.
+    :type secret: str
+
+    :type line: str
+    :param line: Line context for the plaintext secret
+
+    :rtype: bool
+    Returns True if the secret could be an id, False otherwise.
+    """
+    secret_index = line.find(secret)
+    if secret_index == -1:
+        return False
+    # Only the text before the secret is searched; always returns a bool.
+    return _ID_DETECTOR_REGEX.search(line, 0, secret_index) is not None
+
+
+DEFAULT_FALSE_POSITIVE_WITH_LINE_CONTEXT_HEURISTICS = [
+    is_likely_id_string,
+]
+
+
+def is_false_positive(secret, automaton, functions=DEFAULT_FALSE_POSITIVE_HEURISTICS):
+    """
+    :type secret: str
+
+    :type automaton: ahocorasick.Automaton|None
+    :param automaton: optional automaton for ignoring certain words.
+
+    :type functions: Iterable[Callable]
+    :param functions: heuristics called as func(secret, automaton), in order
+
+    :rtype: bool
+    Returns True as soon as one heuristic flags the secret, False otherwise.
+    """
+    for heuristic in functions:
+        if heuristic(secret, automaton):
+            return True
+    return False
+
+
+def is_false_positive_with_line_context(
+    secret,
+    line,
+    functions=DEFAULT_FALSE_POSITIVE_WITH_LINE_CONTEXT_HEURISTICS,
+):
+    """
+    :type secret: str
+
+    :type line: str
+    :param line: the plaintext line in which the secret was found
+
+    :type functions: Iterable[Callable]
+    :param functions: heuristics called as func(secret, line), in order
+
+    :rtype: bool
+    Returns True as soon as one line-aware heuristic reports the secret
+    as a false positive, False if none of them do.
+    """
+    for heuristic in functions:
+        if heuristic(secret, line):
+            return True
+    return False
diff --git a/detect_secrets/plugins/common/ini_file_parser.py b/detect_secrets/plugins/common/ini_file_parser.py
@@ -77,7 +77,7 @@ def iterator(self):
                     key,
                     values,
                 ):
-                    yield value, offset
+                    yield key, value, offset
 
     def _get_value_and_line_offset(self, key, values):
         """Returns the index of the location of key, value pair in lines.

diff --git a/detect_secrets/plugins/common/yaml_file_parser.py b/detect_secrets/plugins/common/yaml_file_parser.py
@@ -100,6 +100,11 @@ def _tag_dict_values(self, map_node):
                         value=str(value.tag.endswith(':binary')),
                         tag='tag:yaml.org,2002:bool',
                     ),
+                    self._create_key_value_pair_for_mapping_node_value(
+                        key='__original_key__',
+                        value=key.value,
+                        tag='tag:yaml.org,2002:str',
+                    ),
                 ],
             )
 

diff --git a/detect_secrets/plugins/high_entropy_strings.py b/detect_secrets/plugins/high_entropy_strings.py
@@ -19,6 +19,9 @@
 from .common.filetype import determine_file_type
 from .common.filetype import FileType
 from .common.filters import is_false_positive
+from .common.filters import is_false_positive_with_line_context
+from .common.filters import is_potential_uuid
+from .common.filters import DEFAULT_FALSE_POSITIVE_HEURISTICS
 from .common.ini_file_parser import IniFileParser
 from .common.yaml_file_parser import YamlFileParser
 from detect_secrets.core.potential_secret import PotentialSecret
@@ -83,14 +86,40 @@ def calculate_shannon_entropy(self, data):
 
         return entropy
 
+    @staticmethod
+    def _filter_false_positives_with_line_ctx(potential_secrets, line):
+        """Drops secrets flagged as false positives given their line."""
+        kept = {}
+        for secret, entry in potential_secrets.items():
+            if is_false_positive_with_line_context(secret.secret_value, line):
+                continue
+            kept[secret] = entry
+        return kept
+
+    def analyze_line(self, string, line_num, filename):
+        """Analyzes the line via the parent class, then removes any results
+        that the line-context heuristics consider false positives.
+        """
+        unfiltered = super(HighEntropyStringsPlugin, self).analyze_line(
+            string,
+            line_num,
+            filename,
+        )
+
+        return self._filter_false_positives_with_line_ctx(unfiltered, string)
+
     def analyze_string_content(self, string, line_num, filename):
         """Searches string for custom pattern, and captures all high entropy strings that
         match self.regex, with a limit defined as self.entropy_limit.
         """
         output = {}
 
         for result in self.secret_generator(string):
-            if is_false_positive(result, self.automaton):
+            # py2+py3 compatible way of copying a list
+            functions = list(DEFAULT_FALSE_POSITIVE_HEURISTICS)
+            functions.append(is_potential_uuid)
+
+            if is_false_positive(result, self.automaton, functions=functions):
                 continue
 
             secret = PotentialSecret(self.secret_type, filename, result, line_num)
@@ -114,7 +143,7 @@ def adhoc_scan(self, string):
         # Since it's an individual string, it's just bad UX to require quotes
         # around the expected secret.
         with self.non_quoted_string_regex():
-            results = self.analyze_string(
+            results = self.analyze_line(
                 string,
                 line_num=0,
                 filename='does_not_matter',
@@ -152,23 +181,27 @@ def _analyze_ini_file(self, add_header=False):
         :returns: same format as super().analyze()
         """
         def wrapped(file, filename):
-            potential_secrets = {}
+            output = {}
 
             with self.non_quoted_string_regex():
-                for value, lineno in IniFileParser(
+                for key, value, lineno in IniFileParser(
                     file,
                     add_header,
                     exclude_lines_regex=self.exclude_lines_regex,
                 ).iterator():
-                    potential_secrets.update(
-                        self.analyze_string(
-                            value,
-                            lineno,
-                            filename,
-                        ),
+                    potential_secrets = self.analyze_string_content(
+                        value,
+                        lineno,
+                        filename,
+                    )
+                    line = u'{key}={value}'.format(key=key, value=value)
+                    potential_secrets = self._filter_false_positives_with_line_ctx(
+                        potential_secrets,
+                        line,
                     )
+                    output.update(potential_secrets)
 
-            return potential_secrets
+            return output
 
         return wrapped
 
@@ -217,7 +250,7 @@ def _analyze_yaml_file(self, file, filename):
                     else item['__value__']
                 )
 
-                secrets = self.analyze_string(
+                secrets = self.analyze_string_content(
                     string_to_scan,
                     item['__line__'],
                     filename,
@@ -226,6 +259,15 @@ def _analyze_yaml_file(self, file, filename):
                 if item['__is_binary__']:
                     secrets = self._encode_yaml_binary_secrets(secrets)
 
+                dumped_key_value = yaml.dump({
+                    item['__original_key__']: item['__value__'],
+                }).replace('\n', '')
+
+                secrets = self._filter_false_positives_with_line_ctx(
+                    secrets,
+                    dumped_key_value,
+                )
+
                 potential_secrets.update(secrets)
 
         return potential_secrets
@@ -339,8 +381,15 @@ class Base64HighEntropyString(HighEntropyStringsPlugin):
     secret_type = 'Base64 High Entropy String'
 
     def __init__(self, base64_limit, exclude_lines_regex=None, automaton=None, **kwargs):
+        charset = (
+            string.ascii_letters
+            + string.digits
+            + '+/'  # Regular base64
+            + '\\-_'  # Url-safe base64
+            + '='  # Padding
+        )
         super(Base64HighEntropyString, self).__init__(
-            charset=string.ascii_letters + string.digits + '+/=',
+            charset=charset,
             limit=base64_limit,
             exclude_lines_regex=exclude_lines_regex,
             automaton=automaton,

diff --git a/test_data/config.ini b/test_data/config.ini
@@ -27,3 +27,7 @@ password = 12345678901234  # pragma: allowlist secret
 
 # unicode
 foo=bår
+
+[key with id in name]
+real_secret_which_isnt_an_i_d = vh987tyw9ehy8ghis7vwyhiwbwitefy7w3ASDGYDGUASDG
+foreign_key_id = vh987tyw9ehy8ghis7vwyhiwbwitefy7w3ASDGYDGUASDG
diff --git a/test_data/config.yaml b/test_data/config.yaml
@@ -1,5 +1,5 @@
 credentials:
-    some_value_here: not_a_secret
+    some_value_here: not_secret
     other_value_here: 1234567890a
     CanonicalUserGetSkippedByExcludeLines: 1234567890ab
     nested:
@@ -11,5 +11,9 @@ list_of_keys:
     - 234567890a
 
 test_agent::allowlisted_api_key: 'ToCynx5Se4e2PtoZxEhW7lUJcOX15c54'  # pragma: allowlist secret
-
 high_entropy_binary_secret: !!binary MjNjcnh1IDJieXJpdXYyeXJpaTJidnl1MnI4OXkyb3UwMg==
+
+# this should be ignored as a potential id
+allowlisted_id: 'ToCynx5Se4e2PtoZxEhW7lUJcOX15c54'
+
+uuid_should_be_ignored: '203db13e-70c7-462b-9a3d-bf32640cb0be'
diff --git a/tests/plugins/artifactory_test.py b/tests/plugins/artifactory_test.py
@@ -37,8 +37,8 @@ class TestArtifactoryDetector(object):
             ('artifactory:_password=AKCxxxxxxxx', False),
         ],
     )
-    def test_analyze_string(self, payload, should_flag):
+    def test_analyze_line(self, payload, should_flag):
         logic = ArtifactoryDetector()
 
-        output = logic.analyze_string(payload, 1, 'mock_filename')
+        output = logic.analyze_line(payload, 1, 'mock_filename')
         assert len(output) == int(should_flag)