From ff78c557f173fff21bd236e510f023cd979269fe Mon Sep 17 00:00:00 2001 From: Ross Wolf <31489089+rw-access@users.noreply.github.com> Date: Tue, 15 Sep 2020 15:46:47 -0600 Subject: [PATCH 1/7] Add EQL rules and schema validation --- detection_rules/beats.py | 67 +++++++++++++-------- detection_rules/ecs.py | 30 +++++++++ detection_rules/rule.py | 47 ++++++++++++++- detection_rules/schemas/__init__.py | 8 ++- detection_rules/schemas/v7_10.py | 36 +++++++++++ detection_rules/schemas/{v78.py => v7_8.py} | 0 detection_rules/schemas/{v79.py => v7_9.py} | 2 +- tests/test_schemas.py | 34 +++++++++++ 8 files changed, 193 insertions(+), 31 deletions(-) create mode 100644 detection_rules/schemas/v7_10.py rename detection_rules/schemas/{v78.py => v7_8.py} (100%) rename detection_rules/schemas/{v79.py => v7_9.py} (99%) diff --git a/detection_rules/beats.py b/detection_rules/beats.py index 23f3c1a45b7..57f5c3b3de5 100644 --- a/detection_rules/beats.py +++ b/detection_rules/beats.py @@ -6,11 +6,12 @@ import os import kql +import eql import requests import yaml from .semver import Version -from .utils import unzip, load_etc_dump, save_etc_dump, get_etc_path +from .utils import unzip, load_etc_dump, save_etc_dump, get_etc_path, cached def download_latest_beats_schema(): @@ -129,34 +130,16 @@ def get_beats_sub_schema(schema: dict, beat: str, module: str, *datasets: str): return {field["name"]: field for field in sorted(flattened, key=lambda f: f["name"])} -SCHEMA = None - - +@cached def read_beats_schema(): - global SCHEMA - - if SCHEMA is None: - beats_schemas = os.listdir(get_etc_path("beats_schemas")) - latest = max(beats_schemas, key=lambda b: Version(b.lstrip("v"))) + beats_schemas = os.listdir(get_etc_path("beats_schemas")) + latest = max(beats_schemas, key=lambda b: Version(b.lstrip("v"))) - SCHEMA = load_etc_dump("beats_schemas", latest) + return load_etc_dump("beats_schemas", latest) - return SCHEMA - -def get_schema_for_query(tree: kql.ast, beats: list) -> dict: +def get_schema_from_datasets(beats, modules, datasets): filtered = {} - modules = set() - datasets = set() - - # extract out event.module and event.dataset from the query's AST - for node in tree: - if isinstance(node, kql.ast.FieldComparison) and node.field == kql.ast.Field("event.module"): - modules.update(child.value for child in node.value if isinstance(child, kql.ast.String)) - - if isinstance(node, kql.ast.FieldComparison) and node.field == kql.ast.Field("event.dataset"): - datasets.update(child.value for child in node.value if isinstance(child, kql.ast.String)) - beats_schema = read_beats_schema() # infer the module if only a dataset are defined @@ -173,3 +156,39 @@ def get_schema_for_query(tree: kql.ast, beats: list) -> dict: filtered.update(get_beats_sub_schema(beats_schema, beat, module, *datasets)) return filtered + + +def get_schema_from_eql(tree: eql.ast.BaseNode, beats: list) -> dict: + modules = set() + datasets = set() + + # extract out event.module and event.dataset from the query's AST + for node in tree: + if isinstance(node, eql.ast.Comparison) and node.comparator == node.EQ and \ + isinstance(node.right, eql.ast.String): + if node.left == eql.ast.Field("event", ["module"]): + modules.add(node.right.render()) + elif node.left == eql.ast.Field("event", ["dataset"]): + datasets.add(node.right.render()) + elif isinstance(node, eql.ast.InSet): + if node.expression == eql.ast.Field("event", ["module"]): + modules.add(node.get_literals()) + elif node.expression == eql.ast.Field("event", ["dataset"]): + datasets.add(node.get_literals()) + + return get_schema_from_datasets(beats, modules, datasets) + + +def get_schema_from_kql(tree: kql.ast.BaseNode, beats: list) -> dict: + modules = set() + datasets = set() + + # extract out event.module and event.dataset from the query's AST + for node in tree: + if isinstance(node, kql.ast.FieldComparison) and node.field == kql.ast.Field("event.module"): + modules.update(child.value for child in node.value if isinstance(child, kql.ast.String)) + + if isinstance(node, kql.ast.FieldComparison) and node.field == kql.ast.Field("event.dataset"): + datasets.update(child.value for child in node.value if isinstance(child, kql.ast.String)) + + return get_schema_from_datasets(beats, modules, datasets) diff --git a/detection_rules/ecs.py b/detection_rules/ecs.py index 2bb8c7d1621..f3eaaefffe8 100644 --- a/detection_rules/ecs.py +++ b/detection_rules/ecs.py @@ -10,6 +10,8 @@ import json import requests +import eql +import eql.types import yaml from .semver import Version @@ -164,6 +166,34 @@ def flatten_multi_fields(schema): return converted +class KqlSchema2Eql(eql.Schema): + type_mapping = { + "keyword": eql.types.TypeHint.String, + "ip": eql.types.TypeHint.String, + "float": eql.types.TypeHint.Numeric, + "double": eql.types.TypeHint.Numeric, + "long": eql.types.TypeHint.Numeric, + "short": eql.types.TypeHint.Numeric, + } + + def __init__(self, kql_schema): + self.kql_schema = kql_schema + eql.Schema.__init__(self, {}, allow_any=True, allow_generic=False, allow_missing=False) + + def validate_event_type(self, event_type): + # allow all event types to fill in X: + # `X` where .... + return True + + def get_event_type_hint(self, event_type, path): + dotted = ".".join(path) + elasticsearch_type = self.kql_schema.get(dotted) + eql_hint = self.type_mapping.get(elasticsearch_type) + + if eql_hint is not None: + return eql_hint, None + + @cached def get_kql_schema(version=None, indexes=None, beat_schema=None): """Get schema for KQL.""" diff --git a/detection_rules/rule.py b/detection_rules/rule.py index 43f53acad8a..818437c604d 100644 --- a/detection_rules/rule.py +++ b/detection_rules/rule.py @@ -10,6 +10,7 @@ import click import kql +import eql from . import ecs, beats from .attack import TACTICS, build_threat_map_entry, technique_lookup @@ -152,10 +153,50 @@ def validate(self, as_rule=False, versioned=False, query=True): schema_cls.validate(contents, role=self.type) - if query and self.query and self.contents['language'] == 'kuery': + if query and self.query is not None: ecs_versions = self.metadata.get('ecs_version') indexes = self.contents.get("index", []) - self._validate_kql(ecs_versions, indexes, self.query, self.name) + + if self.contents['language'] == 'kuery': + self._validate_kql(ecs_versions, indexes, self.query, self.name) + + if self.contents['language'] == 'eql': + self._validate_eql(ecs_versions, indexes, self.query, self.name) + + @staticmethod + @cached + def _validate_eql(ecs_versions, indexes, query, name): + # validate against all specified schemas or the latest if none specified + parsed = eql.parse_query(query) + beat_types = [index.split("-")[0] for index in indexes if "beat-*" in index] + beat_schema = beats.get_schema_from_kql(parsed, beat_types) if beat_types else None + + ecs_versions = ecs_versions or [ecs_versions] + schemas = [] + + for version in ecs_versions: + try: + schemas.append(ecs.get_kql_schema(indexes=indexes, beat_schema=beat_schema, version=version)) + except KeyError: + raise KeyError('Unknown ecs schema version: {} in rule {}.\n' + 'Do you need to update schemas?'.format(version, name)) from None + + for schema in schemas: + try: + with ecs.KqlSchema2Eql(schema): + eql.parse_query(query) + + except eql.EqlTypeMismatchError: + raise + + except eql.EqlParseError as exc: + message = exc.error_msg + trailer = None + if "Unknown field" in message and beat_types: + trailer = "\nTry adding event.module and event.dataset to specify beats module" + + raise type(exc)(exc.error_msg, exc.line, exc.column, exc.source, + len(exc.caret.lstrip()), trailer=trailer) from None @staticmethod @cached @@ -163,7 +204,7 @@ def _validate_kql(ecs_versions, indexes, query, name): # validate against all specified schemas or the latest if none specified parsed = kql.parse(query) beat_types = [index.split("-")[0] for index in indexes if "beat-*" in index] - beat_schema = beats.get_schema_for_query(parsed, beat_types) if beat_types else None + beat_schema = beats.get_schema_from_kql(parsed, beat_types) if beat_types else None if not ecs_versions: kql.parse(query, schema=ecs.get_kql_schema(indexes=indexes, beat_schema=beat_schema)) diff --git a/detection_rules/schemas/__init__.py b/detection_rules/schemas/__init__.py index 0d74dfa50f4..f402d351252 100644 --- a/detection_rules/schemas/__init__.py +++ b/detection_rules/schemas/__init__.py @@ -7,8 +7,9 @@ from ..semver import Version # import all of the schema versions -from .v78 import ApiSchema78 -from .v79 import ApiSchema79 +from .v7_8 import ApiSchema78 +from .v7_9 import ApiSchema79 +from .v7_10 import ApiSchema710 __all__ = ( "all_schemas", @@ -21,9 +22,10 @@ all_schemas = [ ApiSchema78, ApiSchema79, + ApiSchema710 ] -CurrentSchema = max(all_schemas, key=lambda cls: Version(cls.STACK_VERSION)) +CurrentSchema = all_schemas[-1] def downgrade(api_contents: dict, target_version: str): diff --git a/detection_rules/schemas/v7_10.py b/detection_rules/schemas/v7_10.py new file mode 100644 index 00000000000..6ab0b14bddc --- /dev/null +++ b/detection_rules/schemas/v7_10.py @@ -0,0 +1,36 @@ +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License; +# you may not use this file except in compliance with the Elastic License. + +"""Definitions for rule metadata and schemas.""" + +import jsl +from .v7_9 import ApiSchema79 + + +# rule types +EQL = "eql" + + +class ApiSchema710(ApiSchema79): + """Schema for siem rule in API format.""" + + STACK_VERSION = "7.10" + RULE_TYPES = ApiSchema79.RULE_TYPES + [EQL] + + type = jsl.StringField(enum=RULE_TYPES, required=True) + + # there might be a bug in jsl that requires us to redefine these here + query_scope = ApiSchema79.query_scope + saved_id_scope = ApiSchema79.saved_id_scope + ml_scope = ApiSchema79.ml_scope + threshold_scope = ApiSchema79.threshold_scope + + with jsl.Scope(EQL) as eql_scope: + eql_scope.index = jsl.ArrayField(jsl.StringField(), required=False) + eql_scope.query = jsl.StringField(required=True) + eql_scope.language = jsl.StringField(enum=[EQL], required=True) + eql_scope.type = jsl.StringField(enum=[EQL], required=True) + + with jsl.Scope(jsl.DEFAULT_ROLE) as default_scope: + default_scope.type = type diff --git a/detection_rules/schemas/v78.py b/detection_rules/schemas/v7_8.py similarity index 100% rename from detection_rules/schemas/v78.py rename to detection_rules/schemas/v7_8.py diff --git a/detection_rules/schemas/v79.py b/detection_rules/schemas/v7_9.py similarity index 99% rename from detection_rules/schemas/v79.py rename to detection_rules/schemas/v7_9.py index 0dc8c97fe15..87a2b2d0532 100644 --- a/detection_rules/schemas/v79.py +++ b/detection_rules/schemas/v7_9.py @@ -5,7 +5,7 @@ """Definitions for rule metadata and schemas.""" import jsl -from .v78 import ApiSchema78 +from .v7_8 import ApiSchema78 OPERATORS = ['equals'] diff --git a/tests/test_schemas.py b/tests/test_schemas.py index 225d86a9e1e..08966862c63 100644 --- a/tests/test_schemas.py +++ b/tests/test_schemas.py @@ -5,6 +5,7 @@ """Test stack versioned schemas.""" import unittest import uuid +import eql from detection_rules.rule import Rule from detection_rules.schemas import downgrade, CurrentSchema @@ -106,3 +107,36 @@ def test_threshold_downgrade(self): with self.assertRaisesRegex(ValueError, "Unsupported rule type"): downgrade(api_contents, "7.8") + + def test_eql_validation(self): + base_fields = { + "author": ["Elastic"], + "description": "test description", + "index": ["filebeat-*"], + "language": "eql", + "license": "Elastic License", + "name": "test rule", + "risk_score": 21, + "rule_id": str(uuid.uuid4()), + "severity": "low", + "type": "eql" + } + + Rule("test.toml", dict(base_fields, query=""" + process where process.name == "cmd.exe" + """)) + + with self.assertRaises(eql.EqlSyntaxError): + Rule("test.toml", dict(base_fields, query=""" + process where process.name == this!is$not#v@lid + """)) + + with self.assertRaises(eql.EqlSemanticError): + Rule("test.toml", dict(base_fields, query=""" + process where process.invalid_field == "hello world" + """)) + + with self.assertRaises(eql.EqlTypeMismatchError): + Rule("test.toml", dict(base_fields, query=""" + process where process.pid == "some string field" + """)) From 57976892cb0444fe11998fa02b232418fefc8e32 Mon Sep 17 00:00:00 2001 From: Ross Wolf <31489089+rw-access@users.noreply.github.com> Date: Tue, 15 Sep 2020 15:50:54 -0600 Subject: [PATCH 2/7] Lint nitpick --- detection_rules/schemas/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detection_rules/schemas/__init__.py b/detection_rules/schemas/__init__.py index f402d351252..d29352d6a60 100644 --- a/detection_rules/schemas/__init__.py +++ b/detection_rules/schemas/__init__.py @@ -22,7 +22,7 @@ all_schemas = [ ApiSchema78, ApiSchema79, - ApiSchema710 + ApiSchema710, ] CurrentSchema = all_schemas[-1] From fbf9c5bd8c12a30eaf9b7083f39393317213dc87 Mon Sep 17 00:00:00 2001 From: Ross Wolf <31489089+rw-access@users.noreply.github.com> Date: Tue, 15 Sep 2020 16:29:27 -0600 Subject: [PATCH 3/7] Rename get_schema_from_eql --- detection_rules/rule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detection_rules/rule.py b/detection_rules/rule.py index 818437c604d..ea4f35fc56f 100644 --- a/detection_rules/rule.py +++ b/detection_rules/rule.py @@ -169,7 +169,7 @@ def _validate_eql(ecs_versions, indexes, query, name): # validate against all specified schemas or the latest if none specified parsed = eql.parse_query(query) beat_types = [index.split("-")[0] for index in indexes if "beat-*" in index] - beat_schema = beats.get_schema_from_kql(parsed, beat_types) if beat_types else None + beat_schema = beats.get_schema_from_eql(parsed, beat_types) if beat_types else None ecs_versions = ecs_versions or [ecs_versions] schemas = [] From 3e16791de924c4f7b59048312b1970fb7ca8f346 Mon Sep 17 00:00:00 2001 From: Ross Wolf <31489089+rw-access@users.noreply.github.com> Date: Tue, 15 Sep 2020 16:58:36 -0600 Subject: [PATCH 4/7] Add EQL default language --- detection_rules/schemas/v7_10.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detection_rules/schemas/v7_10.py b/detection_rules/schemas/v7_10.py index 6ab0b14bddc..c2bc2c1372f 100644 --- a/detection_rules/schemas/v7_10.py +++ b/detection_rules/schemas/v7_10.py @@ -29,7 +29,7 @@ class ApiSchema710(ApiSchema79): with jsl.Scope(EQL) as eql_scope: eql_scope.index = jsl.ArrayField(jsl.StringField(), required=False) eql_scope.query = jsl.StringField(required=True) - eql_scope.language = jsl.StringField(enum=[EQL], required=True) + eql_scope.language = jsl.StringField(enum=[EQL], required=True, default=EQL) eql_scope.type = jsl.StringField(enum=[EQL], required=True) with jsl.Scope(jsl.DEFAULT_ROLE) as default_scope: From d5db8fb8bc4c30baefdb35b057462d15eeb5ac13 Mon Sep 17 00:00:00 2001 From: Ross Wolf <31489089+rw-access@users.noreply.github.com> Date: Tue, 15 Sep 2020 17:01:28 -0600 Subject: [PATCH 5/7] Rename parsed_kql to parsed_query --- detection_rules/rule.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/detection_rules/rule.py b/detection_rules/rule.py index ea4f35fc56f..7ff68ff03d9 100644 --- a/detection_rules/rule.py +++ b/detection_rules/rule.py @@ -71,9 +71,12 @@ def query(self): return self.contents.get('query') @property - def parsed_kql(self): - if self.query and self.contents['language'] == 'kuery': - return kql.parse(self.query) + def parsed_query(self): + if self.query: + if self.contents['language'] == 'kuery': + return kql.parse(self.query) + elif self.contents['language'] == 'eql': + return eql.parse_query(self.query) @property def filters(self): From 9d96ae9e7344ea21ccdc1fbbbba457160ed84e7b Mon Sep 17 00:00:00 2001 From: Ross Wolf <31489089+rw-access@users.noreply.github.com> Date: Tue, 15 Sep 2020 17:07:53 -0600 Subject: [PATCH 6/7] Fix parsed_kql method call in loader --- detection_rules/rule_loader.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/detection_rules/rule_loader.py b/detection_rules/rule_loader.py index d52d012ce87..e309f37bb21 100644 --- a/detection_rules/rule_loader.py +++ b/detection_rules/rule_loader.py @@ -93,12 +93,13 @@ def load_rules(file_lookup=None, verbose=True, error=True): raise KeyError("Rule has duplicate name to {}".format( next(r for r in rules if r.name == rule.name).path)) - if rule.parsed_kql: - if rule.parsed_kql in queries: + parsed_query = rule.parsed_query + if parsed_query is not None: + if parsed_query in queries: raise KeyError("Rule has duplicate query with {}".format( - next(r for r in rules if r.parsed_kql == rule.parsed_kql).path)) + next(r for r in rules if r.parsed_query == parsed_query).path)) - queries.append(rule.parsed_kql) + queries.append(parsed_query) if not re.match(FILE_PATTERN, os.path.basename(rule.path)): raise ValueError(f"Rule {rule.path} does not meet rule name standard of {FILE_PATTERN}") From 192aa10d8770ae69b578fa1c545eb69c9768d821 Mon Sep 17 00:00:00 2001 From: Ross Wolf <31489089+rw-access@users.noreply.github.com> Date: Tue, 15 Sep 2020 17:14:43 -0600 Subject: [PATCH 7/7] Autopopulate dependent values --- detection_rules/misc.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/detection_rules/misc.py b/detection_rules/misc.py index 768df012168..d3a7f3af901 100644 --- a/detection_rules/misc.py +++ b/detection_rules/misc.py @@ -75,6 +75,9 @@ def schema_prompt(name, value=None, required=False, **options): if name == 'rule_id': default = str(uuid.uuid4()) + if len(enum) == 1 and required and field_type != "array": + return enum[0] + def _check_type(_val): if field_type in ('number', 'integer') and not str(_val).isdigit(): print('Number expected but got: {}'.format(_val))