
Commit 574d168

Split large shard sizing into two challenges and added additional id types (elastic#24)
* Split large shard sizing into two challenges and added additional id types
* Updated README.md file for revised large shard sizing tests
* Updated following review.
* Updated to use default Rally refresh operation
1 parent b46584e commit 574d168

File tree

3 files changed: +92 −136 lines

README.md

Lines changed: 15 additions & 3 deletions
@@ -111,14 +111,13 @@ $ cat params-file.json
 
 ### 8) large-shard-sizing
 
-This challenge examines the performance and memory usage of large shards. It indexes data into a single shard index ~25GB at a time and runs up to a shard size of ~300GB. After every 25GB that has been indexed, select index statistics are recorded and a number of simulated Kibana dashboards are run against the index. Two indices are created and benchmarked, one with document IDs generated by Elasticsearch and one with application generated UUIDs used as document IDs.
+This challenge examines the performance and memory usage of large shards. It indexes data into a single shard index ~25GB at a time and runs up to a shard size of ~300GB. After every 25GB has been indexed, select index statistics are recorded and a number of simulated Kibana dashboards are run against the index to show how query performance varies with shard size.
 
 This challenge will show the following:
-* How index performance varies with shard size for autogenerated IDs and UUIDs
 * How dashboard query performance varies with shard size
 * How memory usage varies with shard size
 
-Note that this challenge will generate up to ~600GB of data on disk and will require additional space for merging and overhead. Make sure around 1TB of disk space is available before running this to be on the safe side.
+Note that this challenge will generate up to ~300GB of data on disk and will require additional space for merging and overhead. Make sure around 600GB of disk space is available before running this to be on the safe side.
 
 The table below shows the track parameters that can be adjusted along with default values:
 
@@ -127,6 +126,19 @@ The table below shows the track parameters that can be adjusted along with default values:
 | `bulk_indexing_clients` | Number of bulk indexing clients/connections | `int` | `32` |
 | `query_iterations` | Number of times each dashboard is simulated at each level | `int` | `10` |
 
+### 9) large-shard-id-type-evaluation
+
+This challenge examines the storage and heap usage implications of a wide variety of document ID types. It indexes data into a set of ~25GB single-shard indices, one for each type of document ID (`auto`, `uuid`, `epoch_uuid`, `sha1`, `sha256`, `sha384`, and `sha512`). For each index a refresh is then run before select index statistics are recorded.
+
+Note that this challenge will generate up to ~200GB of data on disk and will require additional space for merging and overhead. Make sure around 300GB of disk space is available before running this to be on the safe side.
+
+The table below shows the track parameters that can be adjusted along with default values:
+
+| Parameter | Explanation | Type | Default Value |
+| --------- | ----------- | ---- | ------------- |
+| `bulk_indexing_clients` | Number of bulk indexing clients/connections | `int` | `32` |
+
 ## Custom parameter sources
 
 ### elasticlogs\_bulk\_source

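To try the new challenge, a hypothetical Rally invocation could look like the line below. This is a sketch only: `--track-path`, `--challenge`, and `--track-params` are standard esrally flags, but check the exact usage against your Rally version; the local checkout path and the `bulk_indexing_clients:16` override are example values, not part of this commit.

$ esrally --track-path=./rally-eventdata-track --challenge=large-shard-id-type-evaluation --track-params="bulk_indexing_clients:16"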
eventdata/challenges/large-shard-sizing.json

Lines changed: 63 additions & 132 deletions
@@ -5,7 +5,7 @@
 
 {
   "name": "large-shard-sizing",
-  "description": "Index data into a single shard ~25Gb at a time (up to a total of ~300GB), then record index statistics and run a number of queries against the shard. IDs are based on UUIDs or autogenerated by Elasticsearch, meaning there are no conflicts.",
+  "description": "Index data into a single shard ~25Gb at a time (up to a total of ~300GB), then record index statistics and run a number of queries against the shard. IDs are autogenerated by Elasticsearch, meaning there are no conflicts.",
   "meta": {
     "client_count": {{ p_bulk_indexing_clients }},
     "benchmark_type": "large-shard-sizing",
@@ -40,34 +40,6 @@
         "index_template_name": "elasticlogs-auto"
       }
     },
-    {
-      "name": "deleteindex-elasticlogs-uuid",
-      "operation": {
-        "operation-type": "delete-index",
-        "index": "elasticlogs-uuid"
-      }
-    },
-    {
-      "name": "createindex-elasticlogs-uuid",
-      "operation": {
-        "operation-type": "createindex",
-        "index_name": "elasticlogs-uuid",
-        "index_template_body": {
-          "template": "elasticlogs-uuid",
-          "settings": {
-            "index.refresh_interval": "5s",
-            "index.codec": "best_compression",
-            "index.number_of_replicas": 0,
-            "index.number_of_shards": 1
-          },
-          "mappings":
-          {% include "mappings.json" %}
-          ,
-          "aliases": {}
-        },
-        "index_template_name": "elasticlogs-uuid"
-      }
-    },
     {% for p_multiple in range(1, 13) %}
     {% set p_size = p_multiple * 25 %}
     {
@@ -76,7 +48,8 @@
         "operation-type": "bulk",
         "param-source": "elasticlogs_bulk",
         "index": "elasticlogs-auto",
-        "bulk-size": 1000
+        "bulk-size": 1000,
+        "id_type": "auto"
       },
       "iterations": {{ p_ops_per_client }},
       "clients": {{ p_bulk_indexing_clients }},
@@ -87,11 +60,7 @@
     },
     {
       "name": "refresh-auto-{{ p_size }}",
-      "operation": {
-        "operation-type": "raw-request",
-        "method": "POST",
-        "path": "/elasticlogs-auto/_refresh"
-      },
+      "operation": "refresh",
       "iterations": 1,
       "clients": 1
     },
@@ -103,7 +72,7 @@
       },
       "meta": {
         "id_mode": "auto",
-        "shard_size": {{ p_size }}
+        "shard_size": {{ p_size }}
       }
     },
     {
@@ -181,128 +150,90 @@
       }
     },
     {% endfor %}
-    {% for p_multiple in range(1, 13) %}
-    {% set p_size = p_multiple * 25 %}
     {
-      "name": "index-append-1000-uuid-{{ p_size }}",
+      "name": "refresh-final",
+      "operation": "refresh",
+      "iterations": 1,
+      "clients": 1
+    }
+  ]
+},
+{
+  "name": "large-shard-id-type-evaluation",
+  "description": "Index data into a number of ~25Gb single shard indices with different document ID types, then record index statistics to allow size and memory usage comparisons. IDs are based on UUIDs or autogenerated by Elasticsearch, meaning there are no conflicts.",
+  "meta": {
+    "client_count": {{ p_bulk_indexing_clients }},
+    "benchmark_type": "large-shard-sizing",
+    "version": 2
+  },
+  "schedule": [
+    {% for id_type in ['auto', 'uuid', 'epoch_uuid', 'sha1', 'sha256', 'sha384', 'sha512'] %}
+    {
+      "name": "deleteindex-elasticlogs-{{ id_type }}",
       "operation": {
-        "operation-type": "bulk",
-        "param-source": "elasticlogs_bulk",
-        "index": "elasticlogs-uuid",
-        "bulk-size": 1000
-      },
-      "iterations": {{ p_ops_per_client }},
-      "clients": {{ p_bulk_indexing_clients }},
-      "meta": {
-        "id_mode": "uuid",
-        "shard_size": {{ p_size }}
+        "operation-type": "delete-index",
+        "index": "elasticlogs-{{ id_type }}"
       }
     },
     {
-      "name": "refresh-uuid-{{ p_size }}",
+      "name": "createindex-elasticlogs-{{ id_type }}",
       "operation": {
-        "operation-type": "raw-request",
-        "method": "POST",
-        "path": "/elasticlogs-uuid/_refresh"
-      },
-      "iterations": 1,
-      "clients": 1
+        "operation-type": "createindex",
+        "index_name": "elasticlogs-{{ id_type }}",
+        "index_template_body": {
+          "template": "elasticlogs-{{ id_type }}",
+          "settings": {
+            "index.refresh_interval": "5s",
+            "index.codec": "best_compression",
+            "index.number_of_replicas": 0,
+            "index.number_of_shards": 1
+          },
+          "mappings":
+          {% include "mappings.json" %}
+          ,
+          "aliases": {}
+        },
+        "index_template_name": "elasticlogs-{{ id_type }}"
+      }
     },
     {
-      "name": "indicesstats-elasticlogs-uuid-{{ p_size }}",
+      "name": "index-append-1000-{{ id_type }}",
       "operation": {
-        "operation-type": "indicesstats",
-        "index_pattern": "elasticlogs-uuid"
+        "operation-type": "bulk",
+        "param-source": "elasticlogs_bulk",
+        "index": "elasticlogs-{{ id_type }}",
+        "bulk-size": 1000,
+        "id_type": "{{ id_type }}"
      },
+      "iterations": {{ p_ops_per_client }},
+      "clients": {{ p_bulk_indexing_clients }},
       "meta": {
-        "id_mode": "uuid",
-        "shard_size": {{ p_size }}
+        "id_mode": "{{ id_type }}"
       }
     },
     {
-      "name": "fieldstats-elasticlogs-uuid-{{ p_size }}",
-      "operation": {
-        "operation-type": "fieldstats",
-        "index_pattern": "elasticlogs-uuid"
-      },
-      "warmup-iterations": 1,
-      "iterations": 1,
-      "clients": {{ p_bulk_indexing_clients }}
-    },
-    {
-      "name": "clear-caches-uuid-{{ p_size }}",
-      "operation": {
-        "operation-type": "raw-request",
-        "method": "POST",
-        "path": "/_cache/clear"
-      },
+      "name": "refresh-{{ id_type }}",
+      "operation": "refresh",
       "iterations": 1,
       "clients": 1
     },
-    {
-      "name": "kibana-content_issues-50%-uuid-{{ p_size }}",
-      "operation": {
-        "operation-type": "kibana",
-        "param-source": "elasticlogs_kibana",
-        "dashboard": "content_issues",
-        "index_pattern": "elasticlogs-uuid",
-        "query_string": ["*"],
-        "window_end": "START+50%,END",
-        "window_length": "50%"
-      },
-      "iterations": {{ p_query_iterations }},
-      "clients": 1,
-      "meta": {
-        "id_mode": "uuid",
-        "shard_size": {{ p_size }}
-      }
-    },
-    {
-      "name": "kibana-traffic-25%-uuid-{{ p_size }}",
-      "operation": {
-        "operation-type": "kibana",
-        "param-source": "elasticlogs_kibana",
-        "dashboard": "traffic",
-        "index_pattern": "elasticlogs-uuid",
-        "query_string": ["*"],
-        "window_end": "START+25%,END",
-        "window_length": "25%"
-      },
-      "iterations": {{ p_query_iterations }},
-      "clients": 1,
-      "meta": {
-        "id_mode": "uuid",
-        "shard_size": {{ p_size }}
-      }
-    },
-    {
-      "name": "kibana-discover-50%-uuid-{{ p_size }}",
+    {
+      "name": "indicesstats-elasticlogs-{{ id_type }}",
       "operation": {
-        "operation-type": "kibana",
-        "param-source": "elasticlogs_kibana",
-        "dashboard": "discover",
-        "index_pattern": "elasticlogs-uuid",
-        "query_string": ["*"],
-        "window_end": "START+50%,END",
-        "window_length": "50%"
+        "operation-type": "indicesstats",
+        "index_pattern": "elasticlogs-{{ id_type }}"
       },
-      "iterations": {{ p_query_iterations }},
-      "clients": 1,
       "meta": {
-        "id_mode": "uuid",
-        "shard_size": {{ p_size }}
+        "id_mode": "{{ id_type }}"
       }
     },
-    {% endfor %}
+    {% endfor %}
     {
       "name": "refresh-final",
-      "operation": {
-        "operation-type": "raw-request",
-        "method": "POST",
-        "path": "/elasticlogs-*/_refresh"
-      },
+      "operation": "refresh",
       "iterations": 1,
       "clients": 1
     }
   ]
 }
+
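To make the templating in the new challenge concrete, one iteration of the `id_type` loop renders to tasks such as the following for `id_type = sha1` (an illustrative expansion only; the `{{ p_ops_per_client }}` and `{{ p_bulk_indexing_clients }}` placeholders are still substituted from the track parameters at render time):

    {
      "name": "index-append-1000-sha1",
      "operation": {
        "operation-type": "bulk",
        "param-source": "elasticlogs_bulk",
        "index": "elasticlogs-sha1",
        "bulk-size": 1000,
        "id_type": "sha1"
      },
      "iterations": {{ p_ops_per_client }},
      "clients": {{ p_bulk_indexing_clients }},
      "meta": {
        "id_mode": "sha1"
      }
    },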
eventdata/parameter_sources/elasticlogs_bulk_source.py

Lines changed: 14 additions & 1 deletion
@@ -2,6 +2,7 @@
 import random
 import uuid
 import time
+import hashlib
 from eventdata.parameter_sources.randomevent import RandomEvent
 
 logger = logging.getLogger("track.eventdata")
@@ -50,6 +51,10 @@ class ElasticlogsBulkSource:
                 uuid - Assign a UUID4 id to each document.
                 epoch_uuid - Assign a UUID4 identifier prefixed with the hex representation of the current
                              timestamp.
+                sha1 - SHA1 hash of UUID in hex representation. (Note: Generating this type of id can be CPU intensive)
+                sha256 - SHA256 hash of UUID in hex representation. (Note: Generating this type of id can be CPU intensive)
+                sha384 - SHA384 hash of UUID in hex representation. (Note: Generating this type of id can be CPU intensive)
+                sha512 - SHA512 hash of UUID in hex representation. (Note: Generating this type of id can be CPU intensive)
             "id_delay_probability" - If id_type is set to `epoch_uuid`, this parameter determines the probability that the id will be set in the
                                      past. This can be used to simulate a portion of the events arriving delayed. Must be in range [0.0, 1.0].
                                      Defaults to 0.0.
@@ -67,7 +72,7 @@ def __init__(self, track, params, **kwargs):
 
         self._id_type = "auto"
         if 'id_type' in params.keys():
-            if params['id_type'] in ['auto', 'uuid', 'epoch_uuid']:
+            if params['id_type'] in ['auto', 'uuid', 'epoch_uuid', 'sha1', 'sha256', 'sha384', 'sha512']:
                 self._id_type = params['id_type']
             else:
                 logger.warning("[bulk] Invalid id_type ({}) specified. Will use default.".format(params['id_type']))
@@ -115,6 +120,14 @@ def params(self):
         else:
             if self._id_type == 'uuid':
                 docid = self.__get_uuid()
+            elif self._id_type == 'sha1':
+                docid = hashlib.sha1(self.__get_uuid().encode()).hexdigest()
+            elif self._id_type == 'sha256':
+                docid = hashlib.sha256(self.__get_uuid().encode()).hexdigest()
+            elif self._id_type == 'sha384':
+                docid = hashlib.sha384(self.__get_uuid().encode()).hexdigest()
+            elif self._id_type == 'sha512':
+                docid = hashlib.sha512(self.__get_uuid().encode()).hexdigest()
             else:
                 docid = self.__get_epoch_uuid()
 
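As a quick sense-check of why the ID type matters for storage, the hex digests differ substantially in length, and the `_id` is stored for every document. Below is a minimal standalone sketch using only the standard library; `make_id()` is a hypothetical stand-in for the parameter source's private `__get_uuid()` helper, whose implementation is not shown in this diff:

import hashlib
import uuid

def make_id():
    # Hypothetical stand-in for ElasticlogsBulkSource.__get_uuid();
    # the real helper is not part of this commit's diff.
    return str(uuid.uuid4())

for name in ("sha1", "sha256", "sha384", "sha512"):
    digest = hashlib.new(name, make_id().encode()).hexdigest()
    # Prints 40, 64, 96 and 128 hex characters respectively.
    print(name, len(digest))

A plain UUID4 string is 36 characters, so the SHA variants produce progressively longer `_id` values, which is the storage and heap trade-off the large-shard-id-type-evaluation challenge is designed to measure.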