Skip to content

Commit a3f679b

Browse files
qkaisere3krisztian
and committed
feat(processing): pattern auto-identification
Integrate pattern recognition for unknown chunks in order to help identifying parts. Here we simply detect padding, but this could be extended in the future to detect re-occuring patterns, encrypted content, or even fingerprints. Co-authored-by: Krisztián Fekete <[email protected]>
1 parent e736358 commit a3f679b

File tree

12 files changed

+71
-6
lines changed

12 files changed

+71
-6
lines changed

tests/test_report.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
ChunkReport,
1414
FileMagicReport,
1515
HashReport,
16+
PaddingChunkReport,
1617
StatReport,
1718
UnknownChunkReport,
1819
)
@@ -133,7 +134,7 @@ def hello_kitty_task_results(
133134
size=7,
134135
entropy=None,
135136
),
136-
UnknownChunkReport(
137+
PaddingChunkReport(
137138
id=ANY,
138139
start_offset=263,
139140
end_offset=264,

unblob/extractor.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22
import errno
33
import os
44
from pathlib import Path
5+
from typing import Union
56

67
from structlog import get_logger
78

89
from .file_utils import carve, is_safe_path
9-
from .models import Chunk, File, TaskResult, UnknownChunk, ValidChunk
10+
from .models import Chunk, File, PaddingChunk, TaskResult, UnknownChunk, ValidChunk
1011
from .report import MaliciousSymlinkRemoved
1112

1213
logger = get_logger()
@@ -113,8 +114,14 @@ def _fix_extracted_directory(directory: Path):
113114
_fix_extracted_directory(outdir)
114115

115116

116-
def carve_unknown_chunk(extract_dir: Path, file: File, chunk: UnknownChunk) -> Path:
117-
filename = f"{chunk.start_offset}-{chunk.end_offset}.unknown"
117+
def carve_unknown_chunk(
118+
extract_dir: Path, file: File, chunk: Union[UnknownChunk, PaddingChunk]
119+
) -> Path:
120+
extension = "unknown"
121+
if isinstance(chunk, PaddingChunk):
122+
extension = "padding"
123+
124+
filename = f"{chunk.start_offset}-{chunk.end_offset}.{extension}"
118125
carve_path = extract_dir / filename
119126
logger.info("Extracting unknown chunk", path=carve_path, chunk=chunk)
120127
carve_chunk_to_file(carve_path, file, chunk)

unblob/models.py

+22
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
EntropyReport,
1818
ErrorReport,
1919
MultiFileReport,
20+
PaddingChunkReport,
2021
Report,
2122
UnknownChunkReport,
2223
)
@@ -147,6 +148,27 @@ def as_report(self, entropy: Optional[EntropyReport]) -> UnknownChunkReport:
147148
)
148149

149150

151+
@attr.define(repr=False)
class PaddingChunk(Chunk):
    r"""Gaps between valid chunks or otherwise unknown chunks.

    Important for manual analysis, and analytical certainty: for example
    entropy, other chunks inside it, metadata, etc.

    These are not extracted, just logged for information purposes and further analysis,
    like most common bytes (like \x00 and \xFF), ASCII strings, high entropy, etc.
    """

    def as_report(self, entropy: Optional[EntropyReport]) -> PaddingChunkReport:
        """Convert this chunk into an immutable report entry for the result tree."""
        return PaddingChunkReport(
            id=self.id,
            start_offset=self.start_offset,
            end_offset=self.end_offset,
            size=self.size,
            entropy=entropy,
        )
170+
171+
150172
@attrs.define
151173
class MultiFile(Blob):
152174
name: str = attr.field(kw_only=True)

unblob/processing.py

+27-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import shutil
33
from operator import attrgetter
44
from pathlib import Path
5-
from typing import Iterable, List, Optional, Sequence, Set, Tuple, Type
5+
from typing import Iterable, List, Optional, Sequence, Set, Tuple, Type, Union
66

77
import attr
88
import magic
@@ -24,6 +24,7 @@
2424
ExtractError,
2525
File,
2626
MultiFile,
27+
PaddingChunk,
2728
ProcessResult,
2829
Task,
2930
TaskResult,
@@ -450,6 +451,29 @@ def _iterate_directory(self, extract_dirs, processed_paths):
450451
)
451452

452453

454+
def is_padding(file: File, chunk: UnknownChunk) -> bool:
    """Return True if the chunk's bytes are all one repeated value.

    Padding regions (e.g. runs of ``\x00`` or ``\xFF``) collapse to a
    single-element set. An empty chunk yields an empty set and is
    therefore not considered padding.
    """
    return len(set(file[chunk.start_offset : chunk.end_offset])) == 1
456+
457+
458+
def process_patterns(
    unknown_chunks: List[UnknownChunk], file: File
) -> List[Union[UnknownChunk, PaddingChunk]]:
    """Classify unknown chunks, re-labelling uniform-byte gaps as padding.

    Each chunk whose bytes are a single repeated value is replaced by an
    equivalent ``PaddingChunk`` (same id, offsets and file); all other
    chunks are passed through unchanged, preserving order.
    """

    def _classify(chunk):
        # Only uniform-byte regions qualify as padding; everything else
        # stays an UnknownChunk.
        if not is_padding(file, chunk):
            return chunk
        return PaddingChunk(
            start_offset=chunk.start_offset,
            end_offset=chunk.end_offset,
            id=chunk.id,
            file=chunk.file,
        )

    return [_classify(chunk) for chunk in unknown_chunks]
475+
476+
453477
class _FileTask:
454478
def __init__(
455479
self,
@@ -487,6 +511,7 @@ def process(self):
487511
)
488512
outer_chunks = remove_inner_chunks(all_chunks)
489513
unknown_chunks = calculate_unknown_chunks(outer_chunks, self.size)
514+
unknown_chunks = process_patterns(unknown_chunks, file)
490515
assign_file_to_chunks(outer_chunks, file=file)
491516
assign_file_to_chunks(unknown_chunks, file=file)
492517

@@ -503,7 +528,7 @@ def _process_chunks(
503528
self,
504529
file: File,
505530
outer_chunks: List[ValidChunk],
506-
unknown_chunks: List[UnknownChunk],
531+
unknown_chunks: List[Union[UnknownChunk, PaddingChunk]],
507532
):
508533
if unknown_chunks:
509534
logger.warning("Found unknown Chunks", chunks=unknown_chunks)

unblob/report.py

+10
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,16 @@ class UnknownChunkReport(Report):
236236
entropy: Optional[EntropyReport]
237237

238238

239+
@final
@attr.define(kw_only=True, frozen=True)
class PaddingChunkReport(Report):
    """Report entry for a chunk identified as padding (uniform byte content)."""

    id: str  # noqa: A003
    start_offset: int
    end_offset: int
    size: int
    entropy: Optional[EntropyReport]
247+
248+
239249
@final
240250
@attr.define(kw_only=True, frozen=True)
241251
class MultiFileReport(Report):

0 commit comments

Comments
 (0)