Skip to content

Commit a3d406f

Browse files
authored
Merge pull request #697 from onekey-sec/padding-auto-id
Unknown chunks auto-identification (padding)
2 parents 00b25fa + 89cd491 commit a3d406f

File tree

11 files changed

+69
-10
lines changed

11 files changed

+69
-10
lines changed

tests/test_report.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def hello_kitty_task_results(
7373
extract_root: Path,
7474
hello_id: str,
7575
kitty_id: str,
76+
padding_id: str,
7677
container_id="",
7778
start_depth=0,
7879
):
@@ -133,12 +134,14 @@ def hello_kitty_task_results(
133134
size=7,
134135
entropy=None,
135136
),
136-
UnknownChunkReport(
137-
id=ANY,
137+
ChunkReport(
138+
id=padding_id,
138139
start_offset=263,
139140
end_offset=264,
140141
size=1,
141-
entropy=None,
142+
handler_name="padding",
143+
is_encrypted=False,
144+
extraction_reports=[],
142145
),
143146
ChunkReport(
144147
id=hello_id,
@@ -286,13 +289,14 @@ def test_flat_report_structure(hello_kitty: Path, extract_root):
286289
task_results = get_normalized_task_results(process_result)
287290

288291
# extract the ids from the chunks
289-
hello_id, kitty_id = get_chunk_ids(task_results[0])
292+
padding_id, hello_id, kitty_id = get_chunk_ids(task_results[0])
290293

291294
assert task_results == hello_kitty_task_results(
292295
hello_kitty=hello_kitty,
293296
extract_root=extract_root,
294297
hello_id=hello_id,
295298
kitty_id=kitty_id,
299+
padding_id=padding_id,
296300
)
297301

298302

@@ -416,7 +420,7 @@ def test_chunk_in_chunk_report_structure(hello_kitty_container: Path, extract_ro
416420
# and they should be the only differences
417421
[main_id] = get_chunk_ids(task_results[0])
418422

419-
hello_id, kitty_id = get_chunk_ids(task_results[2])
423+
padding_id, hello_id, kitty_id = get_chunk_ids(task_results[2])
420424

421425
# We test, that the container is referenced from the internal file
422426
# through the chunk id `main_id`
@@ -428,6 +432,7 @@ def test_chunk_in_chunk_report_structure(hello_kitty_container: Path, extract_ro
428432
extract_root=extract_root / "container_extract",
429433
hello_id=hello_id,
430434
kitty_id=kitty_id,
435+
padding_id=padding_id,
431436
container_id=main_id,
432437
start_depth=1,
433438
)

unblob/extractor.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22
import errno
33
import os
44
from pathlib import Path
5+
from typing import Union
56

67
from structlog import get_logger
78

89
from .file_utils import carve, is_safe_path
9-
from .models import Chunk, File, TaskResult, UnknownChunk, ValidChunk
10+
from .models import Chunk, File, PaddingChunk, TaskResult, UnknownChunk, ValidChunk
1011
from .report import MaliciousSymlinkRemoved
1112

1213
logger = get_logger()
@@ -113,8 +114,14 @@ def _fix_extracted_directory(directory: Path):
113114
_fix_extracted_directory(outdir)
114115

115116

116-
def carve_unknown_chunk(extract_dir: Path, file: File, chunk: UnknownChunk) -> Path:
117-
filename = f"{chunk.start_offset}-{chunk.end_offset}.unknown"
117+
def carve_unknown_chunk(
118+
extract_dir: Path, file: File, chunk: Union[UnknownChunk, PaddingChunk]
119+
) -> Path:
120+
extension = "unknown"
121+
if isinstance(chunk, PaddingChunk):
122+
extension = "padding"
123+
124+
filename = f"{chunk.start_offset}-{chunk.end_offset}.{extension}"
118125
carve_path = extract_dir / filename
119126
logger.info("Extracting unknown chunk", path=carve_path, chunk=chunk)
120127
carve_chunk_to_file(carve_path, file, chunk)

unblob/models.py

+22
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,28 @@ def as_report(self, entropy: Optional[EntropyReport]) -> UnknownChunkReport:
147147
)
148148

149149

150+
@attr.define(repr=False)
class PaddingChunk(Chunk):
    """Chunk of an otherwise unidentified region that was recognised as padding.

    Instances are created by ``process_patterns`` for unknown chunks whose
    byte range passes the ``is_padding`` check. They are reported as a
    regular ``ChunkReport`` attributed to the pseudo handler ``"padding"``
    instead of as an unknown chunk.
    """

    def as_report(
        self, entropy: Optional[EntropyReport]  # noqa: ARG002
    ) -> ChunkReport:
        """Build the report entry describing this padding chunk.

        ``entropy`` is accepted but not used: the returned ``ChunkReport``
        carries no entropy information. The report is never marked as
        encrypted and has an empty list of extraction reports.
        """
        report_fields = {
            "id": self.id,
            "start_offset": self.start_offset,
            "end_offset": self.end_offset,
            "size": self.size,
            "handler_name": "padding",
            "is_encrypted": False,
            "extraction_reports": [],
        }
        return ChunkReport(**report_fields)
170+
171+
150172
@attrs.define
151173
class MultiFile(Blob):
152174
name: str = attr.field(kw_only=True)

unblob/processing.py

+27-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import shutil
33
from operator import attrgetter
44
from pathlib import Path
5-
from typing import Iterable, List, Optional, Sequence, Set, Tuple, Type
5+
from typing import Iterable, List, Optional, Sequence, Set, Tuple, Type, Union
66

77
import attr
88
import magic
@@ -24,6 +24,7 @@
2424
ExtractError,
2525
File,
2626
MultiFile,
27+
PaddingChunk,
2728
ProcessResult,
2829
Task,
2930
TaskResult,
@@ -458,6 +459,29 @@ def _iterate_directory(self, extract_dirs, processed_paths):
458459
)
459460

460461

462+
def is_padding(file: File, chunk: UnknownChunk) -> bool:
    """Tell whether the chunk's byte range consists of one repeated byte value.

    The range ``[chunk.start_offset, chunk.end_offset)`` of ``file`` is read
    in bounded windows rather than as one slice, so a large gap is never
    copied into memory in one piece, and the scan stops as soon as a second
    distinct byte value shows up.

    Returns ``True`` only if the range is non-empty and every byte in it has
    the same value; an empty range yields ``False``.
    """
    window_size = 1024 * 1024  # upper bound on bytes copied out of `file` per step
    range_end = chunk.end_offset

    distinct_values = set()
    for window_start in range(chunk.start_offset, range_end, window_size):
        window_end = min(window_start + window_size, range_end)
        distinct_values.update(file[window_start:window_end])
        # Two different byte values already rule out padding; skip the rest.
        if len(distinct_values) > 1:
            return False

    return len(distinct_values) == 1
464+
465+
466+
def process_patterns(
    unknown_chunks: List[UnknownChunk], file: File
) -> List[Union[UnknownChunk, PaddingChunk]]:
    """Re-label unknown chunks that turn out to be padding.

    Every chunk for which ``is_padding(file, chunk)`` holds is replaced by a
    ``PaddingChunk`` carrying the same offsets, id and file reference; all
    other chunks are passed through untouched. The order of the input list
    is preserved and the input list itself is not modified.
    """

    def _classify(candidate: UnknownChunk) -> Union[UnknownChunk, PaddingChunk]:
        # Anything that is not padding stays exactly the object it was.
        if not is_padding(file, candidate):
            return candidate
        return PaddingChunk(
            start_offset=candidate.start_offset,
            end_offset=candidate.end_offset,
            id=candidate.id,
            file=candidate.file,
        )

    return [_classify(candidate) for candidate in unknown_chunks]
483+
484+
461485
class _FileTask:
462486
def __init__(
463487
self,
@@ -495,6 +519,7 @@ def process(self):
495519
)
496520
outer_chunks = remove_inner_chunks(all_chunks)
497521
unknown_chunks = calculate_unknown_chunks(outer_chunks, self.size)
522+
unknown_chunks = process_patterns(unknown_chunks, file)
498523
assign_file_to_chunks(outer_chunks, file=file)
499524
assign_file_to_chunks(unknown_chunks, file=file)
500525

@@ -511,7 +536,7 @@ def _process_chunks(
511536
self,
512537
file: File,
513538
outer_chunks: List[ValidChunk],
514-
unknown_chunks: List[UnknownChunk],
539+
unknown_chunks: List[Union[UnknownChunk, PaddingChunk]],
515540
):
516541
if unknown_chunks:
517542
logger.warning("Found unknown Chunks", chunks=unknown_chunks)

0 commit comments

Comments
 (0)