Skip to content

Add LZ77 compression algorithm #8059

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Dec 28, 2022
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 43 additions & 35 deletions compression/lz77.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,20 @@
__version__ = "0.1"
__author__ = "Lucia Harcekova"

from typing import List


class Token:
    """
    Triplet (offset, length, indicator) produced by LZ77 compression.

    Attributes are plain and mutable:
        offset: distance back from the end of the already-decoded text at
            which the matched phrase starts (0 when there is no match).
        length: number of characters copied from that position.
        indicator: the literal character that follows the copied phrase.

    Tokens compare by value and print like the tuple they represent, so
    lists of tokens can be checked directly in doctests.  Because equality
    is value-based and the attributes are mutable, instances are not
    hashable (the same trade-off a default ``@dataclass`` makes).

    >>> Token(2, 2, 'c')
    (2, 2, 'c')
    >>> Token(2, 2, 'c') == Token(2, 2, 'c')
    True
    """

    def __init__(self, offset: int, length: int, indicator: str) -> None:
        self.offset = offset
        self.length = length
        self.indicator = indicator

    def __repr__(self) -> str:
        # Reuse tuple formatting so the output reads "(offset, length, 'c')".
        return repr((self.offset, self.length, self.indicator))

    def __eq__(self, other: object) -> bool:
        # Only tokens are comparable; let Python try the reflected operation
        # (and finally fall back to "not equal") for every other type.
        if not isinstance(other, Token):
            return NotImplemented
        return (self.offset, self.length, self.indicator) == (
            other.offset,
            other.length,
            other.indicator,
        )


class LZ77Compressor:
"""
Expand All @@ -43,21 +57,14 @@ def __init__(self, window_size: int = 13, lookahead_buffer_size: int = 6) -> Non
self.lookahead_buffer_size = lookahead_buffer_size
self.search_buffer_size = self.window_size - self.lookahead_buffer_size

def compress(self, text: str) -> list:
def compress(self, text: str) -> List[Token]:
"""This method compresses given string text using LZ77 compression algorithm.

Args:
text (str): string that's going to be compressed

Returns:
output (list): the compressed text

Tests:
>>> lz77_compressor = LZ77Compressor(13, 6)
>>> lz77_compressor.compress("ababcbababaa")
[(0, 0, 'a'), (0, 0, 'b'), (2, 2, 'c'), (4, 3, 'a'), (2, 2, 'a')]
>>> lz77_compressor.compress("aacaacabcabaaac")
[(0, 0, 'a'), (1, 1, 'c'), (3, 4, 'b'), (3, 3, 'a'), (1, 2, 'c')]
output (List[Token]): the compressed text
"""

output = []
Expand All @@ -68,79 +75,80 @@ def compress(self, text: str) -> list:

# find the next encoding phrase
# - triplet with offset, length, indicator (the next encoding character)
(offset, length, indicator) = self._find_encoding_token(text, search_buffer)
token = self._find_encoding_token(text, search_buffer)

# update the search buffer:
# - add new characters from text into it
# - check if size exceed the max search buffer size, if so, drop the
# oldest elements
search_buffer += text[: length + 1]
search_buffer += text[: token.length + 1]
if len(search_buffer) > self.search_buffer_size:
search_buffer = search_buffer[-self.search_buffer_size :]

# update the text
text = text[length + 1 :]
text = text[token.length + 1 :]

# append the token to output
output.append((offset, length, indicator))
output.append(token)

return output

def decompress(self, tokens: List[Token]) -> str:
    """Rebuild the original text from a list of LZ77 tokens.

    Each token means: copy ``length`` characters starting ``offset``
    positions back from the end of the text decoded so far, then append
    the literal ``indicator`` character.  The copy is done one character
    at a time, so a phrase may overlap the text it is producing
    (``offset < length`` repeats the trailing characters).

    Args:
        tokens (List[Token]): Tokens (offset, length, indicator)

    Returns:
        output (str): The decompressed text

    Raises:
        ValueError: if a token asks to copy characters (length > 0) with
            an offset smaller than 1, which cannot refer to decoded text.
        IndexError: if a token's offset reaches back past the start of
            the text decoded so far.

    Tests:
        >>> lz77_compressor = LZ77Compressor(13, 6)
        >>> lz77_compressor.decompress([
        ...     Token(0, 0, 'c'), Token(0, 0, 'a'), Token(0, 0, 'b'),
        ...     Token(0, 0, 'r'), Token(3, 1, 'c'), Token(2, 1, 'd'),
        ...     Token(7, 4, 'r'), Token(3, 5, 'd')])
        'cabracadabrarrarrad'
        >>> lz77_compressor.decompress([
        ...     Token(0, 0, 'a'), Token(0, 0, 'b'), Token(2, 2, 'c'),
        ...     Token(4, 3, 'a'), Token(2, 2, 'a')])
        'ababcbababaa'
        >>> lz77_compressor.decompress([
        ...     Token(0, 0, 'a'), Token(1, 1, 'c'), Token(3, 4, 'b'),
        ...     Token(3, 3, 'a'), Token(1, 2, 'c')])
        'aacaacabcabaaac'
    """

    # Collect characters in a list and join once at the end instead of
    # growing a string with += for every copied character.
    decoded = []

    for token in tokens:
        # output[-0] is output[0], so a zero offset would silently copy the
        # first character; negative offsets would index from the front.
        if token.length > 0 and token.offset < 1:
            raise ValueError(
                "token offset must be at least 1 when length is positive"
            )
        for _ in range(token.length):
            # Index relative to the growing end so overlapping copies work.
            decoded.append(decoded[-token.offset])
        decoded.append(token.indicator)

    return "".join(decoded)

def _find_encoding_token(self, text: str, search_buffer: str) -> tuple:
def _find_encoding_token(self, text: str, search_buffer: str) -> Token:
"""Finds the encoding token for the first character in the text.

Args:
text (str)
search_buffer (str)

Returns:
tuple: Token
(offset, length, indicator) (Token)

Tests:
>>> lz77_compressor = LZ77Compressor(13, 6)
>>> lz77_compressor._find_encoding_token("abrarrarrad", "abracad")
(7, 4, 'r')
>>> lz77_compressor._find_encoding_token("adabrarrarrad", "cabrac")
(2, 1, 'd')
>>> lz77_compressor._find_encoding_token("abrarrarrad", "abracad").offset
7
>>> lz77_compressor._find_encoding_token("adabrarrarrad", "cabrac").length
1
"""

# Initialise result parameters to default values
length, offset = 0, 0

if search_buffer == "":
return offset, length, text[length]
return Token(offset, length, text[length])

for i, character in enumerate(search_buffer):
found_offset = len(search_buffer) - i
Expand All @@ -151,7 +159,7 @@ def _find_encoding_token(self, text: str, search_buffer: str) -> tuple:
if found_length >= length:
offset, length = found_offset, found_length

return offset, length, text[length]
return Token(offset, length, text[length])

def _match_length_from_index(
self, text: str, window: str, text_index: int, window_index: int
Expand Down Expand Up @@ -192,4 +200,4 @@ def _match_length_from_index(
TEXT = "cabracadabrarrarrad"
compressed_text = lz77_compressor.compress(TEXT)
decompressed_text = lz77_compressor.decompress(compressed_text)
assert decompressed_text == TEXT, "The LZ77 agirithm returned the invalid result."
assert decompressed_text == TEXT, "The LZ77 algorithm returned the invalid result."