Home Articles Categories Series
Pythonise Just now

Python Run-length encoder | Lossless string compression

Building a Run-length encoder with Python for lossless string compression


Article Posted on by in Python
Julian Nash · 7 months ago in Python

Run-length encoding is a simple form of lessless compression that takes an input string and returns a string comprising of a sequence of numbers and characters, with each number representing the number of occurances of the character following it.

For example, an input string of:

abc123xyzaaabbbcccd

Will be compressed to:

1a1b1c1112131x1y1z3a3b3c1d

The 3 methods in the RunLengthEncoder return the following sequences:

  • encode_a - Returns a sequence of occurances followed by characters, however will not return the occurances of a character if said characted only appears once. It will instead just return the character itself.

For example:

abc123xyzaaabbbcccd

Will return:

abc123xyz3a3b3cd
  • encode_b - Returns a sequence of occurances and characters, regardless of how many times they appear.

  • encode_c - Returns the same result as encode_b but uses a list comprehension to build the return string.

Code

run_length_encoder.py

class RunLengthEncoder:

    """
    class containing 3 methods that can be used to return a run-length
    encoded string.
    """

    def encode_a(self, text):

        """
        Returns a run-length encoded string from an input string.
        Note: This function will not return the character count in the return
        string if only a single instance of the character is found.

        Args:
            text (str): A string to encode

        Returns:
            str: A run length encoded string

        Example:
            input: "aaabbcdddd"
            returns: "3a2bc4d"
        """

        count = 1
        previous = ""
        mapping = list()

        for character in text:
            if character != previous:
                if previous:
                    mapping.append((previous, count))
                count = 1
                previous = character
            else:
                count += 1
        else:
            mapping.append((character, count))

        result = ""

        for character, count in mapping:
            if count == 1:
                result += character
            else:
                result += str(count)
                result += character

        return result

    def encode_b(self, text):

        """
        Returns a run-length encoded string from an input string.

        Args:
            text (str): A string to encode

        Returns:
            str: A run length encoded string

        Example:
            input: "aaabbcdddd"
            returns: "3a2b1c4d"
        """

        count = 1
        previous = ""
        mapping = list()

        for character in text:
            if character != previous:
                if previous:
                    mapping.append((previous, count))
                count = 1
                previous = character
            else:
                count += 1
        else:
            mapping.append((character, count))

        result = ""

        for character, count in mapping:
            result += str(count)
            result += character

        return result

    def encode_c(self, text):

        """
        Returns a run-length encoded string from an input string.
        This method uses a list comprehension to build the return
        string.

        Args:
            text (str): A string to encode

        Returns:
            str: A run length encoded string

        Example:
            input: "aaabbcdddd"
            returns: "3a2b1c4d"
        """

        count = 1
        previous = ""
        mapping = list()

        for character in text:
            if character != previous:
                if previous:
                    mapping.append((previous, count))
                count = 1
                previous = character
            else:
                count += 1
        else:
            mapping.append((character, count))

        result = "".join(f"{str(count)}{character}" for character, count in mapping)

        return result

Tests

tests.py

import unittest
from run_length_encoder import RunLengthEncoder


class TestRunLengthEncoders(unittest.TestCase):

    # Tests for encoder_a
    def test_encoder_a(self):
        encoder = RunLengthEncoder()
        text = "aaabbcdddd"
        expected_result = "3a2bc4d"
        self.assertEqual(encoder.encode_a(text), expected_result)

    def test_encoder_a2(self):
        encoder = RunLengthEncoder()
        text = "aaabbbcccdddeeefffggghhhiiijjjkkklllmmmnnnoooppp"
        expected_result = "3a3b3c3d3e3f3g3h3i3j3k3l3m3n3o3p"
        self.assertEqual(encoder.encode_a(text), expected_result)

    def test_encoder_a3(self):
        encoder = RunLengthEncoder()
        text = "abcabcabcabcabcabc"
        expected_result = "abcabcabcabcabcabc"
        self.assertEqual(encoder.encode_a(text), expected_result)

    def test_encoder_a4(self):
        encoder = RunLengthEncoder()
        text = "1223334444"
        expected_result = "1223344"
        self.assertEqual(encoder.encode_a(text), expected_result)

    def test_encoder_a5(self):
        encoder = RunLengthEncoder()
        text = "abc123xyz"
        expected_result = "abc123xyz"
        self.assertEqual(encoder.encode_a(text), expected_result)

    # Tests for encoder_b
    def test_encoder_b(self):
        encoder = RunLengthEncoder()
        text = "aaabbcdddd"
        expected_result = "3a2b1c4d"
        self.assertEqual(encoder.encode_b(text), expected_result)

    def test_encoder_b2(self):
        encoder = RunLengthEncoder()
        text = "aaabbbcccdddeeefffggghhhiiijjjkkklllmmmnnnoooppp"
        expected_result = "3a3b3c3d3e3f3g3h3i3j3k3l3m3n3o3p"
        self.assertEqual(encoder.encode_b(text), expected_result)

    def test_encoder_b3(self):
        encoder = RunLengthEncoder()
        text = "abcabcabcabcabcabc"
        expected_result = "1a1b1c1a1b1c1a1b1c1a1b1c1a1b1c1a1b1c"
        self.assertEqual(encoder.encode_b(text), expected_result)

    def test_encoder_b4(self):
        encoder = RunLengthEncoder()
        text = "1223334444"
        expected_result = "11223344"
        self.assertEqual(encoder.encode_b(text), expected_result)

    def test_encoder_b5(self):
        encoder = RunLengthEncoder()
        text = "abc123xyz"
        expected_result = "1a1b1c1112131x1y1z"
        self.assertEqual(encoder.encode_b(text), expected_result)

    # Tests for encoder_c
    def test_encoder_c(self):
        encoder = RunLengthEncoder()
        text = "aaabbcdddd"
        expected_result = "3a2b1c4d"
        self.assertEqual(encoder.encode_c(text), expected_result)

    def test_encoder_c2(self):
        encoder = RunLengthEncoder()
        text = "aaabbbcccdddeeefffggghhhiiijjjkkklllmmmnnnoooppp"
        expected_result = "3a3b3c3d3e3f3g3h3i3j3k3l3m3n3o3p"
        self.assertEqual(encoder.encode_c(text), expected_result)

    def test_encoder_c3(self):
        encoder = RunLengthEncoder()
        text = "abcabcabcabcabcabc"
        expected_result = "1a1b1c1a1b1c1a1b1c1a1b1c1a1b1c1a1b1c"
        self.assertEqual(encoder.encode_c(text), expected_result)

    def test_encoder_c4(self):
        encoder = RunLengthEncoder()
        text = "1223334444"
        expected_result = "11223344"
        self.assertEqual(encoder.encode_c(text), expected_result)

    def test_encoder_c5(self):
        encoder = RunLengthEncoder()
        text = "abc123xyz"
        expected_result = "1a1b1c1112131x1y1z"
        self.assertEqual(encoder.encode_c(text), expected_result)


if __name__ == "__main__":
    unittest.main()

Example

example.py

from run_length_encoder import RunLengthEncoder

encoder = RunLengthEncoder()

string_to_compress = "abc123xyzaaabbbcccd"
print(f"Compressing string: {string_to_compress}")

result = encoder.encode_a(string_to_compress)
print(result)

result = encoder.encode_b(string_to_compress)
print(result)

result = encoder.encode_c(string_to_compress)
print(result)

Usage

To play around with the encoder, clone this repo and run the following:

python example.py

Output:

Compressing string: abc123xyzaaabbbcccd
Output a: abc123xyz3a3b3cd
Output b: 1a1b1c1112131x1y1z3a3b3c1d
Output c: 1a1b1c1112131x1y1z3a3b3c1d

Testing

To run the small suite of unit tests:

python tests.py
Last modified · 02 Apr 2019
Did you find this article useful?
This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License
Contents
Loading...