From 55982909d2ce8b7f3cc67de3d97f1982316ecc41 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 30 Jun 2024 12:37:21 +0100 Subject: [PATCH 01/16] posts: trie: add insertion --- content/posts/2024-06-30-trie/index.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/content/posts/2024-06-30-trie/index.md b/content/posts/2024-06-30-trie/index.md index 968aa0e..b8e4679 100644 --- a/content/posts/2024-06-30-trie/index.md +++ b/content/posts/2024-06-30-trie/index.md @@ -78,3 +78,22 @@ def get(self, key: str) -> T | None: # Otherwise, recurse on the child corresponding to the first letter return self._children[key[0]].get(key[1:]) ``` + +### Insertion + +Adding a new value to the _Trie_ is similar to a key lookup, only this time we +store the new value instead of returning it. + +```python +def insert(self, key: str, value: T) -> bool: + # Have we matched the full key? + if not key: + # Check whether we're overwriting a previous mapping + was_mapped = self._value is not None + # Store the corresponding value + self._value = value + # Return whether we've performed an overwrite + return was_mapped + # Otherwise, recurse on the child corresponding to the first letter + return self._children[key[0]].insert(key[1:], value) +``` From 239d5c3dbdf5cc59014971da326654d1b76fd821 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sat, 6 Jul 2024 23:33:47 +0100 Subject: [PATCH 02/16] posts: add gap-buffer --- content/posts/2024-07-06-gap-buffer/index.md | 25 ++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 content/posts/2024-07-06-gap-buffer/index.md diff --git a/content/posts/2024-07-06-gap-buffer/index.md b/content/posts/2024-07-06-gap-buffer/index.md new file mode 100644 index 0000000..43b992f --- /dev/null +++ b/content/posts/2024-07-06-gap-buffer/index.md @@ -0,0 +1,25 @@ +--- +title: "Gap Buffer" +date: 2024-07-06T21:27:19+01:00 +draft: false # I don't care for draft mode, git has branches for that +description: "As featured in 
GNU Emacs" +tags: +- algorithms +- data structures +- python +categories: +- programming +series: +- Cool algorithms +favorite: false +disable_feed: false +--- + +The [_Gap Buffer_][wiki] is a popular data structure for text editors to +represent files and editable buffers. The most famous of them probably being +[GNU Emacs][emacs]. + +[wiki]: https://en.wikipedia.org/wiki/Gap_buffer +[emacs]: https://www.gnu.org/software/emacs/manual/html_node/elisp/Buffer-Gap.html + + From a4976aeefb40da97f384dacbad249ea3d29e330f Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sat, 6 Jul 2024 23:34:49 +0100 Subject: [PATCH 03/16] posts: gap-buffer: add presentation --- content/posts/2024-07-06-gap-buffer/index.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/content/posts/2024-07-06-gap-buffer/index.md b/content/posts/2024-07-06-gap-buffer/index.md index 43b992f..44d699f 100644 --- a/content/posts/2024-07-06-gap-buffer/index.md +++ b/content/posts/2024-07-06-gap-buffer/index.md @@ -23,3 +23,13 @@ represent files and editable buffers. The most famous of them probably being [emacs]: https://www.gnu.org/software/emacs/manual/html_node/elisp/Buffer-Gap.html + +## What does it do? + +A _Gap Buffer_ is simply a list of characters, similar to a normal string, with +the added twist of splitting it into two sides: the prefix and suffix, on either +side of the cursor. In between them, a gap is left to allow for quick +insertion at the cursor. + +Moving the cursor moves the gap around the buffer, the prefix and suffix getting +shorter/longer as required. 
From 091e8527e3aae666b4a03b2cef919451d84e1c68 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sat, 6 Jul 2024 23:35:39 +0100 Subject: [PATCH 04/16] posts: gap-buffer: add construction --- content/posts/2024-07-06-gap-buffer/index.md | 39 ++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/content/posts/2024-07-06-gap-buffer/index.md b/content/posts/2024-07-06-gap-buffer/index.md index 44d699f..2b02dc4 100644 --- a/content/posts/2024-07-06-gap-buffer/index.md +++ b/content/posts/2024-07-06-gap-buffer/index.md @@ -33,3 +33,42 @@ insertion at the cursor. Moving the cursor moves the gap around the buffer, the prefix and suffix getting shorter/longer as required. + +## Implementation + +I'll be writing a sample implementation in Python, as with the rest of the +[series]({{< ref "/series/cool-algorithms/">}}). I don't think it showcases the +elegance of the _Gap Buffer_ in action like a C implementation full of +`memmove`s would, but it does make it short and sweet. + +### Representation + +We'll be representing the gap buffer as an actual list of characters. + +Given that Python doesn't _have_ characters, let's settle for a list of strings, +each representing a single character... + +```python +Char = str + +class GapBuffer: + # List of characters, contains prefix and suffix of string with gap in the middle + _buf: list[Char] + # The gap is contained between [start, end) (i.e: buf[start:end]) + _gap_start: int + _gap_end: int + + # Visual representation of the gap buffer: + # This is a very [ ]long string. 
+ # |<----------------------------------------------->| capacity + # |<------------>| |<-------->| string + # |<------------------->| gap + # |<------------>| prefix + # |<-------->| suffix + def __init__(self, initial_capacity: int = 16) -> None: + assert initial_capacity > 0 + # Initialize an empty gap buffer + self._buf = [""] * initial_capacity + self._gap_start = 0 + self._gap_end = initial_capacity +``` From 4d69be06334bee81a10625b215d0b97c206b44c3 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sat, 6 Jul 2024 23:36:02 +0100 Subject: [PATCH 05/16] posts: gap-buffer: add accessors --- content/posts/2024-07-06-gap-buffer/index.md | 27 ++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/content/posts/2024-07-06-gap-buffer/index.md b/content/posts/2024-07-06-gap-buffer/index.md index 2b02dc4..a90e8a4 100644 --- a/content/posts/2024-07-06-gap-buffer/index.md +++ b/content/posts/2024-07-06-gap-buffer/index.md @@ -72,3 +72,30 @@ class GapBuffer: self._gap_start = 0 self._gap_end = initial_capacity ``` + +### Accessors + +I'm mostly adding these for exposition, and making it easier to write `assert`s +later. 
+ +```python +@property +def capacity(self) -> int: + return len(self._buf) + +@property +def gap_length(self) -> int: + return self._gap_end - self._gap_start + +@property +def string_length(self) -> int: + return self.capacity - self.gap_length + +@property +def prefix_length(self) -> int: + return self._gap_start + +@property +def suffix_length(self) -> int: + return self.capacity - self._gap_end +``` From f4a64b2a37a75f81438925f0598204a052afd2f8 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 14 Jul 2024 17:53:25 +0100 Subject: [PATCH 06/16] posts: add bloom-filter --- .../posts/2024-07-14-bloom-filter/index.md | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 content/posts/2024-07-14-bloom-filter/index.md diff --git a/content/posts/2024-07-14-bloom-filter/index.md b/content/posts/2024-07-14-bloom-filter/index.md new file mode 100644 index 0000000..98cfc1e --- /dev/null +++ b/content/posts/2024-07-14-bloom-filter/index.md @@ -0,0 +1,26 @@ +--- +title: "Bloom Filter" +date: 2024-07-14T17:46:40+01:00 +draft: false # I don't care for draft mode, git has branches for that +description: "Probably cool" +tags: + - algorithms + - data structures + - python +categories: + - programming +series: +- Cool algorithms +favorite: false +disable_feed: false +--- + +The [_Bloom Filter_][wiki] is a probabilistic data structure for set membership. + +The filter can be used as an inexpensive first step when querying the actual +data is quite costly (e.g: as a first check for expensive cache lookups or large +data seeks). 
+ +[wiki]: https://en.wikipedia.org/wiki/Bloom_filter + + From 0084c8717a062f4fb70b033c411871f18276ea79 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sat, 6 Jul 2024 23:36:20 +0100 Subject: [PATCH 07/16] posts: gap-buffer: add growth --- content/posts/2024-07-06-gap-buffer/index.md | 22 ++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/content/posts/2024-07-06-gap-buffer/index.md b/content/posts/2024-07-06-gap-buffer/index.md index a90e8a4..ace8fd9 100644 --- a/content/posts/2024-07-06-gap-buffer/index.md +++ b/content/posts/2024-07-06-gap-buffer/index.md @@ -99,3 +99,25 @@ def prefix_length(self) -> int: def suffix_length(self) -> int: return self.capacity - self._gap_end ``` + +### Growing the buffer + +I've written this method in a somewhat non-idiomatic manner, to make it closer +to how it would look in C using `realloc` instead. + +It would be more efficient to use slicing to insert the needed extra capacity +directly, instead of making a new buffer and copying characters over. 
+ +```python +def grow(self, capacity: int) -> None: + assert capacity >= self.capacity + # Create a new buffer with the new capacity + new_buf = [""] * capacity + # Move the prefix/suffix to their place in the new buffer + added_capacity = capacity - len(self._buf) + new_buf[: self._gap_start] = self._buf[: self._gap_start] + new_buf[self._gap_end + added_capacity :] = self._buf[self._gap_end :] + # Use the new buffer, account for added capacity + self._buf = new_buf + self._gap_end += added_capacity +``` From 3992996a89dc3183c2563939cbd8de2a941cd393 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 14 Jul 2024 17:54:59 +0100 Subject: [PATCH 08/16] posts: bloom-filter: add presentation --- content/posts/2024-07-14-bloom-filter/index.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/content/posts/2024-07-14-bloom-filter/index.md b/content/posts/2024-07-14-bloom-filter/index.md index 98cfc1e..0a82882 100644 --- a/content/posts/2024-07-14-bloom-filter/index.md +++ b/content/posts/2024-07-14-bloom-filter/index.md @@ -24,3 +24,16 @@ data seeks). [wiki]: https://en.wikipedia.org/wiki/Bloom_filter + +## What does it do? + +A _Bloom Filter_ can be understood as a hash-set which can either tell you: + +* An element is _not_ part of the set. +* An element _may be_ part of the set. + +More specifically, one can tweak the parameters of the filter to make it so that +the _false positive_ rate of membership is quite low. + +I won't be going into those calculations here, but they are quite trivial to +compute, or one can just look up appropriate values for their use case. 
From 72057a3224cf0f537d66e698ab9db37bfd7d5044 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sat, 6 Jul 2024 23:36:33 +0100 Subject: [PATCH 09/16] posts: gap-buffer: add insertion --- content/posts/2024-07-06-gap-buffer/index.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/content/posts/2024-07-06-gap-buffer/index.md b/content/posts/2024-07-06-gap-buffer/index.md index ace8fd9..b23f21a 100644 --- a/content/posts/2024-07-06-gap-buffer/index.md +++ b/content/posts/2024-07-06-gap-buffer/index.md @@ -121,3 +121,22 @@ def grow(self, capacity: int) -> None: self._buf = new_buf self._gap_end += added_capacity ``` + +### Insertion + +Inserting text at the cursor's position means filling up the gap in the middle +of the buffer. To do so we must first make sure that the gap is big enough, or +grow the buffer accordingly. + +Then inserting the text is simply a matter of copying its characters in place, +and moving the start of the gap further right. + +```python +def insert(self, val: str) -> None: + # Ensure we have enough space to insert the whole string + if len(val) > self.gap_length: + self.grow(max(self.capacity * 2, self.string_length + len(val))) + # Fill the gap with the given string + self._buf[self._gap_start : self._gap_start + len(val)] = val + self._gap_start += len(val) +``` From 1d37e00b3a9afe578ef4441e0b49ac375dc62a03 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 30 Jun 2024 12:37:48 +0100 Subject: [PATCH 10/16] posts: trie: add removal --- content/posts/2024-06-30-trie/index.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/content/posts/2024-06-30-trie/index.md b/content/posts/2024-06-30-trie/index.md index b8e4679..2a0d77e 100644 --- a/content/posts/2024-06-30-trie/index.md +++ b/content/posts/2024-06-30-trie/index.md @@ -97,3 +97,20 @@ def insert(self, key: str, value: T) -> bool: # Otherwise, recurse on the child corresponding to the first letter return self._children[key[0]].insert(key[1:], 
value) ``` + +### Removal + +Removal should also look familiar. + +```python +def remove(self, key: str) -> bool: + # Have we matched the full key? + if not key: + was_mapped = self._value is not None + # Remove the value + self._value = None + # Return whether it was mapped + return was_mapped + # Otherwise, recurse on the child corresponding to the first letter + return self._children[key[0]].remove(key[1:]) +``` From 798116716f528a5a439d1bc490ec1a955d548e04 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 14 Jul 2024 17:55:15 +0100 Subject: [PATCH 11/16] posts: bloom-filter: add construction --- .../posts/2024-07-14-bloom-filter/index.md | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/content/posts/2024-07-14-bloom-filter/index.md b/content/posts/2024-07-14-bloom-filter/index.md index 0a82882..547d50f 100644 --- a/content/posts/2024-07-14-bloom-filter/index.md +++ b/content/posts/2024-07-14-bloom-filter/index.md @@ -37,3 +37,28 @@ the _false positive_ rate of membership is quite low. I won't be going into those calculations here, but they are quite trivial to compute, or one can just look up appropriate values for their use case. + +## Implementation + +I'll be using Python, which has the nifty ability of representing bitsets +through its built-in big integers quite easily. + +We'll be assuming a `BIT_COUNT` of 64 here, but the implementation can easily be +tweaked to use a different number, or even change it at construction time. + +### Representation + +A `BloomFilter` is just a set of bits and a list of hash functions. 
+ +```python +BIT_COUNT = 64 + +class BloomFilter[T]: + _bits: int + _hash_functions: list[Callable[[T], int]] + + def __init__(self, hash_functions: list[Callable[[T], int]]) -> None: + # Filter is initially empty + self._bits = 0 + self._hash_functions = hash_functions +``` From e05ed1cc4aac43a8c266bbff31a3919b672225e9 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sat, 6 Jul 2024 23:36:46 +0100 Subject: [PATCH 12/16] posts: gap-buffer: add deletion --- content/posts/2024-07-06-gap-buffer/index.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/content/posts/2024-07-06-gap-buffer/index.md b/content/posts/2024-07-06-gap-buffer/index.md index b23f21a..9ca44ea 100644 --- a/content/posts/2024-07-06-gap-buffer/index.md +++ b/content/posts/2024-07-06-gap-buffer/index.md @@ -140,3 +140,22 @@ def insert(self, val: str) -> None: self._buf[self._gap_start : self._gap_start + len(val)] = val self._gap_start += len(val) ``` + +### Deletion + +Removing text from the buffer simply expands the gap in the corresponding +direction, shortening the string's prefix/suffix. This makes it very cheap. + +The methods are named after the `backspace` and `delete` keys on the keyboard. 
+ +```python +def backspace(self, dist: int = 1) -> None: + assert dist <= self.prefix_length + # Extend gap to the left + self._gap_start -= dist + +def delete(self, dist: int = 1) -> None: + assert dist <= self.suffix_length + # Extend gap to the right + self._gap_end += dist +``` From a0e20dd341261cb66ad231bf6cddf92e35b96f24 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 30 Jun 2024 12:38:01 +0100 Subject: [PATCH 13/16] posts: trie: add fuzzy matching --- content/posts/2024-06-30-trie/index.md | 55 ++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/content/posts/2024-06-30-trie/index.md b/content/posts/2024-06-30-trie/index.md index 2a0d77e..aef49e3 100644 --- a/content/posts/2024-06-30-trie/index.md +++ b/content/posts/2024-06-30-trie/index.md @@ -114,3 +114,58 @@ def remove(self, key: str) -> bool: # Otherwise, recurse on the child corresponding to the first letter return self._children[key[0]].remove(key[1:]) ``` + +### Fuzzy matching + +Fuzzily matching a given word is where the real difficulty is: the key is to +realize we can use the prefix-tree nature of a _Trie_ to avoid doing wasteful +work. + +By leveraging the prefix visit order of the tree, we can build an iterative +Levenshtein distance matrix, in much the same way one would do so in its +[Dynamic Programming] implementation (see the [Wagner-Fischer algorithm]). 
+ +[Dynamic Programming]: https://en.wikipedia.org/wiki/Dynamic_programming +[Wagner-Fischer algorithm]: https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm + +```python +class FuzzyResult[T](NamedTuple): + distance: int + key: str + value: T + + +def get_fuzzy(self, key: str, max_distance: int = 0) -> Iterator[FuzzyResult[T]]: + def helper( + current_word: str, + node: Trie[T], + previous_row: list[int], + ) -> Iterator[FuzzyResult[T]]: + # Iterative Levenshtein + current_row = [previous_row[0] + 1] + current_char = current_word[-1] + for column, key_char in enumerate(key, start=1): + insertion = current_row[column - 1] + 1 + deletion = previous_row[column] + 1 + replacement = previous_row[column - 1] + (key_char != current_char) + current_row.append(min(insertion, deletion, replacement)) + + # If we are under the max distance, match this node + if (distance := current_row[-1]) <= max_distance and node._value is not None: + # Only if it has a value of course + yield FuzzyResult(distance, current_word, node._value) + + # If we can potentially still match children, recurse + if min(current_row) <= max_distance: + for c, child in node._children.items(): + yield from helper(current_word + c, child, current_row) + + # Build the first row -- the edit distance from the empty string + row = list(range(len(key) + 1)) + + # Base case for the empty string + if (distance := row[-1]) <= max_distance and self._value is not None: + yield FuzzyResult(distance, "", self._value) + for c, child in self._children.items(): + yield from helper(c, child, row) +``` From 2c31c1aff294231f18f0d2df9a96e4c9878ae5ee Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 14 Jul 2024 17:55:33 +0100 Subject: [PATCH 14/16] posts: bloom-filter: add insertion --- content/posts/2024-07-14-bloom-filter/index.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/content/posts/2024-07-14-bloom-filter/index.md b/content/posts/2024-07-14-bloom-filter/index.md index 547d50f..1d593a7 
100644 --- a/content/posts/2024-07-14-bloom-filter/index.md +++ b/content/posts/2024-07-14-bloom-filter/index.md @@ -62,3 +62,18 @@ class BloomFilter[T]: self._bits = 0 self._hash_functions = hash_functions ``` + +### Inserting a key + +To add an element to the filter, we take the output from each hash function and +use that to set a bit in the filter. This combination of bits will identify the +element, which we can use for lookup later. + +```python +def insert(self, val: T) -> None: + # Iterate over each hash + for f in self._hash_functions: + n = f(val) % BIT_COUNT + # Set the corresponding bit + self._bits |= 1 << n +``` From d1a67510ef975d54b70d18ba368e9d7b37709874 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sat, 6 Jul 2024 23:41:31 +0100 Subject: [PATCH 15/16] posts: gap-buffer: add movement --- content/posts/2024-07-06-gap-buffer/index.md | 30 ++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/content/posts/2024-07-06-gap-buffer/index.md b/content/posts/2024-07-06-gap-buffer/index.md index 9ca44ea..763628d 100644 --- a/content/posts/2024-07-06-gap-buffer/index.md +++ b/content/posts/2024-07-06-gap-buffer/index.md @@ -159,3 +159,33 @@ def delete(self, dist: int = 1) -> None: # Extend gap to the right self._gap_end += dist ``` + +### Moving the cursor + +Moving the cursor along the buffer will shift letters from one side of the gap +to the other, moving them across from prefix to suffix and back. + +I find Python's list slicing not quite as elegant to read as a `memmove`, though +it does make for a very small and efficient implementation. 
+ +```python +def left(self, dist: int = 1) -> None: + assert dist <= self.prefix_length + # Shift the needed number of characters from end of prefix to start of suffix + self._buf[self._gap_end - dist : self._gap_end] = self._buf[ + self._gap_start - dist : self._gap_start + ] + # Adjust indices accordingly + self._gap_start -= dist + self._gap_end -= dist + +def right(self, dist: int = 1) -> None: + assert dist <= self.suffix_length + # Shift the needed number of characters from start of suffix to end of prefix + self._buf[self._gap_start : self._gap_start + dist] = self._buf[ + self._gap_end : self._gap_end + dist + ] + # Adjust indices accordingly + self._gap_start += dist + self._gap_end += dist +``` From 27152689eaae20208cd390e980255d66b09bd0f3 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 14 Jul 2024 17:56:33 +0100 Subject: [PATCH 16/16] posts: bloom-filter: add lookup --- content/posts/2024-07-14-bloom-filter/index.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/content/posts/2024-07-14-bloom-filter/index.md b/content/posts/2024-07-14-bloom-filter/index.md index 1d593a7..93107d4 100644 --- a/content/posts/2024-07-14-bloom-filter/index.md +++ b/content/posts/2024-07-14-bloom-filter/index.md @@ -77,3 +77,21 @@ def insert(self, val: T) -> None: # Set the corresponding bit self._bits |= 1 << n ``` + +### Querying a key + +Because the _Bloom Filter_ does not actually store its elements, but some +derived data from hashing them, it can only definitely say if an element _does +not_ belong to it. Otherwise, it _may_ be part of the set, and should be checked +against the actual underlying store. + +```python +def may_contain(self, val: T) -> bool: + for f in self._hash_functions: + n = f(val) % BIT_COUNT + # If one of the bits is unset, the value is definitely not present + if not (self._bits & (1 << n)): + return False + # All bits were matched, `val` is likely to be part of the set + return True +```