From 8e5ac9d1e69de327f6e3b5c3bf2873be329d537b Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 30 Jun 2024 12:35:08 +0100 Subject: [PATCH 1/7] posts: add trie --- content/posts/2024-06-30-trie/index.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 content/posts/2024-06-30-trie/index.md diff --git a/content/posts/2024-06-30-trie/index.md b/content/posts/2024-06-30-trie/index.md new file mode 100644 index 0000000..a4e8959 --- /dev/null +++ b/content/posts/2024-06-30-trie/index.md @@ -0,0 +1,23 @@ +--- +title: "Trie" +date: 2024-06-30T11:07:49+01:00 +draft: false # I don't care for draft mode, git has branches for that +description: "A cool map" +tags: + - algorithms + - data structures + - python +categories: + - programming +series: + - Cool algorithms +favorite: false +disable_feed: false +--- + +This time, let's talk about the [_Trie_][wiki], which is a tree-based mapping +structure most often used for string keys. + +[wiki]: https://en.wikipedia.org/wiki/Trie + + From 53b968e36ce899aaf64537ccb15492bee9c10319 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 30 Jun 2024 12:36:17 +0100 Subject: [PATCH 2/7] posts: trie: add presentation --- content/posts/2024-06-30-trie/index.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/content/posts/2024-06-30-trie/index.md b/content/posts/2024-06-30-trie/index.md index a4e8959..32b6fb8 100644 --- a/content/posts/2024-06-30-trie/index.md +++ b/content/posts/2024-06-30-trie/index.md @@ -21,3 +21,16 @@ structure most often used for string keys. [wiki]: https://en.wikipedia.org/wiki/Trie + +## What does it do? + +A _Trie_ can be used to map a set of string keys to their corresponding values, +without the need for a hash function. This also means you won't suffer from hash +collisions, though the tree-based structure will probably translate to slower +performance than a good hash table. 
+ +A _Trie_ is especially useful to represent a dictionary of words in the case of +spell correction, as it can easily be used to fuzzy match words under a given +edit distance (think [Levenshtein distance]). + +[Levenshtein distance]: https://en.wikipedia.org/wiki/Levenshtein_distance From 674410694028d4e6413d2c82a638b1d755e024de Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 30 Jun 2024 12:36:42 +0100 Subject: [PATCH 3/7] posts: trie: add construction --- content/posts/2024-06-30-trie/index.md | 29 ++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/content/posts/2024-06-30-trie/index.md b/content/posts/2024-06-30-trie/index.md index 32b6fb8..584e856 100644 --- a/content/posts/2024-06-30-trie/index.md +++ b/content/posts/2024-06-30-trie/index.md @@ -34,3 +34,32 @@ spell correction, as it can easily be used to fuzzy match words under a given edit distance (think [Levenshtein distance]). [Levenshtein distance]: https://en.wikipedia.org/wiki/Levenshtein_distance + +## Implementation + +This implementation will be in Python for exposition purposes, even though +it already has a built-in `dict`. + +### Representation + +Creating a new `Trie` is easy: the root node starts off empty and without any +mapped values. + +```python +class Trie[T]: + _children: dict[str, Trie[T]] + _value: T | None + + def __init__(self): + # Each letter is mapped to a Trie + self._children = defaultdict(Trie) + # If we match a full string, we store the mapped value + self._value = None +``` + +We're using a `defaultdict` for the children for ease of implementation in this +post. In reality, I would encourage you to exit early when you can't match a given +character. + +The string key will be implicit by the position of a node in the tree: the empty +string at the root, one-character strings as its direct children, etc...
From 7e9fd69cced39d50ff0195a85ec33d19a391b18c Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 30 Jun 2024 12:37:04 +0100 Subject: [PATCH 4/7] posts: trie: add search --- content/posts/2024-06-30-trie/index.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/content/posts/2024-06-30-trie/index.md b/content/posts/2024-06-30-trie/index.md index 584e856..968aa0e 100644 --- a/content/posts/2024-06-30-trie/index.md +++ b/content/posts/2024-06-30-trie/index.md @@ -63,3 +63,18 @@ character. The string key will be implicit by the position of a node in the tree: the empty string at the root, one-character strings as its direct children, etc... + +### Search + +An exact match look-up is easily done: we go down the tree until we've exhausted +the key. At that point we've either found a mapped value or not. + +```python +def get(self, key: str) -> T | None: + # Have we matched the full key? + if not key: + # Return the `T` if mapped, `None` otherwise + return self._value + # Otherwise, recurse on the child corresponding to the first letter + return self._children[key[0]].get(key[1:]) +``` From 55982909d2ce8b7f3cc67de3d97f1982316ecc41 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 30 Jun 2024 12:37:21 +0100 Subject: [PATCH 5/7] posts: trie: add insertion --- content/posts/2024-06-30-trie/index.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/content/posts/2024-06-30-trie/index.md b/content/posts/2024-06-30-trie/index.md index 968aa0e..b8e4679 100644 --- a/content/posts/2024-06-30-trie/index.md +++ b/content/posts/2024-06-30-trie/index.md @@ -78,3 +78,22 @@ def get(self, key: str) -> T | None: # Otherwise, recurse on the child corresponding to the first letter return self._children[key[0]].get(key[1:]) ``` + +### Insertion + +Adding a new value to the _Trie_ is similar to a key lookup, only this time we +store the new value instead of returning it.
+ +```python +def insert(self, key: str, value: T) -> bool: + # Have we matched the full key? + if not key: + # Check whether we're overwriting a previous mapping + was_mapped = self._value is not None + # Store the corresponding value + self._value = value + # Return whether we've performed an overwrite + return was_mapped + # Otherwise, recurse on the child corresponding to the first letter + return self._children[key[0]].insert(key[1:], value) +``` From 1d37e00b3a9afe578ef4441e0b49ac375dc62a03 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 30 Jun 2024 12:37:48 +0100 Subject: [PATCH 6/7] posts: trie: add removal --- content/posts/2024-06-30-trie/index.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/content/posts/2024-06-30-trie/index.md b/content/posts/2024-06-30-trie/index.md index b8e4679..2a0d77e 100644 --- a/content/posts/2024-06-30-trie/index.md +++ b/content/posts/2024-06-30-trie/index.md @@ -97,3 +97,20 @@ def insert(self, key: str, value: T) -> bool: # Otherwise, recurse on the child corresponding to the first letter return self._children[key[0]].insert(key[1:], value) ``` + +### Removal + +Removal should also look familiar. + +```python +def remove(self, key: str) -> bool: + # Have we matched the full key?
+ if not key: + was_mapped = self._value is not None + # Remove the value + self._value = None + # Return whether it was mapped + return was_mapped + # Otherwise, recurse on the child corresponding to the first letter + return self._children[key[0]].remove(key[1:]) +``` From a0e20dd341261cb66ad231bf6cddf92e35b96f24 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Sun, 30 Jun 2024 12:38:01 +0100 Subject: [PATCH 7/7] posts: trie: add fuzzy matching --- content/posts/2024-06-30-trie/index.md | 55 ++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/content/posts/2024-06-30-trie/index.md b/content/posts/2024-06-30-trie/index.md index 2a0d77e..aef49e3 100644 --- a/content/posts/2024-06-30-trie/index.md +++ b/content/posts/2024-06-30-trie/index.md @@ -114,3 +114,58 @@ def remove(self, key: str) -> bool: # Otherwise, recurse on the child corresponding to the first letter return self._children[key[0]].remove(key[1:]) ``` + +### Fuzzy matching + +Fuzzily matching a given word is where the real difficulty is: the key is to +realize we can use the prefix-tree nature of a _Trie_ to avoid doing wasteful +work. + +By leveraging the prefix visit order of the tree, we can build an iterative +Levenshtein distance matrix, in much the same way one would do so in its +[Dynamic Programming] implementation (see the [Wagner-Fischer algorithm][Wagner-Fisher algorithm]).
+ +[Dynamic Programming]: https://en.wikipedia.org/wiki/Dynamic_programming +[Wagner-Fisher algorithm]: https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm + +```python +class FuzzyResult[T](NamedTuple): + distance: int + key: str + value: T + + +def get_fuzzy(self, key: str, max_distance: int = 0) -> Iterator[FuzzyResult[T]]: + def helper( + current_word: str, + node: Trie[T], + previous_row: list[int], + ) -> Iterator[FuzzyResult[T]]: + # Iterative Levenshtein + current_row = [previous_row[0] + 1] + current_char = current_word[-1] + for column, key_char in enumerate(key, start=1): + insertion = current_row[column - 1] + 1 + deletion = previous_row[column] + 1 + replacement = previous_row[column - 1] + (key_char != current_char) + current_row.append(min(insertion, deletion, replacement)) + + # If we are within the max distance, match this node + if (distance := current_row[-1]) <= max_distance and node._value is not None: + # Only if it has a value of course + yield FuzzyResult(distance, current_word, node._value) + + # If we can potentially still match children, recurse + if min(current_row) <= max_distance: + for c, child in node._children.items(): + yield from helper(current_word + c, child, current_row) + + # Build the first row -- the edit distance from the empty string + row = list(range(len(key) + 1)) + + # Base case for the empty string + if (distance := row[-1]) <= max_distance and self._value is not None: + yield FuzzyResult(distance, "", self._value) + for c, child in self._children.items(): + yield from helper(c, child, row) +```