posts: trie: add fuzzy matching
This commit is contained in:
parent
1d37e00b3a
commit
a0e20dd341
|
@ -114,3 +114,58 @@ def remove(self, key: str) -> bool:
|
||||||
# Otherwise, recurse on the child corresponding to the first letter
|
# Otherwise, recurse on the child corresponding to the first letter
|
||||||
return self._children[key[0]].remove(key[1:])
|
return self._children[key[0]].remove(key[1:])
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Fuzzy matching
|
||||||
|
|
||||||
|
Fuzzily matching a given word is where the real difficulty is: the key is to
|
||||||
|
realize we can use the prefix-tree nature of a _Trie_ to avoid doing wasteful
|
||||||
|
work.
|
||||||
|
|
||||||
|
By leveraging the prefix visit order of the tree, we can build an iterative
|
||||||
|
Levenshtein distance matrix, in much the same way one would do so in its
|
||||||
|
[Dynamic Programming] implementation (see the [Wagner-Fisher algorithm]).
|
||||||
|
|
||||||
|
[Dynamic Programming]: https://en.wikipedia.org/wiki/Dynamic_programming
|
||||||
|
[Wagner-Fisher algorithm]: https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm
|
||||||
|
|
||||||
|
```python
|
||||||
|
class FuzzyResult[T](NamedTuple):
|
||||||
|
distance: int
|
||||||
|
key: str
|
||||||
|
value: T
|
||||||
|
|
||||||
|
|
||||||
|
def get_fuzzy(self, key: str, max_distance: int = 0) -> Iterator[FuzzyResult[T]]:
|
||||||
|
def helper(
|
||||||
|
current_word: str,
|
||||||
|
node: Trie[T],
|
||||||
|
previous_row: list[int],
|
||||||
|
) -> Iterator[tuple[int, T]]:
|
||||||
|
# Iterative Levenshtein
|
||||||
|
current_row = [previous_row[0] + 1]
|
||||||
|
current_char = current_word[-1]
|
||||||
|
for column, key_char in enumerate(key, start=1):
|
||||||
|
insertion = current_row[column - 1] + 1
|
||||||
|
deletion = previous_row[column] + 1
|
||||||
|
replacement = previous_row[column - 1] + (key_char != current_char)
|
||||||
|
current_row.append(min(insertion, deletion, replacement))
|
||||||
|
|
||||||
|
# If we are under the max distance, match this node
|
||||||
|
if (distance := current_row[-1]) <= max_distance and node._value != None:
|
||||||
|
# Only if it has a value of course
|
||||||
|
yield FuzzyResult(distance, current_word, node._value)
|
||||||
|
|
||||||
|
# If we can potentially still match children, recurse
|
||||||
|
if min(current_row) <= max_distance:
|
||||||
|
for c, child in node._children.items():
|
||||||
|
yield from helper(current_word + c, child, current_row)
|
||||||
|
|
||||||
|
# Build the first row -- the edit distance from the empty string
|
||||||
|
row = list(range(len(key) + 1))
|
||||||
|
|
||||||
|
# Base case for the empty string
|
||||||
|
if (distance := row[-1]) <= max_distance and self._value != None:
|
||||||
|
yield FuzzyResult(distance, "", self._value)
|
||||||
|
for c, child in self._children.items():
|
||||||
|
yield from helper(c, child, row)
|
||||||
|
```
|
||||||
|
|
Loading…
Reference in a new issue