Compare commits
19 commits
763ee444d4
...
189cdcf05a
Author | SHA1 | Date | |
---|---|---|---|
Bruno BELANYI | 189cdcf05a | ||
Bruno BELANYI | de48eb9e94 | ||
Bruno BELANYI | 27152689ea | ||
Bruno BELANYI | 8e304ec8a9 | ||
Bruno BELANYI | d1a67510ef | ||
Bruno BELANYI | 2c31c1aff2 | ||
Bruno BELANYI | a0e20dd341 | ||
Bruno BELANYI | e05ed1cc4a | ||
Bruno BELANYI | 798116716f | ||
Bruno BELANYI | 1d37e00b3a | ||
Bruno BELANYI | 72057a3224 | ||
Bruno BELANYI | 3992996a89 | ||
Bruno BELANYI | 0084c8717a | ||
Bruno BELANYI | f4a64b2a37 | ||
Bruno BELANYI | 4d69be0633 | ||
Bruno BELANYI | 091e8527e3 | ||
Bruno BELANYI | a4976aeefb | ||
Bruno BELANYI | 239d5c3dbd | ||
Bruno BELANYI | 55982909d2 |
|
@ -78,3 +78,94 @@ def get(self, key: str) -> T | None:
|
||||||
# Otherwise, recurse on the child corresponding to the first letter
|
# Otherwise, recurse on the child corresponding to the first letter
|
||||||
return self._children[key[0]].get(key[1:])
|
return self._children[key[0]].get(key[1:])
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Insertion
|
||||||
|
|
||||||
|
Adding a new value to the _Trie_ is similar to a key lookup, only this time we
|
||||||
|
store the new value instead of returning it.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def insert(self, key: str, value: T) -> bool:
    """Map `key` to `value`, descending one child per character of `key`.

    Returns `True` if a previous mapping for `key` was overwritten, `False`
    if the key was not mapped before this call.
    """
    # Have we matched the full key?
    if not key:
        # A node holding `None` carries no mapping, anything else is an overwrite
        was_mapped = self._value is not None
        # Store the corresponding value
        self._value = value
        # Return whether we've performed an overwrite
        return was_mapped
    # Otherwise, recurse on the child corresponding to the first letter
    # NOTE(review): presumably `_children` creates missing children on lookup
    # (its definition is not shown here) -- confirm
    return self._children[key[0]].insert(key[1:], value)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Removal
|
||||||
|
|
||||||
|
Removal should also look familiar.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def remove(self, key: str) -> bool:
    """Remove the mapping for `key`, descending one child per character.

    Returns `True` if `key` was mapped to a value before the call, `False`
    otherwise.
    """
    # Have we matched the full key?
    if not key:
        # A node holding `None` carries no mapping
        was_mapped = self._value is not None
        # Remove the value
        self._value = None
        # Return whether it was mapped
        return was_mapped
    # Otherwise, recurse on the child corresponding to the first letter
    return self._children[key[0]].remove(key[1:])
|
||||||
|
```
|
||||||
|
|
||||||
|
### Fuzzy matching
|
||||||
|
|
||||||
|
Fuzzily matching a given word is where the real difficulty is: the key is to
|
||||||
|
realize we can use the prefix-tree nature of a _Trie_ to avoid doing wasteful
|
||||||
|
work.
|
||||||
|
|
||||||
|
By leveraging the prefix visit order of the tree, we can build an iterative
|
||||||
|
Levenshtein distance matrix, in much the same way one would do so in its
|
||||||
|
[Dynamic Programming] implementation (see the [Wagner-Fischer algorithm][Wagner-Fisher algorithm]).
|
||||||
|
|
||||||
|
[Dynamic Programming]: https://en.wikipedia.org/wiki/Dynamic_programming
|
||||||
|
[Wagner-Fisher algorithm]: https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm
|
||||||
|
|
||||||
|
```python
|
||||||
|
class FuzzyResult[T](NamedTuple):
|
||||||
|
distance: int
|
||||||
|
key: str
|
||||||
|
value: T
|
||||||
|
|
||||||
|
|
||||||
|
def get_fuzzy(self, key: str, max_distance: int = 0) -> Iterator[FuzzyResult[T]]:
|
||||||
|
def helper(
|
||||||
|
current_word: str,
|
||||||
|
node: Trie[T],
|
||||||
|
previous_row: list[int],
|
||||||
|
) -> Iterator[tuple[int, T]]:
|
||||||
|
# Iterative Levenshtein
|
||||||
|
current_row = [previous_row[0] + 1]
|
||||||
|
current_char = current_word[-1]
|
||||||
|
for column, key_char in enumerate(key, start=1):
|
||||||
|
insertion = current_row[column - 1] + 1
|
||||||
|
deletion = previous_row[column] + 1
|
||||||
|
replacement = previous_row[column - 1] + (key_char != current_char)
|
||||||
|
current_row.append(min(insertion, deletion, replacement))
|
||||||
|
|
||||||
|
# If we are under the max distance, match this node
|
||||||
|
if (distance := current_row[-1]) <= max_distance and node._value != None:
|
||||||
|
# Only if it has a value of course
|
||||||
|
yield FuzzyResult(distance, current_word, node._value)
|
||||||
|
|
||||||
|
# If we can potentially still match children, recurse
|
||||||
|
if min(current_row) <= max_distance:
|
||||||
|
for c, child in node._children.items():
|
||||||
|
yield from helper(current_word + c, child, current_row)
|
||||||
|
|
||||||
|
# Build the first row -- the edit distance from the empty string
|
||||||
|
row = list(range(len(key) + 1))
|
||||||
|
|
||||||
|
# Base case for the empty string
|
||||||
|
if (distance := row[-1]) <= max_distance and self._value != None:
|
||||||
|
yield FuzzyResult(distance, "", self._value)
|
||||||
|
for c, child in self._children.items():
|
||||||
|
yield from helper(c, child, row)
|
||||||
|
```
|
||||||
|
|
191
content/posts/2024-07-06-gap-buffer/index.md
Normal file
191
content/posts/2024-07-06-gap-buffer/index.md
Normal file
|
@ -0,0 +1,191 @@
|
||||||
|
---
|
||||||
|
title: "Gap Buffer"
|
||||||
|
date: 2024-07-06T21:27:19+01:00
|
||||||
|
draft: false # I don't care for draft mode, git has branches for that
|
||||||
|
description: "As featured in GNU Emacs"
|
||||||
|
tags:
|
||||||
|
- algorithms
|
||||||
|
- data structures
|
||||||
|
- python
|
||||||
|
categories:
|
||||||
|
- programming
|
||||||
|
series:
|
||||||
|
- Cool algorithms
|
||||||
|
favorite: false
|
||||||
|
disable_feed: false
|
||||||
|
---
|
||||||
|
|
||||||
|
The [_Gap Buffer_][wiki] is a popular data structure for text editors to
|
||||||
|
represent files and editable buffers, the most famous of them probably being
|
||||||
|
[GNU Emacs][emacs].
|
||||||
|
|
||||||
|
[wiki]: https://en.wikipedia.org/wiki/Gap_buffer
|
||||||
|
[emacs]: https://www.gnu.org/software/emacs/manual/html_node/elisp/Buffer-Gap.html
|
||||||
|
|
||||||
|
<!--more-->
|
||||||
|
|
||||||
|
## What does it do?
|
||||||
|
|
||||||
|
A _Gap Buffer_ is simply a list of characters, similar to a normal string, with
|
||||||
|
the added twist of splitting it into two sides: the prefix and suffix, on either
|
||||||
|
side of the cursor. In between them, a gap is left to allow for quick
|
||||||
|
insertion at the cursor.
|
||||||
|
|
||||||
|
Moving the cursor moves the gap around the buffer, the prefix and suffix getting
|
||||||
|
shorter/longer as required.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
I'll be writing a sample implementation in Python, as with the rest of the
|
||||||
|
[series]({{< ref "/series/cool-algorithms/">}}). I don't think it showcases the
|
||||||
|
elegance of the _Gap Buffer_ in action like a C implementation full of
|
||||||
|
`memmove`s would, but it does make it short and sweet.
|
||||||
|
|
||||||
|
### Representation
|
||||||
|
|
||||||
|
We'll be representing the gap buffer as an actual list of characters.
|
||||||
|
|
||||||
|
Given that Python doesn't _have_ characters, let's settle for a list of strings,
|
||||||
|
each representing a single character...
|
||||||
|
|
||||||
|
```python
|
||||||
|
Char = str


class GapBuffer:
    """Text buffer keeping an empty gap at the cursor, between prefix and suffix."""

    # Backing storage: the prefix, then the gap, then the suffix
    _buf: list[Char]
    # The gap spans the half-open range [start, end) (i.e: buf[start:end])
    _gap_start: int
    _gap_end: int

    # Visual representation of the gap buffer:
    # This is a very [                      ]long string.
    # |<----------------------------------------------->| capacity
    # |<------------>|                       |<-------->| string
    #                 |<------------------->|             gap
    # |<------------>|                                    prefix
    #                                        |<-------->| suffix
    def __init__(self, initial_capacity: int = 16) -> None:
        """Create an empty buffer whose gap covers all `initial_capacity` cells."""
        assert initial_capacity > 0
        # With no text stored yet, the gap is the entire backing list
        self._gap_start, self._gap_end = 0, initial_capacity
        self._buf = [""] * initial_capacity
|
||||||
|
```
|
||||||
|
|
||||||
|
### Accessors
|
||||||
|
|
||||||
|
I'm mostly adding these for exposition, and making it easier to write `assert`s
|
||||||
|
later.
|
||||||
|
|
||||||
|
```python
|
||||||
|
@property
def capacity(self) -> int:
    """Total number of cells in the backing list, gap included."""
    backing = self._buf
    return len(backing)
|
||||||
|
|
||||||
|
@property
def gap_length(self) -> int:
    """Number of cells currently inside the gap."""
    start, end = self._gap_start, self._gap_end
    return end - start
|
||||||
|
|
||||||
|
@property
def string_length(self) -> int:
    """Number of cells holding text: everything that is not in the gap."""
    total = self.capacity
    return total - self.gap_length
|
||||||
|
|
||||||
|
@property
def prefix_length(self) -> int:
    """Number of characters stored before the gap."""
    # The prefix occupies buf[:gap_start], so its length is the gap's start index
    gap_start = self._gap_start
    return gap_start
|
||||||
|
|
||||||
|
@property
def suffix_length(self) -> int:
    """Number of characters stored after the gap."""
    # The suffix occupies buf[gap_end:], i.e. whatever remains past the gap
    gap_end = self._gap_end
    return self.capacity - gap_end
|
||||||
|
```
|
||||||
|
|
||||||
|
### Growing the buffer
|
||||||
|
|
||||||
|
I've written this method in a somewhat non-idiomatic manner, to make it closer
|
||||||
|
to how it would look in C using `realloc` instead.
|
||||||
|
|
||||||
|
It would be more efficient to use slicing to insert the needed extra capacity
|
||||||
|
directly, instead of making a new buffer and copying characters over.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def grow(self, capacity: int) -> None:
    """Reallocate the backing list to `capacity` cells, widening the gap.

    The prefix keeps its position, the suffix is moved to the end of the
    new list, and all of the added cells end up inside the gap.
    """
    assert capacity >= self.capacity
    old_buf = self._buf
    gap_start, old_gap_end = self._gap_start, self._gap_end
    # Every added cell goes to the gap, pushing its end further right
    extra = capacity - len(old_buf)
    new_gap_end = old_gap_end + extra
    # Allocate the new storage, then copy the prefix/suffix over
    grown = [""] * capacity
    grown[:gap_start] = old_buf[:gap_start]
    grown[new_gap_end:] = old_buf[old_gap_end:]
    # Switch over to the new storage
    self._buf = grown
    self._gap_end = new_gap_end
|
||||||
|
```
|
||||||
|
|
||||||
|
### Insertion
|
||||||
|
|
||||||
|
Inserting text at the cursor's position means filling up the gap in the middle
|
||||||
|
of the buffer. To do so we must first make sure that the gap is big enough, or
|
||||||
|
grow the buffer accordingly.
|
||||||
|
|
||||||
|
Then inserting the text is simply a matter of copying its characters in place,
|
||||||
|
and moving the start of the gap further right.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def insert(self, val: str) -> None:
    """Insert `val` at the cursor, i.e. write it at the start of the gap."""
    needed = len(val)
    # Ensure we have enough space to insert the whole string: at least
    # double the capacity, or more if even that would not fit `val`
    if needed > self.gap_length:
        self.grow(max(self.capacity * 2, self.string_length + needed))
    # Copy the characters into the gap, which then starts after them
    write_from = self._gap_start
    write_to = write_from + needed
    self._buf[write_from:write_to] = val
    self._gap_start = write_to
|
||||||
|
```
|
||||||
|
|
||||||
|
### Deletion
|
||||||
|
|
||||||
|
Removing text from the buffer simply expands the gap in the corresponding
|
||||||
|
direction, shortening the string's prefix/suffix. This makes it very cheap.
|
||||||
|
|
||||||
|
The methods are named after the `backspace` and `delete` keys on the keyboard.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def backspace(self, dist: int = 1) -> None:
    """Delete the `dist` characters immediately before the cursor.

    `dist` must be between 0 and the length of the prefix.
    """
    # A negative distance would shrink the gap and expose stale cells
    assert 0 <= dist <= self.prefix_length
    # Extend gap to the left
    self._gap_start -= dist
|
||||||
|
|
||||||
|
def delete(self, dist: int = 1) -> None:
    """Delete the `dist` characters immediately after the cursor.

    `dist` must be between 0 and the length of the suffix.
    """
    # A negative distance would shrink the gap and expose stale cells
    assert 0 <= dist <= self.suffix_length
    # Extend gap to the right
    self._gap_end += dist
|
||||||
|
```
|
||||||
|
|
||||||
|
### Moving the cursor
|
||||||
|
|
||||||
|
Moving the cursor along the buffer will shift letters from one side of the gap
|
||||||
|
to the other, moving them across from prefix to suffix and back.
|
||||||
|
|
||||||
|
I find Python's list slicing not quite as elegant to read as a `memmove`, though
|
||||||
|
it does make for a very small and efficient implementation.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def left(self, dist: int = 1) -> None:
    """Move the cursor `dist` characters to the left.

    `dist` must be between 0 and the length of the prefix.
    """
    # A negative distance would shift the indices without moving any character
    assert 0 <= dist <= self.prefix_length
    # Shift the needed number of characters from end of prefix to start of suffix
    # (the right-hand slice is a copy, so overlapping ranges are safe)
    self._buf[self._gap_end - dist : self._gap_end] = self._buf[
        self._gap_start - dist : self._gap_start
    ]
    # Adjust indices accordingly
    self._gap_start -= dist
    self._gap_end -= dist
|
||||||
|
|
||||||
|
def right(self, dist: int = 1) -> None:
    """Move the cursor `dist` characters to the right.

    `dist` must be between 0 and the length of the suffix.
    """
    # A negative distance would shift the indices without moving any character
    assert 0 <= dist <= self.suffix_length
    # Shift the needed number of characters from start of suffix to end of prefix
    # (the right-hand slice is a copy, so overlapping ranges are safe)
    self._buf[self._gap_start : self._gap_start + dist] = self._buf[
        self._gap_end : self._gap_end + dist
    ]
    # Adjust indices accordingly
    self._gap_start += dist
    self._gap_end += dist
|
||||||
|
```
|
97
content/posts/2024-07-14-bloom-filter/index.md
Normal file
97
content/posts/2024-07-14-bloom-filter/index.md
Normal file
|
@ -0,0 +1,97 @@
|
||||||
|
---
|
||||||
|
title: "Bloom Filter"
|
||||||
|
date: 2024-07-14T17:46:40+01:00
|
||||||
|
draft: false # I don't care for draft mode, git has branches for that
|
||||||
|
description: "Probably cool"
|
||||||
|
tags:
|
||||||
|
- algorithms
|
||||||
|
- data structures
|
||||||
|
- python
|
||||||
|
categories:
|
||||||
|
- programming
|
||||||
|
series:
|
||||||
|
- Cool algorithms
|
||||||
|
favorite: false
|
||||||
|
disable_feed: false
|
||||||
|
---
|
||||||
|
|
||||||
|
The [_Bloom Filter_][wiki] is a probabilistic data structure for set membership.
|
||||||
|
|
||||||
|
The filter can be used as an inexpensive first step when querying the actual
|
||||||
|
data is quite costly (e.g: as a first check for expensive cache lookups or large
|
||||||
|
data seeks).
|
||||||
|
|
||||||
|
[wiki]: https://en.wikipedia.org/wiki/Bloom_filter
|
||||||
|
|
||||||
|
<!--more-->
|
||||||
|
|
||||||
|
## What does it do?
|
||||||
|
|
||||||
|
A _Bloom Filter_ can be understood as a hash-set which can either tell you:
|
||||||
|
|
||||||
|
* An element is _not_ part of the set.
|
||||||
|
* An element _may be_ part of the set.
|
||||||
|
|
||||||
|
More specifically, one can tweak the parameters of the filter to make it so that
|
||||||
|
the _false positive_ rate of membership is quite low.
|
||||||
|
|
||||||
|
I won't be going into those calculations here, but they are quite trivial to
|
||||||
|
compute, or one can just look up appropriate values for their use case.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
I'll be using Python, which has the nifty ability of representing bitsets
|
||||||
|
through its built-in big integers quite easily.
|
||||||
|
|
||||||
|
We'll be assuming a `BIT_COUNT` of 64 here, but the implementation can easily be
|
||||||
|
tweaked to use a different number, or even change it at construction time.
|
||||||
|
|
||||||
|
### Representation
|
||||||
|
|
||||||
|
A `BloomFilter` is just a set of bits and a list of hash functions.
|
||||||
|
|
||||||
|
```python
|
||||||
|
BIT_COUNT = 64
|
||||||
|
|
||||||
|
class BloomFilter[T]:
|
||||||
|
_bits: int
|
||||||
|
_hash_functions: list[Callable[[T], int]]
|
||||||
|
|
||||||
|
def __init__(self, hash_functions: list[Callable[[T], int]]) -> None:
|
||||||
|
# Filter is initially empty
|
||||||
|
self._bits = 0
|
||||||
|
self._hash_functions = hash_functions
|
||||||
|
```
|
||||||
|
|
||||||
|
### Inserting a key
|
||||||
|
|
||||||
|
To add an element to the filter, we take the output from each hash function and
|
||||||
|
use that to set a bit in the filter. This combination of bits will identify the
|
||||||
|
element, which we can use for lookup later.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def insert(self, val: T) -> None:
    """Add `val` to the filter by setting one bit per hash function."""
    # Iterate over each hash
    for f in self._hash_functions:
        # Reduce the hash to a valid bit index
        n = f(val) % BIT_COUNT
        # Set the corresponding bit
        self._bits |= 1 << n
|
||||||
|
```
|
||||||
|
|
||||||
|
### Querying a key
|
||||||
|
|
||||||
|
Because the _Bloom Filter_ does not actually store its elements, but some
|
||||||
|
derived data from hashing them, it can only definitely say if an element _does
|
||||||
|
not_ belong to it. Otherwise, it _may_ be part of the set, and should be checked
|
||||||
|
against the actual underlying store.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def may_contain(self, val: T) -> bool:
    """Tell whether `val` may have been inserted into the filter.

    Returns `False` when `val` is definitely absent, `True` when every bit
    derived from it is set (which can also be a false positive).
    """
    for f in self._hash_functions:
        # Reduce the hash to a valid bit index
        n = f(val) % BIT_COUNT
        # If one of the bits is unset, the value is definitely not present
        if not (self._bits & (1 << n)):
            return False
    # All bits were matched, `val` is likely to be part of the set
    return True
|
||||||
|
```
|
Loading…
Reference in a new issue