From 13f014af9063425ce62220b7e1aac7c9ebec842b Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Fri, 2 Aug 2024 21:03:47 +0100 Subject: [PATCH 1/5] config: enable MathJax --- config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/config.yaml b/config.yaml index c0927fc..b0aadf8 100644 --- a/config.yaml +++ b/config.yaml @@ -67,6 +67,7 @@ params: webmentions: login: belanyi.fr pingback: true + mathjax: true taxonomies: category: "categories" From accd30a9d861b5dab6cc56874db13f31e02ed9e0 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Fri, 2 Aug 2024 21:10:39 +0100 Subject: [PATCH 2/5] markdownlint: relax duplicate header check --- .markdownlint.yaml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .markdownlint.yaml diff --git a/.markdownlint.yaml b/.markdownlint.yaml new file mode 100644 index 0000000..419c334 --- /dev/null +++ b/.markdownlint.yaml @@ -0,0 +1,3 @@ +# MD024/no-duplicate-heading/no-duplicate-header +MD024: + siblings_only: true From 2dfbe3f0aa73a2e4463d2ed815058878817c5687 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Fri, 2 Aug 2024 21:04:30 +0100 Subject: [PATCH 3/5] posts: add 'reservoir-sampling' --- .../2024-08-02-reservoir-sampling/index.md | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 content/posts/2024-08-02-reservoir-sampling/index.md diff --git a/content/posts/2024-08-02-reservoir-sampling/index.md b/content/posts/2024-08-02-reservoir-sampling/index.md new file mode 100644 index 0000000..795c6c1 --- /dev/null +++ b/content/posts/2024-08-02-reservoir-sampling/index.md @@ -0,0 +1,27 @@ +--- +title: "Reservoir Sampling" +date: 2024-08-02T18:30:56+01:00 +draft: false # I don't care for draft mode, git has branches for that +description: "Elegantly sampling a stream" +tags: + - algorithms + - python +categories: + - programming +series: +- Cool algorithms +favorite: false +disable_feed: false +mathjax: true +--- + +[_Reservoir Sampling_][reservoir] is an [online][online], probabilistic +algorithm to uniformly sample $k$ random elements out of a stream of values. + +It's a particularly elegant and small algorithm, only requiring $\Theta(k)$ +amount of space and a single pass through the stream. + +[reservoir]: https://en.wikipedia.org/wiki/Reservoir_sampling +[online]: https://en.wikipedia.org/wiki/Online_algorithm + + From b5d3086542d89ed3feac3451b2ea171b2f4c241f Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Fri, 2 Aug 2024 21:04:49 +0100 Subject: [PATCH 4/5] posts: reservoir-sampling: add one-element sample --- .../2024-08-02-reservoir-sampling/index.md | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/content/posts/2024-08-02-reservoir-sampling/index.md b/content/posts/2024-08-02-reservoir-sampling/index.md index 795c6c1..2196d0e 100644 --- a/content/posts/2024-08-02-reservoir-sampling/index.md +++ b/content/posts/2024-08-02-reservoir-sampling/index.md @@ -25,3 +25,59 @@ amount of space and a single pass through the stream. [online]: https://en.wikipedia.org/wiki/Online_algorithm + +## Sampling one element + +As an introduction, we'll first focus on fairly sampling one element from the +stream. + +```python +def sample_one[T](stream: Iterable[T]) -> T: + stream_iter = iter(stream) + # Sample the first element + res = next(stream_iter) + for i, val in enumerate(stream_iter, start=1): + j = random.randint(0, i) + # Replace the sampled element with probability 1/(i + 1) + if j == 0: + res = val + # Return the randomly sampled element + return res +``` + +### Proof + +Let's now prove that this algorithm leads to a fair sampling of the stream. + +We'll be doing proof by induction. + +#### Hypothesis $H_N$ + +After iterating through the first $N$ items in the stream, +each of them has had an equal $\frac{1}{N}$ probability of being selected as +`res`. + +#### Base Case $H_1$ + +We can trivially observe that the first element is always assigned to `res`, +$\frac{1}{1} = 1$, the hypothesis has been verified. + +#### Inductive Case + +For a given $N$, let us assume that $H_N$ holds. Let us now look at the events +of loop iteration where `i = N` (i.e: observation of the $N + 1$-th item in the +stream). + +`j = random.randint(0, i)` uniformly selects a value in the range $[0, i]$, +a.k.a $[0, N]$. We then have two cases: + +* `j == 0`, with probability $\frac{1}{N + 1}$: we select `val` as the new +reservoir element `res`. + +* `j != 0`, with probability $\frac{N}{N + 1}$: we keep the previous value of +`res`. By $H_N$, any of the first $N$ elements had a $\frac{1}{N}$ probability +of being `res` before at the start of the loop, each element now has a +probability $\frac{1}{N} \cdot \frac{N}{N + 1} = \frac{1}{N + 1}$ of being the +element. + +And thus, we have proven $H_{N + 1}$ at the end of the loop. From d9daa689026c5cbf871a3907f785c4566040e564 Mon Sep 17 00:00:00 2001 From: Bruno BELANYI Date: Fri, 2 Aug 2024 21:10:39 +0100 Subject: [PATCH 5/5] posts: reservoir-sampling: add k-element sampling --- .../2024-08-02-reservoir-sampling/index.md | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/content/posts/2024-08-02-reservoir-sampling/index.md b/content/posts/2024-08-02-reservoir-sampling/index.md index 2196d0e..cd8cf30 100644 --- a/content/posts/2024-08-02-reservoir-sampling/index.md +++ b/content/posts/2024-08-02-reservoir-sampling/index.md @@ -81,3 +81,65 @@ probability $\frac{1}{N} \cdot \frac{N}{N + 1} = \frac{1}{N + 1}$ of being the element. And thus, we have proven $H_{N + 1}$ at the end of the loop. + +## Sampling $k$ element + +The code for sampling $k$ elements is very similar to the one-element case. + +```python +def sample[T](stream: Iterable[T], k: int = 1) -> list[T]: + stream_iter = iter(stream) + # Retain the first 'k' elements in the reservoir + res = list(itertools.islice(stream_iter, k)) + for i, val in enumerate(stream_iter, start=k): + j = random.randint(0, i) + # Replace one element at random with probability k/(i + 1) + if j < k: + res[j] = val + # Return 'k' randomly sampled elements + return res +``` + +### Proof + +Let us once again do a proof by induction, assuming the stream contains at least +$k$ items. + +#### Hypothesis $H_N$ + +After iterating through the first $N$ items in the stream, each of them has had +an equal $\frac{k}{N}$ probability of being sampled from the stream. + +#### Base Case $H_k$ + +We can trivially observe that the first $k$ element are sampled at the start of +the algorithm, $\frac{k}{k} = 1$, the hypothesis has been verified. + +#### Inductive Case + +For a given $N$, let us assume that $H_N$ holds. Let us now look at the events +of the loop iteration where `i = N`, in order to prove $H_{N + 1}$. + +`j = random.randint(0, i)` uniformly selects a value in the range $[0, i]$, +a.k.a $[0, N]$. We then have three cases: + +* `j >= k`, with probability $1 - \frac{k}{N + 1}$: we do not modify the +sampled reservoir at all. + +* `j < k`, with probability $\frac{k}{N + 1}$: we sample the new element to +replace the `j`-th element of the reservoir. Therefore for any element +$e \in [0, k[$ we can either have: + * $j = e$: the element _is_ replaced, probability $\frac{1}{k}$. + * $j \neq e$: the element is _not_ replaced, probability $\frac{k - 1}{k}$. + +We can now compute the probability that a previously sampled element is kept in +the reservoir: +$1 - \frac{k}{N + 1} + \frac{k}{N + 1} \cdot \frac{k - 1}{k} = \frac{N}{N + 1}$. + +By $H_N$, any of the first $N$ elements had a $\frac{k}{N}$ probability +of being sampled before at the start of the loop, each element now has a +probability $\frac{k}{N} \cdot \frac{N}{N + 1} = \frac{k}{N + 1}$ of being the +element. + +We have now proven that all elements have a probability $\frac{k}{N + 1}$ of +being sampled at the end of the loop, therefore $H_{N + 1}$ has been verified.