From: Dietmar Maurer <dietmar@proxmox.com>
To: pbs-devel@lists.proxmox.com
Subject: [pbs-devel] [RFC backup] Bloom filter for chunks
Date: Wed, 29 Jul 2020 09:58:55 +0200 [thread overview]
Message-ID: <20200729075855.10470-1-dietmar@proxmox.com> (raw)
---
I just want to share this idea, but I have no real use case for now...
Cargo.toml | 1 +
src/tools.rs | 1 +
src/tools/chunk_bloom_filter.rs | 66 +++++++++++++++++++++++++++++++++
3 files changed, 68 insertions(+)
create mode 100644 src/tools/chunk_bloom_filter.rs
diff --git a/Cargo.toml b/Cargo.toml
index 2306395..5268ec1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -62,6 +62,7 @@ walkdir = "2"
xdg = "2.2"
zstd = { version = "0.4", features = [ "bindgen" ] }
nom = "5.1"
+bit-vec = "0.5"
[features]
default = []
diff --git a/src/tools.rs b/src/tools.rs
index 44db796..f6656f9 100644
--- a/src/tools.rs
+++ b/src/tools.rs
@@ -35,6 +35,7 @@ pub mod timer;
pub mod statistics;
pub mod systemd;
pub mod nom;
+pub mod chunk_bloom_filter;
mod wrapped_reader_stream;
pub use wrapped_reader_stream::*;
diff --git a/src/tools/chunk_bloom_filter.rs b/src/tools/chunk_bloom_filter.rs
new file mode 100644
index 0000000..32a43c0
--- /dev/null
+++ b/src/tools/chunk_bloom_filter.rs
@@ -0,0 +1,66 @@
+use bit_vec::BitVec;
+
+/// Bloom filter for chunks
+///
+/// This Bloom filter directly uses the chunk digest as hash, split
+/// into 8 parts.
+///
+/// formulas from https://en.wikipedia.org/wiki/Bloom_filter:
+///
+/// k = (m/n) * ln_2 ==> m = (k*n)/ln_2
+///
+/// We use k=8, so false positives rate should be around 2%
+///
+/// Memory usage in bytes = expected_entries*1.44
+pub struct ChunkBloomFilter {
+ bits: BitVec,
+}
+
+impl ChunkBloomFilter {
+
+ fn needed_bits(expected_entries: usize) -> usize {
+ const K: usize = 8;
+
+ (((K * expected_entries) as f64)/core::f64::consts::LN_2) as usize
+ }
+
+ pub fn new(expected_entries: usize) -> Self {
+ let n = Self::needed_bits(expected_entries);
+ Self {
+ bits: BitVec::with_capacity(n)
+ }
+ }
+
+ pub fn insert(&mut self, digest: &[u8; 32]) -> bool {
+
+ let hashes: &[u32; 8] = unsafe { std::mem::transmute(digest) };
+
+ let mut contained = true;
+
+ for i in 0..8 {
+ let index = (hashes[i] as usize) % self.bits.len();
+ if !self.bits.get(index).unwrap() {
+ self.bits.set(index, true);
+ contained = false;
+ }
+ }
+
+ contained
+ }
+
+ pub fn contains(&mut self, digest: &[u8; 32]) -> bool {
+
+ let hashes: &[u32; 8] = unsafe { std::mem::transmute(digest) };
+
+ let mut contained = true;
+
+ for i in 0..8 {
+ let index = (hashes[i] as usize) % self.bits.len();
+ if !self.bits.get(index).unwrap() {
+ contained = false;
+ }
+ }
+
+ contained
+ }
+}
--
2.20.1
reply other threads:[~2020-07-29 7:58 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200729075855.10470-1-dietmar@proxmox.com \
--to=dietmar@proxmox.com \
--cc=pbs-devel@lists.proxmox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.