all lists on lists.proxmox.com
 help / color / mirror / Atom feed
From: Dietmar Maurer <dietmar@proxmox.com>
To: pbs-devel@lists.proxmox.com
Subject: [pbs-devel] [RFC backup] Bloom filter for chunks
Date: Wed, 29 Jul 2020 09:58:55 +0200	[thread overview]
Message-ID: <20200729075855.10470-1-dietmar@proxmox.com> (raw)

---

 I just want to share this idea, but I have no real use case for now...

 Cargo.toml                      |  1 +
 src/tools.rs                    |  1 +
 src/tools/chunk_bloom_filter.rs | 66 +++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+)
 create mode 100644 src/tools/chunk_bloom_filter.rs

diff --git a/Cargo.toml b/Cargo.toml
index 2306395..5268ec1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -62,6 +62,7 @@ walkdir = "2"
 xdg = "2.2"
 zstd = { version = "0.4", features = [ "bindgen" ] }
 nom = "5.1"
+bit-vec = "0.5"
 
 [features]
 default = []
diff --git a/src/tools.rs b/src/tools.rs
index 44db796..f6656f9 100644
--- a/src/tools.rs
+++ b/src/tools.rs
@@ -35,6 +35,7 @@ pub mod timer;
 pub mod statistics;
 pub mod systemd;
 pub mod nom;
+pub mod chunk_bloom_filter;
 
 mod wrapped_reader_stream;
 pub use wrapped_reader_stream::*;
diff --git a/src/tools/chunk_bloom_filter.rs b/src/tools/chunk_bloom_filter.rs
new file mode 100644
index 0000000..32a43c0
--- /dev/null
+++ b/src/tools/chunk_bloom_filter.rs
@@ -0,0 +1,66 @@
+use bit_vec::BitVec;
+
+/// Bloom filter for chunks
+///
+/// This Bloom filter directly uses the chunk digest as hash, split
+/// into 8 parts.
+///
+/// formulas from https://en.wikipedia.org/wiki/Bloom_filter:
+///
+/// k = (m/n) * ln_2 ==> m = (k*n)/ln_2
+///
+/// We use k=8, so false positives rate should be around 2%
+///
+/// Memory usage in bytes = expected_entries*1.44
+pub struct ChunkBloomFilter {
+    bits: BitVec,
+}
+
+impl ChunkBloomFilter {
+
+    fn needed_bits(expected_entries: usize) -> usize {
+        const K: usize = 8;
+
+        (((K * expected_entries) as f64)/core::f64::consts::LN_2) as usize
+    }
+
+    pub fn new(expected_entries: usize) -> Self {
+        let n = Self::needed_bits(expected_entries);
+        Self {
+            bits: BitVec::with_capacity(n)
+        }
+    }
+
+    pub fn insert(&mut self, digest: &[u8; 32]) -> bool {
+
+        let hashes: &[u32; 8] = unsafe { std::mem::transmute(digest) };
+
+        let mut contained = true;
+
+        for i in 0..8 {
+            let index = (hashes[i] as usize) % self.bits.len();
+            if !self.bits.get(index).unwrap() {
+                self.bits.set(index, true);
+                contained = false;
+            }
+        }
+
+        contained
+    }
+
+    pub fn contains(&mut self, digest: &[u8; 32]) -> bool {
+
+        let hashes: &[u32; 8] = unsafe { std::mem::transmute(digest) };
+
+        let mut contained = true;
+
+        for i in 0..8 {
+            let index = (hashes[i] as usize) % self.bits.len();
+            if !self.bits.get(index).unwrap() {
+                contained = false;
+            }
+        }
+
+        contained
+    }
+}
-- 
2.20.1




                 reply	other threads:[~2020-07-29  7:58 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200729075855.10470-1-dietmar@proxmox.com \
    --to=dietmar@proxmox.com \
    --cc=pbs-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal