From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by lists.proxmox.com (Postfix) with ESMTPS id 2121166F87 for ; Wed, 29 Jul 2020 09:58:58 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 15F2BB5DD for ; Wed, 29 Jul 2020 09:58:58 +0200 (CEST) Received: from elsa.proxmox.com (212-186-127-178.static.upcbusiness.at [212.186.127.178]) by firstgate.proxmox.com (Proxmox) with ESMTP id 2A7CBB5CB for ; Wed, 29 Jul 2020 09:58:57 +0200 (CEST) Received: by elsa.proxmox.com (Postfix, from userid 0) id 0842FAE204A; Wed, 29 Jul 2020 09:58:57 +0200 (CEST) From: Dietmar Maurer To: pbs-devel@lists.proxmox.com Date: Wed, 29 Jul 2020 09:58:55 +0200 Message-Id: <20200729075855.10470-1-dietmar@proxmox.com> X-Mailer: git-send-email 2.20.1 MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-SPAM-LEVEL: Spam detection results: 0 AWL -1.111 Adjusted score from AWL reputation of From: address KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more information. [wikipedia.org, tools.rs] Subject: [pbs-devel] [RFC backup] Bloom filter for chunks X-BeenThere: pbs-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox Backup Server development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 29 Jul 2020 07:58:58 -0000 --- I just want to share this idea, but I have no real use case for now... Cargo.toml | 1 + src/tools.rs | 1 + src/tools/chunk_bloom_filter.rs | 66 +++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 src/tools/chunk_bloom_filter.rs diff --git a/Cargo.toml b/Cargo.toml index 2306395..5268ec1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,6 +62,7 @@ walkdir = "2" xdg = "2.2" zstd = { version = "0.4", features = [ "bindgen" ] } nom = "5.1" +bit-vec = "0.5" [features] default = [] diff --git a/src/tools.rs b/src/tools.rs index 44db796..f6656f9 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -35,6 +35,7 @@ pub mod timer; pub mod statistics; pub mod systemd; pub mod nom; +pub mod chunk_bloom_filter; mod wrapped_reader_stream; pub use wrapped_reader_stream::*; diff --git a/src/tools/chunk_bloom_filter.rs b/src/tools/chunk_bloom_filter.rs new file mode 100644 index 0000000..32a43c0 --- /dev/null +++ b/src/tools/chunk_bloom_filter.rs @@ -0,0 +1,66 @@ +use bit_vec::BitVec; + +/// Bloom filter for chunks +/// +/// This Bloom filter directly uses the chunk digest as hash, split +/// into 8 parts. +/// +/// formulas from https://en.wikipedia.org/wiki/Bloom_filter: +/// +/// k = (m/n) * ln_2 ==> m = (k*n)/ln_2 +/// +/// We use k=8, so false positives rate should be around 2% +/// +/// Memory usage in bytes = expected_entries*1.44 +pub struct ChunkBloomFilter { + bits: BitVec, +} + +impl ChunkBloomFilter { + + fn needed_bits(expected_entries: usize) -> usize { + const K: usize = 8; + + (((K * expected_entries) as f64)/core::f64::consts::LN_2) as usize + } + + pub fn new(expected_entries: usize) -> Self { + let n = Self::needed_bits(expected_entries); + Self { + bits: BitVec::with_capacity(n) + } + } + + pub fn insert(&mut self, digest: &[u8; 32]) -> bool { + + let hashes: &[u32; 8] = unsafe { std::mem::transmute(digest) }; + + let mut contained = true; + + for i in 0..8 { + let index = (hashes[i] as usize) % self.bits.len(); + if !self.bits.get(index).unwrap() { + self.bits.set(index, true); + contained = false; + } + } + + contained + } + + pub fn contains(&mut self, digest: &[u8; 32]) -> bool { + + let hashes: &[u32; 8] = unsafe { std::mem::transmute(digest) }; + + let mut contained = true; + + for i in 0..8 { + let index = (hashes[i] as usize) % self.bits.len(); + if !self.bits.get(index).unwrap() { + contained = false; + } + } + + contained + } +} -- 2.20.1