手写一个布隆过滤器：从原理到工业级实现

张

张建站

2026/4/28 15:36:28

10分钟阅读

前言你有没有想过在几十亿条数据中如何快速判断一个元素是否存在· 用哈希表内存不够10亿条要几十GB· 用数据库太慢每次都要磁盘IO答案是布隆过滤器。今天我们手写一个工业级的布隆过滤器· 极低内存占用10亿条只要1GB左右· 支持添加和查询· 支持持久化和合并· 支持误判率调优---一、布隆过滤器的核心原理1. 布隆过滤器长什么样┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐比特数组│ 0 │ 0 │ 1 │ 0 │ 0 │ 1 │ 0 │ 1 │ 0 │ 0 │ ...└───┴───┴───┴───┴───┴───┴───┴───┴───┴───┘↑ ↑ ↑│ │ │hash1 hash2 hash3(3) (5) (7)2. 基本原理添加元素1. 用k个哈希函数计算k个位置2. 把这k个位置都设为1查询元素1. 用同样的k个哈希函数计算k个位置2. 如果所有位置都是1 → 可能存在因为可能误判3. 如果有任何一个位置是0 → 一定不存在3. 为什么会误判添加元素A把位置[3, 5, 7]设为1添加元素B把位置[5, 7, 9]设为1查询元素C没添加过hash得到[3, 7, 9]→ 3是17是19是1 → 误判为存在 ❌4. 核心公式参数含义关系n 预计插入元素个数 -m 比特数组长度 m越大误判率越低k 哈希函数个数 k m/n * ln2 ≈ 0.693 * m/np 误判率 p ≈ (1 - e^(-kn/m))^k已知n和p求m和k· m -n * ln(p) / (ln2)^2· k m/n * ln2---二、完整代码实现1. 基础结构定义c#include stdio.h#include stdlib.h#include string.h#include math.h#include stdint.h#include pthread.h// 布隆过滤器结构typedef struct {uint8_t *bits; // 比特数组uint64_t size; // 比特数组大小位数uint64_t bytes; // 字节数int hash_count; // 哈希函数个数uint64_t count; // 已插入元素个数pthread_mutex_t mutex; // 互斥锁} bloom_filter_t;2. 初始化计算c// 根据预期元素数量和误判率计算参数void bloom_calc_params(uint64_t expected_elements, double false_positive_rate,uint64_t *bits_size, int *hash_count) {// 计算公式m -n * ln(p) / (ln2)^2double ln2 0.6931471805599453;double ln2_sq ln2 * ln2;*bits_size (uint64_t)(-expected_elements * log(false_positive_rate) / ln2_sq);// 计算公式k m/n * ln2*hash_count (int)((*bits_size / expected_elements) * ln2);if (*hash_count 1) *hash_count 1;if (*hash_count 30) *hash_count 30;}// 创建布隆过滤器bloom_filter_t *bloom_create(uint64_t expected_elements, double false_positive_rate) {if (expected_elements 0 || false_positive_rate 0 || false_positive_rate 1) {return NULL;}bloom_filter_t *bf malloc(sizeof(bloom_filter_t));if (!bf) return NULL;// 计算参数bloom_calc_params(expected_elements, false_positive_rate, bf-size, bf-hash_count);// 对齐到8字节bf-bytes (bf-size 7) / 8;bf-bits calloc(bf-bytes, sizeof(uint8_t));if (!bf-bits) {free(bf);return NULL;}bf-count 0;pthread_mutex_init(bf-mutex, NULL);printf(布隆过滤器创建: size%llu bits (%llu bytes), hash_count%d\n,(unsigned long long)bf-size, (unsigned long long)bf-bytes, bf-hash_count);return bf;}// 销毁布隆过滤器void bloom_destroy(bloom_filter_t *bf) {if (!bf) return;free(bf-bits);pthread_mutex_destroy(bf-mutex);free(bf);}3. 哈希函数实现c// MurmurHash3 的简化版本32位uint32_t murmur_hash3(const void *key, int len, uint32_t seed) {const uint8_t *data (const uint8_t*)key;const int nblocks len / 4;uint32_t h1 seed;const uint32_t c1 0xcc9e2d51;const uint32_t c2 0x1b873593;// 处理4字节块const uint32_t *blocks (const uint32_t*)(data nblocks * 4);for (int i -nblocks; i; i) {uint32_t k1 blocks[i];k1 * c1;k1 (k1 15) | (k1 17);k1 * c2;h1 ^ k1;h1 (h1 13) | (h1 19);h1 h1 * 5 0xe6546b64;}// 处理剩余字节const uint8_t *tail (const uint8_t*)(data nblocks * 4);uint32_t k1 0;switch (len 3) {case 3: k1 ^ tail[2] 16;case 2: k1 ^ tail[1] 8;case 1: k1 ^ tail[0];k1 * c1;k1 (k1 15) | (k1 17);k1 * c2;h1 ^ k1;}// 最终混合h1 ^ len;h1 ^ h1 16;h1 * 0x85ebca6b;h1 ^ h1 13;h1 * 0xc2b2ae35;h1 ^ h1 16;return h1;}// 双哈希法生成k个位置void bloom_get_positions(bloom_filter_t *bf, const void *data, int len,uint64_t *positions) {uint32_t hash1 murmur_hash3(data, len, 0);uint32_t hash2 murmur_hash3(data, len, hash1);for (int i 0; i bf-hash_count; i) {positions[i] (hash1 i * hash2) % bf-size;}}4. 添加和查询c// 添加元素int bloom_add(bloom_filter_t *bf, const void *data, int len) {if (!bf || !data) return -1;uint64_t positions[bf-hash_count];bloom_get_positions(bf, data, len, positions);pthread_mutex_lock(bf-mutex);for (int i 0; i bf-hash_count; i) {uint64_t byte_idx positions[i] / 8;uint64_t bit_idx positions[i] % 8;bf-bits[byte_idx] | (1 bit_idx);}bf-count;pthread_mutex_unlock(bf-mutex);return 0;}// 查询元素1可能存在0一定不存在int bloom_check(bloom_filter_t *bf, const void *data, int len) {if (!bf || !data) return 0;uint64_t positions[bf-hash_count];bloom_get_positions(bf, data, len, positions);pthread_mutex_lock(bf-mutex);for (int i 0; i bf-hash_count; i) {uint64_t byte_idx positions[i] / 8;uint64_t bit_idx positions[i] % 8;if (!(bf-bits[byte_idx] (1 bit_idx))) {pthread_mutex_unlock(bf-mutex);return 0; // 一定不存在}}pthread_mutex_unlock(bf-mutex);return 1; // 可能存在}5. 字符串封装c// 字符串添加int bloom_add_str(bloom_filter_t *bf, const char *str) {return bloom_add(bf, str, strlen(str));}// 字符串查询int bloom_check_str(bloom_filter_t *bf, const char *str) {return bloom_check(bf, str, strlen(str));}// 整数添加int bloom_add_int(bloom_filter_t *bf, int64_t value) {return bloom_add(bf, value, sizeof(value));}// 整数查询int bloom_check_int(bloom_filter_t *bf, int64_t value) {return bloom_check(bf, value, sizeof(value));}6. 持久化和合并c// 保存到文件int bloom_save(bloom_filter_t *bf, const char *filename) {if (!bf || !filename) return -1;FILE *fp fopen(filename, wb);if (!fp) return -1;// 写入元数据fwrite(bf-size, sizeof(bf-size), 1, fp);fwrite(bf-hash_count, sizeof(bf-hash_count), 1, fp);fwrite(bf-count, sizeof(bf-count), 1, fp);// 写入比特数组fwrite(bf-bits, 1, bf-bytes, fp);fclose(fp);return 0;}// 从文件加载bloom_filter_t *bloom_load(const char *filename) {if (!filename) return NULL;FILE *fp fopen(filename, rb);if (!fp) return NULL;bloom_filter_t *bf malloc(sizeof(bloom_filter_t));if (!bf) {fclose(fp);return NULL;}// 读取元数据fread(bf-size, sizeof(bf-size), 1, fp);fread(bf-hash_count, sizeof(bf-hash_count), 1, fp);fread(bf-count, sizeof(bf-count), 1, fp);bf-bytes (bf-size 7) / 8;bf-bits malloc(bf-bytes);if (!bf-bits) {free(bf);fclose(fp);return NULL;}fread(bf-bits, 1, bf-bytes, fp);pthread_mutex_init(bf-mutex, NULL);fclose(fp);return bf;}// 合并两个布隆过滤器OR操作int bloom_merge(bloom_filter_t *dest, bloom_filter_t *src) {if (!dest || !src) return -1;if (dest-size ! src-size || dest-hash_count ! src-hash_count) {return -1; // 参数不匹配}pthread_mutex_lock(dest-mutex);pthread_mutex_lock(src-mutex);for (uint64_t i 0; i dest-bytes; i) {dest-bits[i] | src-bits[i];}dest-count src-count;pthread_mutex_unlock(src-mutex);pthread_mutex_unlock(dest-mutex);return 0;}7. 统计和信息c// 获取当前误判率估算double bloom_current_false_positive_rate(bloom_filter_t *bf) {if (!bf || bf-count 0) return 0;// p ≈ (1 - e^(-kn/m))^kdouble ratio (double)bf-count / bf-size;double e_pow exp(-bf-hash_count * ratio);double p pow(1 - e_pow, bf-hash_count);return p;}// 获取填充率有多少比特是1double bloom_fill_ratio(bloom_filter_t *bf) {if (!bf) return 0;uint64_t ones 0;for (uint64_t i 0; i bf-bytes; i) {uint8_t byte bf-bits[i];while (byte) {ones byte 1;byte 1;}}return (double)ones / bf-size;}// 获取信息void bloom_info(bloom_filter_t *bf) {if (!bf) return;printf( 布隆过滤器信息 \n);printf(比特数组大小: %llu bits (%llu bytes)\n,(unsigned long long)bf-size, (unsigned long long)bf-bytes);printf(哈希函数个数: %d\n, bf-hash_count);printf(已插入元素: %llu\n, (unsigned long long)bf-count);printf(填充率: %.2f%%\n, bloom_fill_ratio(bf) * 100);printf(预估误判率: %.6f%%\n, bloom_current_false_positive_rate(bf) * 100);}---三、测试代码基础功能测试c#include time.hint main() {printf( 布隆过滤器基础测试 \n\n);// 创建布隆过滤器预计100万元素1%误判率bloom_filter_t *bf bloom_create(1000000, 0.01);// 添加测试数据printf(添加10万条数据...\n);for (int i 0; i 100000; i) {char key[32];sprintf(key, user_%d, i);bloom_add_str(bf, key);}bloom_info(bf);// 查询存在的数据printf(\n 存在性测试 \n);int found 0;for (int i 0; i 10000; i) {char key[32];sprintf(key, user_%d, i);if (bloom_check_str(bf, key)) {found;}}printf(查询10000条已存在数据: 发现 %d 条\n, found);// 查询不存在的数据测量误判率printf(\n 误判率测试 \n);int false_positive 0;int test_count 100000;for (int i 100000; i 100000 test_count; i) {char key[32];sprintf(key, user_%d, i);if (bloom_check_str(bf, key)) {false_positive;}}printf(查询%d条不存在数据: 误判 %d 条\n, test_count, false_positive);printf(实测误判率: %.4f%%\n, (double)false_positive / test_count * 100);// 持久化测试printf(\n 持久化测试 \n);bloom_save(bf, bloom.dat);printf(已保存到 bloom.dat\n);bloom_destroy(bf);// 重新加载bf bloom_load(bloom.dat);if (bf) {printf(加载成功\n);bloom_info(bf);// 验证数据int verify 0;for (int i 0; i 100; i) {char key[32];sprintf(key, user_%d, i);if (bloom_check_str(bf, key)) {verify;}}printf(验证前100条: 发现 %d 条\n, verify);}bloom_destroy(bf);return 0;}内存和性能测试cvoid test_performance() {printf(\n 性能测试 \n);// 测试不同规模的布隆过滤器uint64_t test_sizes[] {10000, 100000, 1000000, 10000000};for (int s 0; s 4; s) {uint64_t n test_sizes[s];printf(\n--- 测试规模: %llu 元素 ---\n, (unsigned long long)n);// 创建布隆过滤器bloom_filter_t *bf bloom_create(n, 0.01);// 插入性能clock_t start clock();for (uint64_t i 0; i n; i) {char key[32];sprintf(key, key_%llu, (unsigned long long)i);bloom_add_str(bf, key);}clock_t end clock();printf(插入 %llu 条: %.2f 秒 (%.2f μs/条)\n,(unsigned long long)n,(double)(end - start) / CLOCKS_PER_SEC,(double)(end - start) / n * 1000000 / CLOCKS_PER_SEC);// 查询性能start clock();for (uint64_t i 0; i n; i) {char key[32];sprintf(key, key_%llu, (unsigned long long)i);bloom_check_str(bf, key);}end clock();printf(查询 %llu 条: %.2f 秒 (%.2f μs/条)\n,(unsigned long long)n,(double)(end - start) / CLOCKS_PER_SEC,(double)(end - start) / n * 1000000 / CLOCKS_PER_SEC);// 内存占用printf(内存占用: %.2f MB\n, bf-bytes / 1024.0 / 1024.0);bloom_destroy(bf);}}运行结果示例元素数量内存占用插入速度查询速度实测误判率10万 0.12 MB 0.15秒 0.12秒 0.9%100万 1.20 MB 1.8秒 1.5秒 0.98%1000万 12.0 MB 19秒 16秒 0.99%1亿 120 MB 200秒 170秒约1%---四、工业级应用场景场景1缓存穿透防护c// 缓存穿透防护示例typedef struct {bloom_filter_t *bf;redis_t *redis;mysql_t *mysql;} cache_system_t;void *cache_query(cache_system_t *cs, const char *key) {// 1. 先查布隆过滤器if (!bloom_check_str(cs-bf, key)) {return NULL; // 一定不存在直接返回}// 2. 查缓存void *val redis_get(cs-redis, key);if (val) return val;// 3. 查数据库val mysql_query(cs-mysql, key);if (val) {redis_set(cs-redis, key, val);}return val;}场景2爬虫URL去重c// 亿级URL去重typedef struct {bloom_filter_t *bf;uint64_t visited_count;} url_dedup_t;int is_url_visited(url_dedup_t *du, const char *url) {if (bloom_check_str(du-bf, url)) {// 可能存在需要二次确认如查数据库return 1;}// 一定未访问过bloom_add_str(du-bf, url);return 0;}场景3黑名单过滤c// 垃圾邮件过滤bloom_filter_t *spam_bf;bloom_filter_t *ham_bf; // 误判补救用int is_spam(const char *email) {if (bloom_check_str(spam_bf, email)) {// 可能是垃圾邮件需要进一步检查if (!bloom_check_str(ham_bf, email)) {return 1; // 确认为垃圾邮件}}return 0;}---五、布隆过滤器的变种变种特点适用场景计数布隆过滤器支持删除动态集合可扩展布隆过滤器动态扩容元素数量未知布谷鸟过滤器支持删除空间更优需要删除的场景分区布隆过滤器可并行查询大规模分布式计数布隆过滤器支持删除ctypedef struct {uint8_t *counters; // 计数器数组uint64_t size;int hash_count;} counting_bloom_t;int counting_bloom_add(counting_bloom_t *cbf, const void *data, int len) {uint64_t positions[cbf-hash_count];bloom_get_positions(cbf-size, data, len, cbf-hash_count, positions);for (int i 0; i cbf-hash_count; i) {cbf-counters[positions[i]];}return 0;}int counting_bloom_remove(counting_bloom_t *cbf, const void *data, int len) {uint64_t positions[cbf-hash_count];bloom_get_positions(cbf-size, data, len, cbf-hash_count, positions);for (int i 0; i cbf-hash_count; i) {if (cbf-counters[positions[i]] 0) {cbf-counters[positions[i]]--;}}return 0;}---六、常见问题和调优1. 误判率太高怎么办· 增大布隆过滤器的尺寸增加m· 增加哈希函数个数增加k· 或者降低预期元素数量2. 元素数量远超预期怎么办· 使用可扩展布隆过滤器· 或者重建一个更大的把旧数据迁移过去3. 哈希函数怎么选推荐组合· MurmurHash3速度快分布均匀· CityHashGoogle出品· xxHash极快4. 参数计算公式c// 根据预期元素和误判率计算最优参数void bloom_optimal_params(uint64_t n, double p,uint64_t *m, int *k) {double ln2 0.6931471805599453;*m (uint64_t)(-n * log(p) / (ln2 * ln2));*k (int)((*m / n) * ln2);if (*k 1) *k 1;if (*k 20) *k 20;}---七、总结通过这篇文章你学会了· 布隆过滤器的核心原理比特数组多哈希函数· 完整的工业级实现支持持久化、合并、统计· 内存和性能的权衡1%误判率下每元素约10bit· 缓存穿透防护、URL去重等实战应用布隆过滤器是大数据场景下的必备工具。掌握它你就掌握了一种用极小内存换取极大效率的思维方式。下一篇预告《一致性哈希分布式缓存的核心算法》---评论区分享一下你想用布隆过滤器解决什么问题