首页 > 代码库 > Bloom filter的实现以及常用的hash函数

Bloom filter的实现以及常用的hash函数

Bloom filter的实现以及常用的hash函数

bloom filter利用时间换空间的思想,利用多个哈希函数,将一个元素的存在状态映射到多个bit中,特别是在网络环境中,BF具有广泛的用途,关键问题就是要减少false positive rate(可以设置参数来调节),扩展有 counting BF。这里选用的hash函数是表现较好的 BKDRHash , SDBMHash, DJBHash 。

Bloom-filter代码:
bloom_filter.h
#ifndef __BLOOM_FILTER_H__
#define __BLOOM_FILTER_H__

#include<stdlib.h>

typedef unsigned int (*hashfunc_t)(const char *, unsigned int len);

typedef struct {
    size_t asize;
    unsigned char *a; //to store the state of existence bits 
    size_t nfuncs;
    hashfunc_t *funcs; // hash funcs
} BLOOM;

BLOOM *bloom_create(size_t size, size_t nfuncs, ...);
int bloom_destroy(BLOOM *bloom);
int bloom_add(BLOOM *bloom, const char *s, unsigned int len);
int bloom_check(BLOOM *bloom, const char *s, unsigned int len);

#endif

bloom_filter.c
#include<limits.h>
#include<stdarg.h>

#include"bloom_filter.h"

#define SETBIT(a, n) (a[n/CHAR_BIT] |= (1<<(n%CHAR_BIT)))
#define GETBIT(a, n) (a[n/CHAR_BIT] & (1<<(n%CHAR_BIT)))

BLOOM *bloom_create(size_t size, size_t nfuncs, ...)
{
    BLOOM *bloom;
    va_list l;
    int n;

    if(!(bloom=malloc(sizeof(BLOOM)))) return NULL;

    // ceil the number of char to malloc for the bloom
    if(!(bloom->a=calloc((size+CHAR_BIT-1)/CHAR_BIT, sizeof(char)))) {
        free(bloom);
        return NULL;
    }
    if(!(bloom->funcs=(hashfunc_t*)malloc(nfuncs*sizeof(hashfunc_t)))) {
        free(bloom->a);
        free(bloom);
        return NULL;
    }

    va_start(l, nfuncs);
    for(n=0; n<nfuncs; ++n) {
        bloom->funcs[n]=va_arg(l, hashfunc_t);
    }
    va_end(l);

    bloom->nfuncs=nfuncs;
    bloom->asize=size;

    return bloom;
}

int bloom_destroy(BLOOM *bloom)
{
    free(bloom->a);
    free(bloom->funcs);
    free(bloom);

    return 0;
}

int bloom_add(BLOOM *bloom, const char *s, unsigned int len)
{
    size_t n;

    for(n=0; n<bloom->nfuncs; ++n) {
        SETBIT(bloom->a, bloom->funcs[n](s, len)%bloom->asize);
    }

    return 0;
}

int bloom_check(BLOOM *bloom, const char *s, unsigned int len)
{
    size_t n;

    for(n=0; n<bloom->nfuncs; ++n) {
        if(!(GETBIT(bloom->a, bloom->funcs[n](s, len)%bloom->asize))) return 0;
    }

    return 1;
}


常用hash函数文件:
#include "global.h"

// Author: Arash Partow - 2002 

unsigned int RSHash(char* str, unsigned int len)
{
   unsigned int b    = 378551;
   unsigned int a    = 63689;
   unsigned int hash = 0;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash = hash * a + (*str);
      a    = a * b;
   }

   return hash;
}
/* End Of RS Hash Function */


unsigned int JSHash(char* str, unsigned int len)
{
   unsigned int hash = 1315423911;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash ^= ((hash << 5) + (*str) + (hash >> 2));
   }

   return hash;
}
/* End Of JS Hash Function */


unsigned int PJWHash(char* str, unsigned int len)
{
   const unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8);
   const unsigned int ThreeQuarters     = (unsigned int)((BitsInUnsignedInt  * 3) / 4);
   const unsigned int OneEighth         = (unsigned int)(BitsInUnsignedInt / 8);
   const unsigned int HighBits          = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
   unsigned int hash              = 0;
   unsigned int test              = 0;
   unsigned int i                 = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash = (hash << OneEighth) + (*str);

      if((test = hash & HighBits)  != 0)
      {
         hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits));
      }
   }

   return hash;
}
/* End Of  P. J. Weinberger Hash Function */


unsigned int ELFHash(char* str, unsigned int len)
{
   unsigned int hash = 0;
   unsigned int x    = 0;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash = (hash << 4) + (*str);
      if((x = hash & 0xF0000000L) != 0)
      {
         hash ^= (x >> 24);
      }
      hash &= ~x;
   }

   return hash;
}
/* End Of ELF Hash Function */

/*This hash function comes from Brian Kernighan and Dennis Ritchie's book "The C Programming Language". 
* It is a simple hash function using a strange set of possible seeds which all constitute a pattern 
* of 31....31...31 etc, it seems to be very similar to the DJB hash function.
*/
unsigned int BKDRHash(char* str, unsigned int len)
{
   unsigned int seed = 131; /* 31 131 1313 13131 131313 etc.. */
   unsigned int hash = 0;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash = (hash * seed) + (*str);
   }

   return hash;
}
/* End Of BKDR Hash Function */

/*This is the algorithm of choice which is used in the open source SDBM project. 
* The hash function seems to have a good over-all distribution for many different 
* data sets. It seems to work well in situations where there is a high variance 
* in the MSBs of the elements in a data set.
*/
unsigned int SDBMHash(char* str, unsigned int len)
{
   unsigned int hash = 0;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash = (*str) + (hash << 6) + (hash << 16) - hash;
   }

   return hash;
}
/* End Of SDBM Hash Function */

/*An algorithm produced by Professor Daniel J. Bernstein and shown first 
* to the world on the usenet newsgroup comp.lang.c. It is one of the most 
* efficient hash functions ever published.
*/
unsigned int DJBHash(char* str, unsigned int len)
{
   unsigned int hash = 5381;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash = ((hash << 5) + hash) + (*str);
   }

   return hash;
}
/* End Of DJB Hash Function */


unsigned int DEKHash(char* str, unsigned int len)
{
   unsigned int hash = len;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash = ((hash << 5) ^ (hash >> 27)) ^ (*str);
   }
   return hash;
}
/* End Of DEK Hash Function */


unsigned int BPHash(char* str, unsigned int len)
{
   unsigned int hash = 0;
   unsigned int i    = 0;
   for(i = 0; i < len; str++, i++)
   {
      hash = hash << 7 ^ (*str);
   }

   return hash;
}
/* End Of BP Hash Function */


unsigned int FNVHash(char* str, unsigned int len)
{
   const unsigned int fnv_prime = 0x811C9DC5;
   unsigned int hash      = 0;
   unsigned int i         = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash *= fnv_prime;
      hash ^= (*str);
   }

   return hash;
}
/* End Of FNV Hash Function */


unsigned int APHash(char* str, unsigned int len)
{
   unsigned int hash = 0xAAAAAAAA;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash ^= ((i & 1) == 0) ? (  (hash <<  7) ^ (*str) * (hash >> 3)) :
                               (~((hash << 11) + (*str) ^ (hash >> 5)));
   }

   return hash;
}
/* End Of AP Hash Function */



测试程序:这里是先对文件分块,而后对指纹进行BF,以达到节省空间的目的。
#include "global.h"

int main(int argc, char ** argv){
    int i = 0, id = 1, result, count=0;
    int sockfd, fd;
    FileInfo *fi=file_new();
    FingerChunk *p;
    BLOOM *bloom = NULL;

    if(argc != 2)
        err_quit("usage:bloom  <file>");

    //creat a bloom filter .TODO what size is best??
    if(!(bloom=bloom_create(2500000, 3, BKDRHash, SDBMHash, DJBHash))) {
        fprintf(stderr, "ERROR: Could not create bloom filter\n");
        return EXIT_FAILURE;
    }

    // chunk the file
    strcpy(fi->file_path, argv[1]);
    chunk_file(fi);

    printf("File size : %lld\n",fi->file_size);
    printf("Chunk Num : %d\n",fi->chunknum);

    p= fi->first;
    while(p){
        char hash[41];
        digestToHash(p->chunk_hash,hash);
        printf("chunklen : %d , Fingerprint : %s\n", p->chunklen, hash); // just 
        //add each signature to the bloom filter 
        bloom_add(bloom, p->chunk_hash, 20);  // 
        p=p->next;
    }

    //How to test the false positive ??
    p = fi->first;
    printf("1.BKDR Hash Function value : %u\n", BKDRHash(p->chunk_hash, 20));
    printf("2.SDBM Hash Function value : %u\n", SDBMHash(p->chunk_hash, 20));
    printf("3.DJB Hash Function value : %u\n", DJBHash(p->chunk_hash, 20));
    bloom_destroy(bloom);
    file_free(fi);
}

参考:
1. http://www.cnblogs.com/-clq/archive/2012/05/31/2528153.html
2. http://www.partow.net/programming/hashfunctions/#top