FastHash.h

Go to the documentation of this file.
00001 #ifndef FastHash_HEADER
00002 #define FastHash_HEADER
00003 
00004 /***************************************************************************
00005  * Benoit Hudson (C) 2007.
00006  ***************************************************************************/
00007 
00008 #ifdef HAVE_CONFIG_H
00009 # include <config.h>
00010 #endif
00011 
00012 #include <assert.h>
00013 #include <boost/utility.hpp>
00014 #include <string.h> // memset
00015 #include <utility> // for pair
00016 #include "SmallMemoryPool.h"
00017 
00018 #include "bit-hacks.h"
00019 #include "hash.h" // namespace hashers
00020 using namespace std;
00021 
00057 namespace hudson {
00058 
00061 struct FHS_Stats {
00062   size_t N;
00063   size_t sumnitems;
00064   size_t sumcapacity;
00065   size_t sumnonempty;
00066   vector<size_t> histogram;
00067   FHS_Stats() {
00068     memset(this, 0, sizeof(*this));
00069   }
00070   void print();
00071 };
00072 extern FHS_Stats global_stats;
00073 
00080 template <typename Key, typename Value> struct KeyValue {
00081   Key key_;
00082   Value value_;
00083   KeyValue(const Key& k): key_(k) { }
00084 
00085   template <typename KeyRef, typename ValueRef> struct Pair {
00086     KeyRef first;
00087     ValueRef second;
00088     Pair(KeyRef f, ValueRef s): first(f), second(s) { }
00089   };
00090   typedef Pair<Key&, Value&> ref;
00091   typedef Pair<const Key&, const Value&> const_ref;
00092   operator ref () { return ref(key_, value_); }
00093   operator const_ref() const { return const_ref(key_, value_); }
00094   operator const Key& () const { return key_; }
00095 
00096   bool operator== (const Key& k) const { return key_ == k; }
00097   bool operator!= (const Key& k) const { return !(key_ == k); }
00098 };
00099 
00106 template <typename Key> struct KeyValue<Key,void> {
00107   Key key_;
00108   KeyValue(const Key& k): key_(k) { }
00109 
00110   typedef Key& ref;
00111   typedef const Key& const_ref;
00112   operator ref () { return key_; }
00113   operator const_ref () const { return key_; } // same as const Key&
00114 
00115   bool operator== (const Key& k) const { return key_ == k; }
00116   bool operator!= (const Key& k) const { return !(key_ == k); }
00117 };
00118 
00125 template <typename Key, typename Value, class hasher, bool NeedsDelete, bool collectStats> class FastHashBase
00126 : public boost::noncopyable {
00127   /***************************************************************************
00128    ***************************************************************************/
00129   typedef hudson::KeyValue<Key,Value> KeyValue;
00130 
00131   struct link : public SmallMemoryItem<link> {
00132     KeyValue key_;
00133     link *next_;
00134     link(const Key& key): key_(key), next_(NULL) { }
00135   };
00136 
00137   public:
00138   struct iterator {
00139     typedef typename KeyValue::ref value_type;
00140 
00141     iterator(link *l): l_(l) { }
00142     iterator(const iterator& it): l_(it.l_) { }
00143     bool operator== (const iterator& other) const { return l_ == other.l_; }
00144     bool operator!= (const iterator& other) const { return l_ != other.l_; }
00145     value_type operator*() const { return value_type(this->l_->key_); }
00146 
00147     private:
00148     link *l_;
00149   };
00150   struct const_iterator {
00151     typedef typename KeyValue::const_ref value_type;
00152 
00153     const_iterator(const link *l): l_(l) { }
00154     const_iterator(const const_iterator& it): l_(it.l_) { }
00155     bool operator== (const const_iterator& other) const { return l_ == other.l_; }
00156     bool operator!= (const const_iterator& other) const { return l_ != other.l_; }
00157     value_type operator*() const { return value_type(this->l_->key_); }
00158 
00159     private:
00160     const link *l_;
00161   };
00162 
00163   /***************************************************************************
00164     * \name Allocating the table.
00165    ***************************************************************************/
00167   private:
00168   void allocateTable() {
00169     if (capacity() == 0) {
00170       table_ = 0;
00171       return;
00172     }
00173     size_t nbytes = sizeof(*table_) * capacity();
00174     table_ = (link**)SmallMemoryPool::allocate_n(nbytes);
00175     memset(table_, 0, nbytes);
00176   }
00177 
00178   void freeTable(link **tab, size_t capacity) {
00179     if (tab != 0) {
00180       size_t nbytes = sizeof(*table_) * capacity;
00181       SmallMemoryPool::deallocate_n(tab, nbytes);
00182     }
00183   }
00185 
00186   public:
00193   FastHashBase(size_t size_hint) {
00194     if (size_hint == 0) {
00195       capacity_ = 0;
00196     } else {
00197       capacity_ = bithacks::ceil_pow2(size_hint);
00198     }
00199     n_ = 0;
00200     allocateTable();
00201   }
00202 
00206   FastHashBase() {
00207     capacity_ = 0;
00208     n_ = 0;
00209     allocateTable();
00210   }
00211 
00219   void reserve(size_t capacity) {
00220     capacity = bithacks::ceil_pow2(capacity);
00221     if (capacity > capacity_) {
00222       resize(capacity);
00223     }
00224   }
00225 
00226   public:
00238   ~FastHashBase() {
00239     if(collectStats) { collectstats(global_stats); }
00240     clear(true);
00241   }
00242 
00249   void clear(bool free_table = false) {
00250     for (size_t i = 0; i < capacity(); ++i) {
00251       link *l = table_[i];
00252       while(l) {
00253         link *next = l->next_;
00254         delete l;
00255         l = next;
00256       }
00257       table_[i] = NULL;
00258     }
00259     if (free_table) {
00260       freeTable(table_, capacity());
00261       capacity_ = 0;
00262       table_ = NULL;
00263     }
00264     n_ = 0;
00265   }
00266 
00267   /***************************************************************************
00268    * Compute some statistics to see if the hash function is good.
00269    * Helps allay fears at the very least.
00270    *
00271    * See FHS_Stats.
00272    ***************************************************************************/
00273   public:
00274   void collectstats(FHS_Stats& s) {
00275     size_t nbuckets = bithacks::log2(n_) + 1;
00276     if (s.histogram.size() < nbuckets) {
00277       s.histogram.resize(nbuckets);
00278     }
00279 
00280     s.N++;
00281     s.sumnitems += n_;
00282     s.sumcapacity += capacity();
00283     for(size_t i = 0; i < capacity(); ++i) {
00284       size_t n = 0;
00285       link *l = table_[i];
00286       while(l) { l = l->next_; n++; }
00287       if (n > 0) {
00288         s.sumnonempty++;
00289         s.histogram[bithacks::log2(n)]++;
00290       }
00291     }
00292   }
00293 
00294   /***************************************************************************
00295    ***************************************************************************/
00296   private:
00297   size_t mask() const {
00298     assert(capacity_ != 0);
00299     return capacity_ - 1;
00300   }
00301 
00302   size_t index(const Key& key) const {
00303     hasher h;
00304     return h(key) & mask();
00305   }
00306   /***************************************************************************
00307    ***************************************************************************/
00308   private:
00309   static link *findLink(const Key& key, link *l) {
00310     while (l && l->key_ != key) {
00311       l = l->next_;
00312     }
00313     return l;
00314   }
00315   static const link *findLink(const Key& key, const link *l) {
00316     while (l && l->key_ != key) {
00317       l = l->next_;
00318     }
00319     return l;
00320   }
00321 
00322   /***************************************************************************
00323     * \name find
00324     * Returns an iterator that points to the element that holds the key.
00325     * Returns end() if there is no such element.
00326    ***************************************************************************/
00328   public:
00329   const_iterator find(const Key& key) const {
00330     if (size() == 0) { return end(); }
00331     size_t i = index(key);
00332     const link *l = findLink(key, table_[i]);
00333     return const_iterator(l);
00334   }
00335   iterator find(const Key& key)  {
00336     if (size() == 0) { return end(); }
00337     size_t i = index(key);
00338     link *l = findLink(key, table_[i]);
00339     return iterator(l);
00340   }
00342 
00343   /***************************************************************************
00344    * Find-or-insert semantics: either insert the key, or return an iterator
00345    * to the previous copy.
00346    ***************************************************************************/
00347   public:
00348   pair<iterator,bool> insert(const Key& key) {
00349     if (capacity() == 0) {
00350       resize();
00351     }
00352 
00353     hasher h;
00354     size_t hashval = h(key);
00355     size_t i = hashval & mask();
00356     link *l = findLink(key, table_[i]);
00357     if (l) {
00358       // found it; not inserting this version of the key
00359       return make_pair(iterator(l), false);
00360     } else {
00361       // Not found; inserting this key.  Check if it fits.
00362       if (n_ == capacity()) {
00363         resize();
00364         i = hashval & mask(); // mask changed
00365       }
00366       link *l = new link(key);
00367       l->next_ = table_[i];
00368       table_[i] = l;
00369       n_++;
00370       return make_pair(iterator(l), true);
00371     }
00372   }
00373 
00377   private:
00378   void resize(size_t newcap) {
00379     link **oldtable = table_;
00380     size_t oldcap = capacity();
00381 
00382     capacity_ = newcap;
00383 
00384     allocateTable();
00385 
00386     // foreach bucket, foreach link, put the link where it belongs.
00387     for(size_t oldi = 0; oldi < oldcap; ++oldi) {
00388       link *l = oldtable[oldi];
00389       while(l) {
00390         link *next = l->next_;
00391         size_t i = index(l->key_);
00392         l->next_ = table_[i];
00393         table_[i] = l;
00394         l = next;
00395       }
00396     }
00397     freeTable(oldtable, oldcap);
00398   }
00399 
00400   void resize() {
00401     size_t oldcap = capacity();
00402 
00403     if (oldcap == 0) {
00404       resize(2);
00405     } else {
00406       resize(oldcap * 2);
00407     }
00408   }
00409 
00411   public:
00412   template <class key_iterator> void insert(key_iterator begin, const key_iterator& end) {
00413     while(begin != end) {
00414       insert(*begin);
00415       ++begin;
00416     }
00417   }
00418 
00419   /***************************************************************************
00420    * Remove the key.
00421    *
00422    * Return whether we removed anything.
00423    *
00424    * TODO: resize if we shrink too much?  At the moment, I do not, because
00425    * in all the applications where I erase, I only erase while growing the
00426    * overall structure.
00427    ***************************************************************************/
00428   public:
00429   bool erase(const Key& key) {
00430     if (size() == 0) {
00431       // this is to handle the zero-capacity case, but might as well optimize
00432       // for the zero-size case too.
00433       return false;
00434     }
00435     size_t i = index(key);
00436     link *l = table_[i];
00437     if (!l) { return false; }
00438     if (l->key_ == key) {
00439       table_[i] = l->next_;
00440       goto remove;
00441     }
00442 
00443     while(1) {
00444       link *prev = l;
00445       l = l->next_;
00446       if (!l) { return false; }
00447       if (l->key_ == key) {
00448         prev->next_ = l->next_;
00449         goto remove;
00450       }
00451     }
00452 remove:
00453     n_--;
00454     delete l;
00455     // TODO: if n falls too low compared to the capacity, resize.
00456     return true;
00457   }
00458 
00459 
00460   /***************************************************************************
00461    ***************************************************************************/
00462   public:
00464   size_t size() const { return n_; }
00465   size_t capacity() const { return capacity_; }
00466 
00467   /******************************
00468    * \name end
00469    *
00470    * An iterator corresponding to a failed search.  Note that there is
00471    * no begin, since the iterator is not a ForwardIterator. */
00473   iterator end() { return iterator(NULL); }
00474   const_iterator end() const { return const_iterator(NULL); }
00476 
00477 
00478   /***************************************************************************
00479    ***************************************************************************/
00480   private:
00481   size_t capacity_; 
00482   size_t n_;    
00483   link **table_;
00484 };
00485 
00486 
00487 
00488 /***************************************************************************
00489  ***************************************************************************
00490  ***************************************************************************/
00491 
00493 template <typename Key, class hasher = hashers::hash<Key>, bool NeedsDelete = true, bool collectStats = false>
00494 class FastHashSet : public FastHashBase<Key, void, hasher, NeedsDelete, collectStats> {
00495   private:
00496     typedef FastHashBase<Key, void, hasher, NeedsDelete, collectStats> super;
00497   public:
00498     FastHashSet() : super() { }
00499     FastHashSet(size_t size_hint) : super(size_hint) { }
00500 
00501     bool has(const Key& k) const { return this->find(k) != this->end(); }
00502 };
00503 
00504 /***************************************************************************
00505  ***************************************************************************/
00506 
00508 template <typename Key, typename Value, class hasher = hashers::hash<Key>, bool NeedsDelete = true, bool collectStats = false>
00509 class FastHashMap : public FastHashBase<Key, Value, hasher, NeedsDelete, collectStats> {
00510   private:
00511     typedef FastHashBase<Key, Value, hasher, NeedsDelete, collectStats> super;
00512   public:
00513     FastHashMap() : super() { }
00514     FastHashMap(size_t size_hint) : super(size_hint) { }
00515 
00517     Value& operator[] (const Key& k) {
00518       // Insert k; take the iterator; deref; get the value.
00519       // All-in-one to avoid temporaries.
00520       return (*(super::insert(k).first)).second;
00521     }
00522 
00523     Value& findOrDie (const Key& k) {
00524       typename super::iterator it = this->find(k);
00525       assert(it != this->end());
00526       return (*it).second;
00527     }
00528 
00529     const Value& findOrDie (const Key& k) const {
00530       typename super::const_iterator it = this->find(k);
00531       assert(it != this->end());
00532       return (*it).second;
00533     }
00534 
00536     pair<typename super::iterator,bool> insert(const Key& key, const Value& val) {
00537       pair<typename super::iterator, bool> p = super::insert(key);
00538       if (p.second) {
00539         (*(p.first)).second = val;
00540       }
00541       return p;
00542     }
00543 
00545     pair<typename super::iterator,bool> insert(const Key& key) {
00546       return super::insert(key);
00547     }
00548 };
00549 
00550 } // namespace hudson
00551 
00552 #endif

Generated on Thu Mar 27 19:04:14 2008 by  doxygen 1.4.6