-- Copyright 1993-1998, by the Cecil Project
-- Department of Computer Science and Engineering, University of Washington
-- See the LICENSE file for license information.

-- These implement open hash tables.

module OpenHashTableImpl {

-- Sentinel bucket values: empty_hash_bucket marks a bucket that has never
-- held a key; removed_hash_bucket marks a bucket whose key was deleted
-- (probing must continue past it when searching).
concrete object empty_hash_bucket;
concrete object removed_hash_bucket;

let use_linear_probing:bool := true;	-- otherwise use quadratic probing

-- Invoke cl on (index, element) for each bucket of v, starting at start_idx
-- and following whichever probing sequence is configured above.
method buckets_in_probing_order_do(v@:vector[`T], start_idx@:int,
				   cl:&(int,T):void):void (** inline **) {
    if(use_linear_probing, {
	v.buckets_in_linear_probing_order_do(start_idx, cl);
    }, {
	v.buckets_in_quadratic_probing_order_do(start_idx, cl);
    });
}

-------------------------------------------------------------------------------
-- Linear Probing:

-- Implements simple linear probing.  We could experiment with more interesting
-- probing schemes.  This returns when the indices wrap around, at which
-- point the client has to take appropriate action based on the operation
-- they are implementing.

method buckets_in_linear_probing_order_do(v@:vector[`T], start_idx_oop@:int,
					  cl:&(int,T):void):void (** inline **) {
    (-- old version: no bounds checks, but calls closure in two places:
	v.range_do(start_idx, v.length, cl);
	v.range_do(0, start_idx, cl);
    --)
    (-- new version, in source code; calls (inlined) closure in only one spot
	let var idx:int := start_idx;
	let len := v.length;
	if(idx < 0 | { idx >= len }, { ^; });
	loop({
	    eval(cl, idx, v!idx);
	    idx := idx.succ;
	    if(idx >= len, {
		-- wrap around
		idx := 0;
	    });
	    if(idx = start_idx, {
		-- we've come back to the start
		-- (do this test in addition to wraparound, to handle the
		-- case of a 1-long vector that wraps around to the start)
		^;
	    });
	});
    --)
    (-- unfortunately, range analysis didn't seem to understand what the
	range of idx was above within the loop, so a bounds check (and error
	code) was introduced at the v!idx call.  so below this is rewritten
	as an RTL primitive for speed and code compactness.
    --)
    prim rtl {
	decl int start_idx := unbox_int start_idx_oop;
	decl int idx := start_idx;
	decl int len := num_elems_int v;
	if idx <_int_log 0 goto done;
	if idx >=_int_log len goto done;
	label loop;
	decl OOP elem := v[idx] OOP;
	decl OOP idx_oop := box_int idx;
	send eval(cl, idx_oop, elem);
	idx := idx +_int 1;
	if idx <_int_log len goto no_wrap_around;
	idx := 0;
	label no_wrap_around;
	if idx !=_int_log start_idx goto loop;
	label done;
	return void;
    }
}

-- These only work for linear probing:

-- Returns index of next bucket to probe, given the current index
method next_probe(v@:vector[`T], idx:int):int (** inline **) {
    assert(use_linear_probing, "shouldn't be used for non-linear probing");
    let next_idx:int := idx + 1;
    if(next_idx >= v.length, { 0 }, { next_idx })
}

-- Returns index of next bucket to probe when going backwards
method previous_probe(v@:vector[`T], idx:int):int (** inline **) {
    assert(use_linear_probing, "shouldn't be used for non-linear probing");
    let prev_idx:int := idx - 1;
    if(prev_idx < 0, { v.length - 1 }, { prev_idx })
}

-------------------------------------------------------------------------------
-- Quadratic Probing:

(-- an alternative probing function that uses quadratic probing.  i.e. search
    elements in this order: start_idx, start_idx+1^2, start_idx-1^2,
    start_idx+2^2, start_idx-2^2, start_idx+3^2, start_idx-3^2, etc.
    (all %v.length).
    note: next_probe and previous_probe can't be called with non-linear
    scanning, since we can't tell (and it's not unique!  that's the whole
    point) what the next/previous bucket will be.
--)
method buckets_in_quadratic_probing_order_do(v@:vector[`T], start_idx@:int,
					     cl:&(int,T):void):void (** inline **) {
    (-- nice, high-level code:
	eval(cl, start_idx, v!start_idx);
	let len := v.length;
	for(1, round_up(sqrt(len), 2)/2, &(i:int){
	    let i_squared := square(i);
	    let idx := (start_idx + i_squared) % len;
	    eval(cl, idx, v!idx);
	    let idx := (start_idx - i_squared) % len;
	    eval(cl, idx, v!idx);
	});
    --)
    -- low-level uglier code, written to avoid bounds checks and to
    -- only invoke the argument closure in one place (to avoid code expansion
    -- through repeated inlining of closures)
    prim rtl {
	decl int len := num_elems_int v;
	if len =_int_log 0 goto done;
	decl int i := 0;
	decl int signed_i_squared := 0;
	decl int untagged_start_idx := unbox_int start_idx;
	label loophead;
	decl int idx := untagged_start_idx +_int signed_i_squared;
	idx := idx %_int len;
	decl OOP val := v[idx] OOP;	-- no bounds check!
	decl OOP tagged_idx := box_int idx;
	send eval(cl, tagged_idx, val);	-- only one call to eval
	if signed_i_squared <=_int_log 0 goto non_positive;
	-- was positive: negate and retry
	signed_i_squared := 0 -_int signed_i_squared;
	goto loophead;
	label non_positive;
	-- was 0 or negative: increment and compute new positive squared value
	i := i +_int 1;
	signed_i_squared := i *_int i;
	if signed_i_squared <_int_log len goto loophead;
	label done;
	return void;
    }
}
--)

-------------------------------------------------------------------------------
-- Other operations:

-- A bunch of prime numbers.  Goes up quickly for small table sizes (factor
-- of 3), then slows down to a factor of 2 around 1000, then grows by 3/2
-- after 4000.
let good_table_sizes:vector[int] :=
    [11, 47, 197, 479, 1031, 2063, 3079, 4721, 6473, 9839, 15559, 24077,
     35729, 45007, 70001, 115001, 180001, 270001, 405001, 607517, 911269,
     1366877, 2050327, 3075481, 4613209, 6919807, 10379711, 15569581,
     23354369, 35031527, 52547321, 78820913, 118231363, 177347039,
     266020583, 399030859, 598546241, 897819337];

let var default_open_table_size:int := good_table_sizes.first;

-- Shared state and sizing policy for open-hashed tables; concrete
-- representations (e.g. hash_table below) add their own parallel storage.
abstract object open_table[Key];

-- key_vector holds the keys; buckets contain a key, empty_hash_bucket, or
-- removed_hash_bucket.  length counts the valid keys currently stored.
private var field key_vector(t@:open_table[`Key]):m_vector[Key] :=
    new_m_vector[Key](default_open_table_size, cast[Key](empty_hash_bucket));
private put var field length(t@:open_table[`Key]):int := 0;

private method should_grow(t@:open_table[`Key]):bool {
    -- more than half full...
    (t.length * 2) > t.key_vector.length &
	{ allowing_updates.updates_allowed(t).not }
}

-- Smallest good size strictly larger than the current key_vector length.
private method grown_size(t@:open_table[`Key]):int {
    let old_size:int := t.key_vector.length;
    good_table_sizes.find(&(new_size:int){ new_size > old_size })
}

-- Smallest good size at least as large as the requested size.
private method prime_table_size(size@:int):int {
    good_table_sizes.find(&(new_size:int){ new_size >= size })
}

private method should_shrink(t@:open_table[`Key]):bool {
    -- less than 1/6 full...
    t.key_vector.length >= 20 &
	{ t.length * 6 < t.key_vector.length &
	    { allowing_updates.updates_allowed(t).not } }
}

-- New size after shrinking: leaves the table roughly 1/3 full.
private method shrunk_size(t@:open_table[`Key]):int {
    let suggested_new_len:int := t.length * 3;
    good_table_sizes.find(&(new_size:int){ new_size > suggested_new_len })
}

-- When we remove an element, we want to try to avoid sticking in
-- removed_hash_bucket, because they slow down future table operations.
-- We are trying to keep the number of removed_hash_bucket values to a minimum.
-- Mark the bucket at idx as no longer holding a key.  Under linear probing,
-- if the following bucket is empty we can make this bucket (and any run of
-- removed markers immediately preceding it) empty too; otherwise a
-- removed_hash_bucket marker must be left so probe sequences stay intact.
method remove_bucket(t@:open_table[`Key], idx:int):void {
    if(use_linear_probing, {
	let next_idx:int := t.key_vector.next_probe(idx);
	if(t.key_vector.fetch(next_idx) == empty_hash_bucket, {
	    -- scrub backwards, converting this bucket and any preceding
	    -- removed markers into empty buckets
	    -- (fix: dropped a dead "next_idx := cur_idx" assignment that the
	    -- parallel hash_table version of remove_bucket never had)
	    let var cur_idx:int := idx;
	    until_false({
		t.key_vector.store(cur_idx, cast[Key](empty_hash_bucket));
		cur_idx := t.key_vector.previous_probe(cur_idx);
	    }, {
		t.key_vector.fetch(cur_idx) == removed_hash_bucket
	    });
	    ^
	});
    });
    t.key_vector.store(idx, cast[Key](removed_hash_bucket));
}

-- True iff t is a real element, i.e. neither of the two sentinel objects.
private method is_valid_bucket_element(t:any):bool {
    t !== empty_hash_bucket & { t !== removed_hash_bucket }
}

method remove_all(t@:open_table[`Key]):void {
    t.length := 0;
    if(t.should_shrink, {
	-- will create fresh, empty buckets
	t.key_vector := new_m_vector[Key](t.shrunk_size,
					  cast[Key](empty_hash_bucket));
    }, {
	-- need to clear those buckets that remain
	t.key_vector.keys_do(&(i:int){
	    t.key_vector.store(i, cast[Key](empty_hash_bucket));
	});
    });
}

-- return a histogram computing, for each key in the key vector, how many
-- probes are necessary to find it.  this gives a sense of how many conflicts
-- there are in the table
method probe_histogram(t@:open_table[`Key <= hashable[Key]]):histogram[int] {
    let h := new_histogram[int]();
    let range := t.key_vector.length;
    t.key_vector.do(&(key:Key){
	if(key.is_valid_bucket_element, {
	    exit(&(exit:&():none){
		let var num_probes:int := 1;
		t.key_vector.buckets_in_probing_order_do(hash(key, range),
							 &(i:int,k:Key){
		    assert(k !== empty_hash_bucket, "should have found key");
		    if(k !== removed_hash_bucket & { key = k }, {
			h.increment(num_probes);
			eval(exit);
		    });
		    num_probes := num_probes.succ;
		});
		error("should have found key");
	    });
	});
    });
    h
}

} -- end module OpenHashTableImpl


module HashTable extends OpenHashTableImpl {

--DOC `hash_table' is an implementation of tables that uses an open
--DOC hashing algorithm, for hashable keys.

-- hash tables using probing, implemented with two parallel vectors to hold
-- the key_vector and value_vector.  Two special sentinel objects,
-- empty_hash_bucket and removed_hash_bucket, are used to indicate special
-- bucket states.  These tables are very space efficient.
-- NOTE: the initial sizes and shrinking and growing sizes must be selected
-- so that there is always at least one empty bucket in the table.  The
-- fetch and store operations will go into an infinite loop if such a bucket
-- does not exist.
template object hash_table[Key <= hashable[Key], Value]
    isa m_removable_table[Key,Value], open_table[Key];

-- parallel vector of values; bucket i of value_vector corresponds to
-- bucket i of key_vector
private var field value_vector(t@:hash_table[`Key,`Value]):m_vector[Value] :=
    new_m_vector[Value](default_open_table_size,
			cast[Value](empty_hash_bucket));

method do_associations(t@:hash_table[`Key,`Value], c:&(Key,Value):void):void {
    do(t.key_vector, t.value_vector, &(k:Key,v:Value){
	if(k.is_valid_bucket_element, {
	    eval(c, k, v);
	});
    });
}

method do_associations_allowing_updates(t@:hash_table[`Key,`Value],
					c:&(Key,Value):void):void {
    -- disable resizing during iteration
    allowing_updates.begin_allowing_updates(t);
    unwind_protect({
	do_associations(t, c);
    }, {
	-- re-enable resizing and perform any resize that was deferred
	allowing_updates.done_allowing_updates(t);
	if(t.should_grow, {
	    t.resize(t.grown_size);
	}, {
	    if(t.should_shrink, {
		t.resize(t.shrunk_size);
	    });
	});
    });
}

method fetch(t@:hash_table[`Key,`Value], key:Key, if_absent:&():Value):Value {
    t.key_vector.buckets_in_probing_order_do(hash(key, t.key_vector.length),
					     &(i:int,k:Key){
	if(k !== empty_hash_bucket, {
	    if(k !== removed_hash_bucket & { key = k }, {
		^ t.value_vector.fetch(i)
	    });
	}, {
	    -- hit a never-used bucket: the key cannot be in the table
	    ^ eval(if_absent)
	});
    });
    -- probed every bucket without finding the key or an empty bucket
    eval(if_absent)
}

-- don't write an optimized fetch_or_init; the if_init closure could
-- recursively call add, resizing the table, and hence invalidating the
-- cached position of where to add the initialized element.
-- rely on the default table::fetch_or_init instead.

-- store looks for an entry with the same key, and replaces it if one is found.
-- if not, it attempts to reuse a slot containing a 'removed_hash_bucket'
-- sentinel, if it saw one while it was searching.  Otherwise, it uses a
-- bucket containing 'empty_hash_bucket'.
-- NOTE(review): the if_absent closure is accepted but never evaluated in
-- this body; presumably the intent is that it runs when the key was not
-- already present -- confirm against the table store protocol.
method store(t@:hash_table[`Key,`Value], key:Key, value:Value,
	     if_absent:&():void):void {
    let var store_pos:int := -1;	-- first reusable (removed) slot seen
    t.key_vector.buckets_in_probing_order_do(hash(key, t.key_vector.length),
					     &(i:int,k:Key){
	if(k !== empty_hash_bucket, {
	    if(k !== removed_hash_bucket, {
		if(key = k, {
		    -- key already present: just overwrite its value
		    t.value_vector.store(i, value);
		    ^
		});
	    }, {
		-- remember the first removed slot for possible reuse
		if(store_pos < 0, { store_pos := i; });
	    });
	}, {
	    -- reached an empty bucket, so the key is not in the table;
	    -- insert into the earliest reusable slot seen (or here)
	    if(store_pos < 0, { store_pos := i; });
	    t.length := t.length.succ;
	    t.key_vector.store(store_pos, key);
	    t.value_vector.store(store_pos, value);
	    -- t.check_correctness;
	    if(t.should_grow, {
		t.resize(t.grown_size);
	    });
	    ^
	});
    });
    -- probed all the way around without finding the key or an empty bucket;
    -- insert into a removed slot recorded along the way
    assert(store_pos >= 0, "should have found a free bucket somewhere");
    t.length := t.length.succ;
    t.key_vector.store(store_pos, key);
    t.value_vector.store(store_pos, value);
    -- t.check_correctness;
    if(t.should_grow, {
	t.resize(t.grown_size);
    });
}

-- When we remove an element, we want to try to avoid sticking in
-- removed_hash_bucket, because they slow down future table operations.
-- We are trying to keep the number of removed_hash_bucket values to a minimum.
-- Remove the entry in bucket idx, clearing both parallel vectors.  Under
-- linear probing, when the following bucket is empty, this bucket and any
-- run of removed markers just before it can be made empty instead of being
-- left as removed markers.
private method remove_bucket(t@:hash_table[`Key, `Value], idx:int):void {
    if(use_linear_probing, {
	let next_idx:int := t.key_vector.next_probe(idx);
	if(t.key_vector.fetch(next_idx) == empty_hash_bucket, {
	    let var cur_idx:int := idx;
	    until_false({
		t.key_vector.store(cur_idx, cast[Key](empty_hash_bucket));
		t.value_vector.store(cur_idx, cast[Value](empty_hash_bucket));
		cur_idx := t.key_vector.previous_probe(cur_idx);
	    }, {
		t.key_vector.fetch(cur_idx) == removed_hash_bucket
	    });
	    ^
	});
    });
    t.key_vector.store(idx, cast[Key](removed_hash_bucket));
    t.value_vector.store(idx, cast[Value](removed_hash_bucket));
}

method remove_key(t@:hash_table[`Key,`Value], key:Key,
		  if_absent:&():`Else):Value|Else {
    t.key_vector.buckets_in_probing_order_do(hash(key, t.key_vector.length),
					     &(i:int,k:Key){
	if(k !== empty_hash_bucket, {
	    if(k !== removed_hash_bucket & { key = k }, {
		let v:Value := t.value_vector.fetch(i);
		t.remove_bucket(i);
		t.length := t.length.pred;
		if(t.should_shrink, {
		    t.resize(t.shrunk_size);
		});
		^ v
	    });
	}, {
	    -- hit a never-used bucket: the key cannot be in the table
	    ^ eval(if_absent)
	});
    });
    eval(if_absent)
}

-- Remove every entry whose key satisfies pred; returns how many were removed.
method remove_keys_if(t@:hash_table[`Key,`Value], pred:&(Key):bool):int {
    let var count:int := 0;
    t.key_vector.do_associations(&(i:int, k:Key){
	if(k.is_valid_bucket_element & { eval(pred, k) }, {
	    t.length := t.length.pred;
	    t.remove_bucket(i);
	    count := count + 1;
	});
    });
    if(t.should_shrink, {
	t.resize(t.shrunk_size);
    });
    count
}

method remove_all(t@:hash_table[`Key,`Value]):void {
    t.length := 0;
    if(t.should_shrink, {
	-- will create fresh, empty buckets
	let new_sz:int := t.shrunk_size;
	t.key_vector := new_m_vector[Key](new_sz,
					  cast[Key](empty_hash_bucket));
	t.value_vector := new_m_vector[Value](new_sz,
					      cast[Value](empty_hash_bucket));
    }, {
	-- need to clear those buckets that remain
	t.key_vector.length.do(&(i:int){
	    t.key_vector.store(i, cast[Key](empty_hash_bucket));
	    t.value_vector.store(i, cast[Value](empty_hash_bucket));
	});
    });
}

-- Rebuild the table with new_size buckets, re-inserting every valid entry.
private method resize(t@:hash_table[`Key,`Value], new_size:int):void {
    -- t.check_correctness;
    let old_len:int := t.length;
    let old_key_vector:m_vector[Key] := t.key_vector;
    let old_value_vector:m_vector[Value] := t.value_vector;
    t.key_vector := new_m_vector[Key](new_size,
				      cast[Key](empty_hash_bucket));
    t.value_vector := new_m_vector[Value](new_size,
					  cast[Value](empty_hash_bucket));
    t.length := 0;
    do(old_key_vector, old_value_vector, &(k:Key, v:Value){
	if(k.is_valid_bucket_element, {
	    t.store(k, v);
	});
    });
    assert(old_len = t.length, "should have the same elements");
    -- t.check_correctness;
}

-- this method can be called to verify the reasonableness of a probing
-- hash table.  currently, it only checks that the key and value tables
-- have parallel empty_hash_bucket entries.
method check_correctness(t@:hash_table[`Key,`Value]):void {
    good_table_sizes.find(&(valid_size:int){
	t.key_vector.length = valid_size
    }, {
	"Table size is not one of the selected prime table sizes!".print_line;
	breakpoint();
    });
    do(t.key_vector, t.value_vector, &(k:Key, v:Value){
	if((k == empty_hash_bucket) != (v == empty_hash_bucket), {
	    "Broken hash table!".print_line;
	    breakpoint();
	});
    });
}

method new_hash_table[Key <= hashable[Key], Value]():hash_table[Key,Value] {
    new_hash_table[Key,Value](default_open_table_size)
}

method new_hash_table[Key <= hashable[Key], Value](size:int)
	:hash_table[Key,Value] {
    -- round the requested size up to one of the good prime sizes
    let real_size := prime_table_size(size);
    concrete object isa hash_table[Key,Value] {
	key_vector := new_m_vector[Key](real_size,
					cast[Key](empty_hash_bucket)),
	value_vector := new_m_vector[Value](real_size,
					    cast[Value](empty_hash_bucket)) }
}

method copy_empty(t@:hash_table[`Key,`Value]):hash_table[Key,Value] {
    new_hash_table[Key,Value](t.key_vector.length)
}

method copy(t@:hash_table[`Key,`Value]):hash_table[Key,Value] {
    concrete object isa hash_table[Key,Value] {
	key_vector := t.key_vector.copy,
	value_vector := t.value_vector.copy,
	length := t.length }
}

--
-- table data structure with different printing behavior
--
template object hash_CR_table[Key <= hashable[Key], Value]
    isa hash_table[Key,Value];

-- print one association per line
method elem_separator(t@:hash_CR_table[`Key,`Value]):string { "\n" }

method new_hash_CR_table[Key <= hashable[Key], Value]()
	:hash_CR_table[Key,Value] {
    new_hash_CR_table[Key,Value](default_open_table_size)
}

method new_hash_CR_table[Key <= hashable[Key], Value](size:int)
	:hash_CR_table[Key,Value] {
    let real_size := prime_table_size(size);
    let t:hash_CR_table[Key,Value] :=
	concrete object isa hash_CR_table[Key,Value] {
	    key_vector := new_m_vector[Key](real_size,
					    cast[Key](empty_hash_bucket)),
	    value_vector := new_m_vector[Value](real_size,
						cast[Value](empty_hash_bucket)) };
    t
}

method copy_empty(c@:hash_CR_table[`Key,`Value]):hash_CR_table[Key,Value] {
    new_hash_CR_table[Key,Value](c.key_vector.length)
}

method copy(t@:hash_CR_table[`Key,`Value]):hash_CR_table[Key,Value] {
    concrete object isa hash_CR_table[Key,Value] {
	key_vector := t.key_vector.copy,
	value_vector := t.value_vector.copy,
	length := t.length }
}

} -- end module HashTable