# string type optimized for use in databases and small strings # # this is a implementation of the UmbraDB string type (also known as german string) # # when to use: # - you know that your strings tend to be short (<= 12 bytes) # - you need to use [.has-prefix] or [==] on a lot of strings: # if you have a hash table of strings, remember that it only uses # the comparision operators if there are hash collissions # # how it works: # - the length of the string is stored on the stack (4B) # - if the string fits into 12 bytes, it is stored as [UmbraShort] string, # which stores the 12 bytes of the string on the stack # - if the string does not fit into 12 bytes, it is stored as [UmbraLong] string, # which stores the first 4 bytes of the string on the stack, # and the whole string (including the first 4 bytes!) on the heap # # problems with the implementation: # - the compiler stores the type of the variant in the struct, # even though it can be gotten from the length. # that does not have a big performance impact in most applications # - for [UmbraLong]: the [.nth] implementation loads from the prefix if the index is less than 4, # which is good in many scenarios, but the compiler does not know that it is # just as save to load from the memory directly (because of the if expression), # which hurts vectorization a lot. # If you know that that might happen, you might be able to use the [addr] function # # implementor's note: # - there should be an alternative string implementation optimizes for even shorter strings (8 bytes), # which does not store the prefix # - maybe null-terminate the string if, any only if, it is stored as [UmbraLong] type UmbraShortLong (UmbraShort( arr:Array )) | (UmbraLong(prefix:Array , ptr:U8[])); type Umbra (Umbra( len:U32 , backing:UmbraShortLong )); .length := λ(: s Umbra). (: ( (as (.len s) U64) ) U64); # using this in a vectorizable loop can break vectorization # if you know that your loop is vectorizable, consider using [addr] [] := λ(: s Umbra)(: i U64). (: ( (let result 0_u8) (match (.backing s) ( () ( (UmbraShort( arr )) (set result ([]( arr i ))) ) ( (UmbraLong( prefix ptr )) (set result (if (<(i 4_u64)) ([]( prefix i )) ([]( (as ptr U8[]) i )) )) ) )) result ) U8); # the returned pointer is NOT a C string! # the returned array is only valid for [.length] bytes # for [UmbraLong], returns the pointer to the heap data # for [UmbraShort], returns the pointer to the on-stack data addr := λ(: s Umbra). (: ( (let res (as 0_u64 U8[])) (match (.backing s) ( () ( (UmbraShort( arr )) (set res (as (& arr) U8[])) ) ( (UmbraLong( _ ptr )) (set res ptr) ) )) res ) U8[]); print := λ(: x Umbra). (: ( (let idx 0_u64) (let ptr (addr x)) (while (<(idx (.length x))) ( (putchar(as ([]( ptr idx )) U32)) (set idx (+( idx 1_u64 ))) )) ) Nil); short-prefix-matches := λ(: a Umbra)(: b Umbra). (: ( (&&((&&((==(([](a 0_u64)) ([](b 0_u64)))) (==(([](a 1_u64)) ([](b 1_u64)))))) (&&((==(([](a 2_u64)) ([](b 2_u64)))) (==(([](a 3_u64)) ([](b 3_u64)))))))) ) U64); # performance note: this is extremly fast if the pfx string is known to be <= 4 bytes at compile time .has-prefix := λ(: base Umbra)(: pfx Umbra). (: ( (if (>((.length pfx) (.length base))) (0_u64) ( (&&((short-prefix-matches(base pfx)) (==((memcmp((as (addr base) ?[]) (as (addr pfx) ?[]) (.length pfx))) 0_u32)) ) ))) ) U64); == := λ(: l Umbra)(: r Umbra). (: ( (if (!=((.length l) (.length r))) (0_u64) ( (.has-prefix(l r)) )) ) U64); != := λ(: l Umbra)(: r Umbra). (: ( (not (==(l r))) ) U64); deep-hash := λ(: key Umbra). (: ( (let hash 0_u64) (let idx 0_u64) (let ptr (addr key)) (while (<(idx (.length key))) ( (set hash (+( hash (as ([](ptr idx)) U64) ))) (set hash (+( hash (<<( hash 10_u64 )) ))) (set hash (^( hash (>>( hash 6_u64 )) ))) (set idx (+( idx 1_u64 ))) )) (set hash (+( hash (<<( hash 3_u64 )) ))) (set hash (^( hash (>>( hash 11_u64 )) ))) (set hash (+( hash (<<( hash 15_u64 )) ))) hash ) U64);