# string type optimized for use in databases and small strings
#
# this is a implementation of the UmbraDB string type (also known as german string)
#
# when to use:
# - you know that your strings tend to be short (<= 12 bytes)
# - you need to use [.has-prefix] or [==] on a lot of strings:
#   if you have a hash table of strings, remember that it only uses
#   the comparision operators if there are hash collissions
#
# how it works:
# - the length of the string is stored on the stack (4B)
# - if the string fits into 12 bytes, it is stored as [UmbraShort] string,
#   which stores the 12 bytes of the string on the stack
# - if the string does not fit into 12 bytes, it is stored as [UmbraLong] string,
#   which stores the first 4 bytes of the string on the stack,
#   and the whole string (including the first 4 bytes!) on the heap
#
# problems with the implementation:
# - the compiler stores the type of the variant in the struct,
#   even though it can be gotten from the length.
#   that does not have a big performance impact in most applications
# - for [UmbraLong]: the [.nth] implementation loads from the prefix if the index is less than 4,
#   which is good in many scenarios, but the compiler does not know that it is
#   just as save to load from the memory directly (because of the if expression),
#   which hurts vectorization a lot.
#   If you know that that might happen, you might be able to use the [addr] function
#
# implementor's note:
# - there should be an alternative string implementation optimizes for even shorter strings (8 bytes),
#   which does not store the prefix
# - maybe null-terminate the string if, any only if, it is stored as [UmbraLong]

type UmbraShortLong (UmbraShort( arr:Array<U8,12> ))
                  | (UmbraLong(prefix:Array<U8,4> , ptr:U8[]));
type Umbra (Umbra( len:U32 , backing:UmbraShortLong ));

.length := λ(: s Umbra). (: (
   (as (.len s) U64)
) U64);

# using this in a vectorizable loop can break vectorization 
# if you know that your loop is vectorizable, consider using [addr]
[] := λ(: s Umbra)(: i U64). (: (
   (let result 0_u8)
   (match (.backing s) (
      ()
      ( (UmbraShort( arr ))
           (set result ([]( arr i ))) )
      ( (UmbraLong( prefix ptr )) 
           (set result (if (<(i 4_u64))
              ([]( prefix i ))
              ([]( (as ptr U8[]) i    )) )) )
   ))
   result
) U8);

# the returned pointer is NOT a C string!
# the returned array is only valid for [.length] bytes
# for [UmbraLong], returns the pointer to the heap data 
# for [UmbraShort], returns the pointer to the on-stack data
addr := λ(: s Umbra). (: (
   (let res (as 0_u64 U8[]))
   (match (.backing s) (
      ()
      ( (UmbraShort( arr ))
           (set res (as (& arr) U8[])) )
      ( (UmbraLong( _ ptr ))
           (set res ptr) )
   ))
   res 
) U8[]);

print := λ(: x Umbra). (: (
   (let idx 0_u64)
   (let ptr (addr x))
   (while (<(idx (.length x))) (
      (putchar(as ([]( ptr idx )) U32))
      (set idx (+( idx 1_u64 )))
   ))
) Nil);

short-prefix-matches := λ(: a Umbra)(: b Umbra). (: (
   (&&((&&((==(([](a 0_u64)) ([](b 0_u64))))
           (==(([](a 1_u64)) ([](b 1_u64))))))
       (&&((==(([](a 2_u64)) ([](b 2_u64))))
           (==(([](a 3_u64)) ([](b 3_u64))))))))
) U64);

# performance note: this is extremly fast if the pfx string is known to be <= 4 bytes at compile time
.has-prefix := λ(: base Umbra)(: pfx Umbra). (: (
   (if (>((.length pfx) (.length base)))
      (0_u64)
      ( (&&((short-prefix-matches(base pfx))
            (==((memcmp((as (addr base) ?[]) (as (addr pfx) ?[]) (.length pfx)))
                0_u32)) )
        )))
) U64);

== := λ(: l Umbra)(: r Umbra). (: (
   (if (!=((.length l) (.length r)))
      (0_u64)
      ( (.has-prefix(l r)) ))
) U64);

!= := λ(: l Umbra)(: r Umbra). (: (
   (not (==(l r)))
) U64);

deep-hash := λ(: key Umbra). (: (
   (let hash 0_u64)
   (let idx 0_u64)
   (let ptr (addr key))
   (while (<(idx (.length key))) (
      (set hash (+( hash (as ([](ptr idx)) U64) )))
      (set hash (+( hash (<<( hash 10_u64 )) )))
      (set hash (^( hash (>>( hash 6_u64 )) )))
      (set idx (+( idx 1_u64 )))
   ))
   (set hash (+( hash (<<( hash 3_u64 )) )))
   (set hash (^( hash (>>( hash 11_u64 )) )))
   (set hash (+( hash (<<( hash 15_u64 )) )))
   hash
) U64);