package string_examples
import "core:testing"
import "core:unicode/utf8"
import "core:strings"
import "core:intrinsics"
// Short file-local aliases for the two `core:testing` assertions used by
// every test below.
// expect(t, condition): asserts on a boolean condition.
expect :: testing.expect
// want(t, value, expected): asserts that a value equals the expected one.
want :: testing.expect_value
// string <-> []u8 :: transmute
// Both types are defined to share one representation, so a transmute works
// in either direction.
@(test)
transmutes :: proc(t: ^testing.T) {
	text := "hello"
	literal_bytes := []u8{'h', 'e', 'l', 'l', 'o'}
	text_bytes := transmute([]u8)text

	// the slice literal and the transmuted string agree on length ...
	expect(t, len(literal_bytes) == len(text_bytes))
	// ... and on every element
	for b, i in literal_bytes {
		expect(t, b == text_bytes[i])
	}

	// yet their backing storage differs: the transmuted slice still points
	// at the bytes owned by `text`
	expect(t, raw_data(literal_bytes) != raw_data(text_bytes))
	expect(t, raw_data(text_bytes) == raw_data(text))

	// transmuting back gives the same string from either slice
	want(t, transmute(string)text_bytes, text)
	want(t, transmute(string)literal_bytes, text)
}
// cstring -> string :: cast
// The cast costs time (the cstring's length has to be found) but no memory:
// the resulting string is a slice over the cstring's own bytes.
@(test)
from_cstrings :: proc(t: ^testing.T) {
	LITERAL :: "string literals can be cstrings too"
	as_cstring: cstring = LITERAL
	as_string: string = LITERAL
	converted := string(as_cstring)

	// same contents as the plain string ...
	expect(t, as_string == converted)
	// ... and the converted string starts at the cstring's first byte
	expect(t, transmute([^]u8)(as_cstring) == raw_data(converted))
}
// ptr len -> string :: core:strings
// When the length is already known, the C strlen() can be skipped.
@(test)
from_ptrlen :: proc(t: ^testing.T) {
	original := "hello"
	data := raw_data(original)
	size := len(original)

	// library helper: same contents, same backing bytes
	via_strings := strings.string_from_ptr(data, size)
	expect(t, original == via_strings)
	expect(t, raw_data(original) == raw_data(via_strings))

	// the 'manual' route: slice the pointer, then transmute the slice
	window := (transmute([^]u8)data)[:size]
	by_hand := transmute(string)window
	expect(t, original == by_hand)
	expect(t, raw_data(original) == raw_data(by_hand))

	// This should always match a cstring -> string cast, but may be easier
	// to follow when starting from a pointer into a messy buffer of known
	// size. NB. the resulting string doesn't have a trailing 0 if
	// len(output) == provided len
	embedded := "hello\x00world"
	up_to_nul := strings.string_from_nul_terminated_ptr(raw_data(embedded), len(embedded))
	want(t, up_to_nul, "hello")

	// read one byte past the end of the result on purpose
	tail := transmute([]u8)up_to_nul
	#no_bounds_check {
		expect(t, tail[len(up_to_nul)] == 0) // 0 is there
	}
}
// string -> cstring :: core:strings
// A string may hold 0 bytes in the middle and need not end with one, which
// makes this the harder direction: either allocate a copy or accept an
// element of danger.
@(test)
to_cstrings :: proc(t: ^testing.T) {
	plain := "hello"
	embedded := "hello\x00world"

	plain_copy := strings.clone_to_cstring(plain, context.temp_allocator)
	embedded_view := strings.unsafe_string_to_cstring(embedded)
	embedded_copy := strings.clone_to_cstring(embedded, context.temp_allocator)

	// as cstrings these are both "hello\x00"
	expect(t, plain_copy == embedded_view)
	// the unsafe conversion points straight at the string's bytes
	expect(t, transmute([^]u8)(embedded_view) == raw_data(embedded))
	// internal zeroes mean that len might change
	expect(t, len(embedded) != len(embedded_copy))
}
// [?]u8 -> string :: transmute
// A [?]u8 is an array while a string is a slice, so a direct transmute is
// not possible... until the array is sliced. The other direction needs a
// copy, unless a transmuted pointer is good enough.
@(test)
tofrom_arrays :: proc(t: ^testing.T) {
	text := "hello"
	array := [?]u8{'h', 'e', 'l', 'l', 'o'}

	// array -> string: slice first, then transmute
	from_array := transmute(string)(array[:])
	want(t, from_array, text)

	// string -> array: reinterpret the data pointer as a pointer to [5]u8
	as_array := transmute(^[5]u8)(raw_data(text))
	want(t, transmute(string)(as_array[:]), text)
}
// "constant strings" -> ??? :: transmute
// error: "Cannot transmute untyped expression:"
// String literals and untyped constant strings trigger this easily, because
// they can't be transmuted without a type. Giving the value a type (a typed
// constant, or an explicit string(...) conversion) makes the transmute legal.
@(test)
constants :: proc(t: ^testing.T) {
	// s1 := transmute([]u8)("literals can't be transmuted?") // error!
	untyped_constant :: "nor can constants?"
	// s2 := transmute([]u8)untyped_constant // error!
	typed_constant: string : "actually they can"
	s3 := transmute([]u8)typed_constant // no error
	s4 := transmute([]u8)(string("and literals too"))

	// Verify the transmuted views rather than leaving them unused: each one
	// must read back as the string it was made from.
	want(t, len(s3), len(typed_constant))
	want(t, transmute(string)s3, typed_constant)
	want(t, transmute(string)s4, "and literals too")

	// as string literals, the data is still stored in read-only memory.
	// s4[0] = 'x' // segfault!
}
// string -> []rune :: range loop
// Looping over a string yields its Unicode code points (runes, which are
// 32 bits wide in Odin) rather than its bytes. Using them doesn't *require*
// dynamic allocation, but we'll be using it here.
@(test)
rangeloop :: proc(t: ^testing.T) {
	s1 := "hel\247\205\257lo" // invalid UTF-8
	s2 := "Компью́тер" // Russian for 'computer', with a stress mark
	s3 := "Компьютер" // Russian for 'computer', normally
	r1 := []rune{'h', 'e', 'l', utf8.RUNE_ERROR, utf8.RUNE_ERROR, utf8.RUNE_ERROR, 'l', 'o'}
	r2 := []rune{'h', 'e', 'l', 'l', 'o'}
	stress :: '\u0301'
	rs := make([dynamic]rune, context.temp_allocator)

	// Currently, decoding errors result in the Unicode replacement
	// character, so s1's invalid UTF-8 chars get included as
	// '\ufffd' = 65533 = utf8.RUNE_ERROR
	for r in s1 do append(&rs, r)
	expect(t, len(rs) == len(r1))
	for _, i in rs do expect(t, rs[i] == r1[i])

	// Skipping the errors results in a match with r2, instead
	clear(&rs)
	for r in s1 do if r != utf8.RUNE_ERROR do append(&rs, r)
	expect(t, len(rs) == len(r2))
	for _, i in rs do expect(t, rs[i] == r2[i])

	// Likewise skipping stress results in a match with s3
	clear(&rs)
	for r in s2 do if r != stress do append(&rs, r)

	// NB. when looping over a string to get runes, the index is the *byte*
	// index into the string, so the rune position is counted by hand.
	i := 0
	for r in s3 {
		// the bounds guard turns a too-short rs into a test failure
		// instead of an out-of-range panic
		expect(t, i < len(rs) && r == rs[i])
		i += 1
	}
	// as with r1 and r2 above, the rune counts must agree too; otherwise
	// leftover runes in rs would go unnoticed
	expect(t, i == len(rs))
}
// string <-> []rune :: core:unicode/utf8
// Both runes_to_string and string_to_runes hand back a newly allocated
// result.
@(test)
string_tofrom_runes :: proc(t: ^testing.T) {
	broken := "hel\247\205\257lo" // invalid UTF-8
	expected_runes := []rune{'h', 'e', 'l', utf8.RUNE_ERROR, utf8.RUNE_ERROR, utf8.RUNE_ERROR, 'l', 'o'}

	decoded := utf8.string_to_runes(broken, context.temp_allocator)
	reencoded := utf8.runes_to_string(expected_runes, context.temp_allocator)

	// non-utf8 doesn't round-trip, as the errors are folded into RUNE_ERROR
	expect(t, broken != reencoded)

	// still, decoding `broken` produces exactly `expected_runes`: the same
	// result as looping over its runes (c.f. rangeloop())
	expect(t, len(decoded) == len(expected_runes))
	for r, i in decoded {
		expect(t, r == expected_runes[i])
	}

	// without utf8 errors it all...
	clean_runes := utf8.string_to_runes("just works", context.temp_allocator)
	round_trip := utf8.runes_to_string(clean_runes, context.temp_allocator)
	want(t, round_trip, "just works")
}
// constant string -> [^]u16 :: core:intrinsics
// constant_utf16_cstring turns a constant string into a [^]u16.
@(test)
utf16_strings :: proc(t: ^testing.T) {
	L :: intrinsics.constant_utf16_cstring
	TEXT :: "utf16 literal?"
	wide: [^]u16 = L(TEXT)
	// suitable for win32
	// shorter: `wide := L("utf16 literal?")`

	// the rest is just confirming the above in a weird way
	wide_units: []u16 = wide[:len(TEXT)]
	narrow: string = TEXT
	narrow_bytes := transmute([]u8)narrow

	// true only because this would fit in an ASCII string
	expect(t, len(wide_units) == len(narrow_bytes))
	for unit, i in wide_units {
		expect(t, unit == u16(narrow_bytes[i]))
	}
}