odin notes: Artifact [17e217c9a4]

Artifact 17e217c9a43a2ed50ec7bfaac0e12364e527b93a221ea9276e2ae9e2f36ccea0:

File lang/strings.odin — part of check-in [27cc2008e2] at 2022-10-20 18:36:34 on branch trunk — lang/strings: add utf16 literals (user: admin size: 6676)
package string_examples

import "core:testing"
import "core:unicode/utf8"
import "core:strings"
import "core:intrinsics"

// Short file-local aliases for the core:testing assertion procedures that
// the tests below call: `expect` takes a boolean condition, `want` takes a
// value followed by the value it is expected to equal.
expect :: testing.expect
want :: testing.expect_value

@(test)
// string <-> []u8 :: transmute
// A string and a []u8 are defined to share one representation, so either
// can be reinterpreted as the other with transmute, in both directions.
transmutes :: proc(t: ^testing.T) {
	literal_bytes := []u8{'h', 'e', 'l', 'l', 'o'}
	text := "hello"
	text_bytes := transmute([]u8)text

	// the two byte slices agree in length ...
	expect(t, len(literal_bytes) == len(text_bytes))
	// ... and, element by element, in content
	for b, idx in literal_bytes {
		expect(t, b == text_bytes[idx])
	}

	// yet they do not share storage
	expect(t, raw_data(literal_bytes) != raw_data(text_bytes))
	// ... since text_bytes is just a view over text's own data
	expect(t, raw_data(text_bytes) == raw_data(text))

	// transmuting either slice back to a string reproduces the text
	want(t, transmute(string)text_bytes, text)
	want(t, transmute(string)literal_bytes, text)
}

@(test)
// cstring -> string :: cast
// The cast has to find the cstring's length, which costs time, but it costs
// no memory: the resulting string is a slice of the cstring's own bytes.
from_cstrings :: proc(t: ^testing.T) {
	LITERAL :: "string literals can be cstrings too"
	as_cstring: cstring = LITERAL
	as_string: string = LITERAL
	converted := string(as_cstring)

	// same contents as the plain string built from the same literal
	expect(t, as_string == converted)
	// and the same address as the cstring, i.e. nothing was copied
	expect(t, raw_data(converted) == transmute([^]u8)(as_cstring))
}

@(test)
// ptr len -> string :: core:strings
// When the length is already known, the C strlen() can be skipped.
from_ptrlen :: proc(t: ^testing.T) {
	original := "hello"
	data, size := raw_data(original), len(original)

	// library route
	via_lib := strings.string_from_ptr(data, size)
	expect(t, original == via_lib)
	expect(t, raw_data(original) == raw_data(via_lib))

	// 'manual' route: slice the pointer, then reinterpret the slice
	sliced := (transmute([^]u8)data)[:size]
	by_hand := transmute(string)sliced
	expect(t, original == by_hand)
	expect(t, raw_data(original) == raw_data(by_hand))

	// The nul-terminated variant should always agree with a cstring ->
	// string cast, but may read more naturally when the starting point is
	// a pointer into a messy buffer of known size. NB. the result carries
	// no trailing 0 when len(output) == the provided len.
	padded := "hello\x00world"
	trimmed := strings.string_from_nul_terminated_ptr(raw_data(padded), len(padded))
	want(t, trimmed, "hello")
	#no_bounds_check {
		// peek one byte past the end of the result: the 0 is there
		trimmed_bytes := transmute([]u8)trimmed
		expect(t, trimmed_bytes[len(trimmed)] == 0)
	}
}

@(test)
// string -> cstring :: core:strings
// A string may hold 0 bytes in its interior and need not end in one, which
// makes this direction harder: you either allocate a copy or accept some
// danger.
to_cstrings :: proc(t: ^testing.T) {
	plain := "hello"
	embedded := "hello\x00world"

	plain_copy := strings.clone_to_cstring(plain, context.temp_allocator)
	embedded_view := strings.unsafe_string_to_cstring(embedded)
	embedded_copy := strings.clone_to_cstring(embedded, context.temp_allocator)

	// read as cstrings, both of these are "hello\x00"
	expect(t, plain_copy == embedded_view)
	// the unsafe conversion points straight at the string's own bytes
	expect(t, transmute([^]u8)(embedded_view) == raw_data(embedded))

	// an interior zero means the length can differ after conversion
	expect(t, len(embedded) != len(embedded_copy))
}

@(test)
// [?]u8 -> string :: transmute
// An array is not a slice, so a [?]u8 cannot be transmuted to a string
// directly; slice it first and the transmute is fine. The reverse direction
// needs a copy, unless a transmuted pointer is good enough.
tofrom_arrays :: proc(t: ^testing.T) {
	arr := [?]u8{'h', 'e', 'l', 'l', 'o'}
	text := "hello"

	// array -> string: slice, then transmute the slice
	from_arr := transmute(string)(arr[:])
	// string -> array pointer: reinterpret the string's data pointer
	arr_ptr := transmute(^[5]u8)(raw_data(text))

	want(t, from_arr, text)
	want(t, transmute(string)(arr_ptr[:]), text)
}

@(test)
// "constant strings" -> ??? :: transmute
// error: "Cannot transmute untyped expression:"
// You'll easily get this with string literals and constant strings, as they
// can't be transmuted without a type. Giving the expression a type, either
// with a typed constant declaration or a string(...) cast, makes the
// transmute legal.
constants :: proc(t: ^testing.T) {
	// s1 := transmute([]u8)("literals can't be transmuted?") // error!
	untyped_constant :: "nor can constants?"
	// s2 := transmute([]u8)untyped_constant // error!
	s2 := transmute([]u8)(string(untyped_constant)) // typed by the cast: no error

	typed_constant: string : "actually they can"
	s3 := transmute([]u8)typed_constant // no error
	s4 := transmute([]u8)(string("and literals too"))

	// The transmuted bytes really are the constants' contents; checking
	// them also means this test asserts something instead of passing
	// vacuously.
	want(t, transmute(string)s2, untyped_constant)
	want(t, transmute(string)s3, typed_constant)
	want(t, transmute(string)s4, "and literals too")

	// as string literals, the data is still stored in read-only memory.
	// s4[0] = 'x' // segfault!
}

@(test)
// string -> []rune :: range loop
// looping over a string yields its Unicode codepoints (runes, which are
// 32-bit values in Odin), rather than bytes. To use them doesn't *require*
// dynamic allocation, but we'll be using it here.
rangeloop :: proc(t: ^testing.T) {
	s1 := "hel\247\205\257lo" // invalid UTF-8
	s2 := "Компью́тер" // Russian for 'computer', with a stress mark
	s3 := "Компьютер" // Russian for 'computer', normally
	r1 := []rune{'h', 'e', 'l', utf8.RUNE_ERROR, utf8.RUNE_ERROR, utf8.RUNE_ERROR, 'l', 'o'}
	r2 := []rune{'h', 'e', 'l', 'l', 'o'}
	stress :: '\u0301'
	rs := make([dynamic]rune, context.temp_allocator)

	// Currently, decoding errors result in the Unicode replacement
	// character, so s1's invalid UTF-8 chars get included as
	// '\ufffd' = 65533 = utf8.RUNE_ERROR
	for r in s1 do append(&rs, r)
	expect(t, len(rs) == len(r1))
	for _, i in rs do expect(t, rs[i] == r1[i])

	// Skipping the errors results in a match with r2, instead
	clear(&rs)
	for r in s1 do if r != utf8.RUNE_ERROR do append(&rs, r)
	expect(t, len(rs) == len(r2))
	for _, i in rs do expect(t, rs[i] == r2[i])

	// Likewise skipping stress results in a match with s3
	clear(&rs)
	for r in s2 do if r != stress do append(&rs, r)
	// NB. when looping over a string to get runes, the index is the *byte*
	// index into the string, so the rune position is counted by hand.
	i := 0
	for r in s3 {
		// guard the index so that a short rs is reported as a failed
		// expectation below rather than an out-of-bounds access here
		if i < len(rs) do expect(t, r == rs[i])
		i += 1
	}
	// the two sequences must also hold the same number of runes; without
	// this, leftover runes at the end of rs would go unnoticed
	want(t, i, len(rs))
}

@(test)
// string <-> []rune :: core:unicode/utf8
// Both string_to_runes and runes_to_string hand back a freshly allocated
// result.
string_tofrom_runes :: proc(t: ^testing.T) {
	broken := "hel\247\205\257lo" // invalid UTF-8
	decoded := utf8.string_to_runes(broken, context.temp_allocator)
	expected_runes := []rune{'h', 'e', 'l', utf8.RUNE_ERROR, utf8.RUNE_ERROR, utf8.RUNE_ERROR, 'l', 'o'}
	reencoded := utf8.runes_to_string(expected_runes, context.temp_allocator)

	// invalid UTF-8 cannot round-trip: each error is folded into RUNE_ERROR
	expect(t, broken != reencoded)

	// still, decoding the broken string gives exactly expected_runes, which
	// is also what ranging over its runes produces (c.f. rangeloop(), above)
	expect(t, len(decoded) == len(expected_runes))
	for idx in 0..<len(decoded) {
		expect(t, decoded[idx] == expected_runes[idx])
	}

	// with no UTF-8 errors involved, the round trip...
	clean_runes := utf8.string_to_runes("just works", context.temp_allocator)
	clean_text := utf8.runes_to_string(clean_runes, context.temp_allocator)
	want(t, clean_text, "just works")
}

@(test)
// "string literal" -> [^]u16 :: core:intrinsics
// A constant string can be turned into UTF-16 data at compile time.
utf16_strings :: proc(t: ^testing.T) {
	L :: intrinsics.constant_utf16_cstring
	TEXT :: "utf16 literal?"
	wide: [^]u16 = L(TEXT)
	// suitable for win32
	// shorter: `wide := L("utf16 literal?")`

	// everything below only double-checks the above, in a roundabout way
	wide_units: []u16 = wide[:len(TEXT)]
	narrow: string = TEXT
	narrow_bytes := transmute([]u8)narrow

	// equal lengths hold only because this text would fit in an ASCII string
	expect(t, len(wide_units) == len(narrow_bytes))
	for idx in 0..<len(wide_units) {
		expect(t, wide_units[idx] == u16(narrow_bytes[idx]))
	}
}