Skip to content

Commit

Permalink
add rune width
Browse files Browse the repository at this point in the history
  • Loading branch information
thatstoasty committed Jul 7, 2024
1 parent 00c3f2f commit e8eb8eb
Show file tree
Hide file tree
Showing 5 changed files with 1,427 additions and 27 deletions.
2 changes: 1 addition & 1 deletion gojo/unicode/__init__.mojo
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .utf8 import rune_count_in_string, UnicodeString
from .utf8 import rune_count_in_string, UnicodeString, rune_width, string_width, Condition, DEFAULT_CONDITION
1 change: 1 addition & 0 deletions gojo/unicode/utf8/__init__.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ This would not be possible without his help.
"""
from .runes import rune_count_in_string
from .string import UnicodeString
from .width import string_width, rune_width, Condition, DEFAULT_CONDITION
49 changes: 23 additions & 26 deletions gojo/unicode/utf8/string.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ alias simd_width_u8 = simdwidthof[DType.uint8]()

@value
struct UnicodeString(Stringable, Sized):
"""A string that supports Unicode characters of printable size 1
(ie not east asian characters and such.).
"""A string that supports Unicode characters.
The algorithms to handle UTF-8 are from @maxim on the Mojo Discord. Thanks!
"""
Expand Down Expand Up @@ -48,30 +47,28 @@ struct UnicodeString(Stringable, Sized):
fn __str__(self) -> String:
return self.inner

# @always_inline
# fn __getitem__(self, slice: Slice) -> String:
# # Copy N bytes + null terminator into new pointer and construct string.
# var copy_src = self.inner
# var copy = DTypePointer[DType.uint8](copy_src.unsafe_uint8_ptr())
# var bytes_left = len(self.inner)

# var result = DTypePointer[DType.uint8].alloc(len(self.inner))
# var total_char_length: Int = 0
# for _ in range(slice.start, slice.end):
# print(total_char_length, bytes_left)
# # Number of bytes of the current character
# var char_length = int((copy.load() >> 7 == 0).cast[DType.uint8]() * 1 + countl_zero(~copy.load()))

# memcpy(result.offset(total_char_length), copy, char_length)

# # Move iterator forward
# bytes_left -= char_length
# copy += char_length
# total_char_length += char_length
# print(total_char_length, char_length, bytes_left)

# result[total_char_length] = 0
# return StringRef(result, total_char_length + 1)
@always_inline
fn __getitem__(self: Reference[Self], slice: Slice) -> StringSlice[self.is_mutable, self.lifetime]:
    """Returns a view over the characters in the half-open range `[slice.start, slice.end)`.

    Indices count UTF-8 encoded characters, not bytes. The returned slice
    borrows the bytes of `inner`; nothing is copied.

    Out-of-range bounds are clamped to the end of the string, so an `end`
    larger than the number of characters is safe. An empty or inverted range
    yields an empty slice.

    TODO: Negative indices are not supported (a negative start behaves like 0
    and a negative end yields an empty slice). `slice.step` is ignored.
    """
    var data = self[].inner.unsafe_uint8_ptr()
    var total_bytes = len(self[].inner)

    var start_byte: Int = 0  # Byte offset of the first character in the slice.
    var end_byte: Int = 0  # Byte offset just past the last character walked so far.
    var chars_seen: Int = 0

    # Walk the string one character at a time, never stepping past the buffer.
    while chars_seen < slice.end and end_byte < total_bytes:
        var lead = DTypePointer[DType.uint8](data + end_byte).load()

        # Number of bytes of the current character: 1 for ASCII (high bit clear),
        # otherwise the count of leading one bits in the lead byte.
        var char_length = int((lead >> 7 == 0).cast[DType.uint8]() * 1 + countl_zero(~lead))

        # Clamp so a truncated multi-byte sequence cannot push us past the end.
        end_byte = min(end_byte + char_length, total_bytes)
        chars_seen += 1

        # Until `slice.start` characters have been consumed, the slice has not
        # begun yet, so keep moving its starting offset forward.
        if chars_seen <= slice.start:
            start_byte = end_byte

    return StringSlice[self.is_mutable, self.lifetime](
        unsafe_from_utf8_ptr=data + start_byte, len=end_byte - start_byte
    )

@always_inline
fn bytecount(self) -> Int:
Expand Down
Loading

0 comments on commit e8eb8eb

Please sign in to comment.