Skip to content

Commit

Permalink
fix scanner for input larger than internal buffer
Browse files Browse the repository at this point in the history
  • Loading branch information
thatstoasty committed Aug 28, 2024
1 parent 502e020 commit 7dfe559
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 17 deletions.
29 changes: 22 additions & 7 deletions benchmarks/scanner.mojo
Original file line number Diff line number Diff line change
@@ -1,19 +1,34 @@
import benchmark
import gojo.bufio
import gojo.bytes
import gojo.strings
import testing


fn benchmark_scan_runes() -> None:
var input = String("Hello, World!").as_bytes()
var buf = bytes.Buffer(buf=input^)
var scanner = bufio.Scanner[split = bufio.scan_runes](buf^)
alias FIRE = "🔥"


fn benchmark_scan_runes[batches: Int]() -> None:
var builder = strings.StringBuilder(capacity=batches)
for _ in range(batches):
_ = builder.write_string(FIRE)

var buf = bytes.Buffer(buf=str(builder).as_bytes())
var scanner = bufio.Scanner[split = bufio.scan_runes, capacity=batches](buf^)
while scanner.scan():
print(scanner.current_token())
_ = scanner.current_token()


fn main():
print("Running benchmark_scan_runes")
var report = benchmark.run[benchmark_scan_runes](max_iters=20)
# For now, the measured time also includes building the input text inside each benchmark call.
print("Running benchmark_scan_runes - 100")
var report = benchmark.run[benchmark_scan_runes[100]](max_iters=20)
report.print(benchmark.Unit.ms)

print("Running benchmark_scan_runes - 1000")
report = benchmark.run[benchmark_scan_runes[1000]](max_iters=20)
report.print(benchmark.Unit.ms)

print("Running benchmark_scan_runes - 10000")
report = benchmark.run[benchmark_scan_runes[10000]](max_iters=20)
report.print(benchmark.Unit.ms)
1 change: 0 additions & 1 deletion gojo/bufio/bufio.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -785,7 +785,6 @@ struct Writer[W: io.Writer](Sized, io.Writer, io.ByteWriter, io.StringWriter, io

if err:
if bytes_written > 0 and bytes_written < self.bytes_written:
# TODO: Temp copying of elements until I figure out a better pattern or slice refs are added
var temp = self.as_bytes_slice()[bytes_written : self.bytes_written]
_ = copy(self.buf.unsafe_ptr().offset(self.buf.size), temp.unsafe_ptr(), len(temp))

Expand Down
24 changes: 16 additions & 8 deletions gojo/bufio/scan.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ from .bufio import MAX_CONSECUTIVE_EMPTY_READS
alias MAX_INT: Int = 2147483647


struct Scanner[R: io.Reader, split: SplitFunction = scan_lines]():
struct Scanner[R: io.Reader, split: SplitFunction = scan_lines, capacity: Int = io.BUFFER_SIZE]():
"""`Scanner` provides a convenient interface for reading data such as
a file of newline-delimited lines of text. Successive calls to
the `Scanner.scan` method will step through the 'tokens' of a file, skipping
Expand All @@ -25,7 +25,8 @@ struct Scanner[R: io.Reader, split: SplitFunction = scan_lines]():
large to fit in the `Scanner.buffer`. When a scan stops, the reader may have
advanced arbitrarily far past the last token. Programs that need more
control over error handling or large tokens, or must run sequential scans
on a reader, should use `bufio.Reader` instead."""
on a reader, should use `bufio.Reader` instead.
"""

var reader: R
"""The reader provided by the client."""
Expand All @@ -51,9 +52,10 @@ struct Scanner[R: io.Reader, split: SplitFunction = scan_lines]():
fn __init__(
inout self,
owned reader: R,
*,
max_token_size: Int = MAX_SCAN_TOKEN_SIZE,
token: List[UInt8, True] = List[UInt8, True](capacity=io.BUFFER_SIZE),
buf: List[UInt8, True] = List[UInt8, True](capacity=io.BUFFER_SIZE),
token: List[UInt8, True] = List[UInt8, True](capacity=capacity),
buf: List[UInt8, True] = List[UInt8, True](capacity=capacity),
start: Int = 0,
end: Int = 0,
empties: Int = 0,
Expand All @@ -62,6 +64,11 @@ struct Scanner[R: io.Reader, split: SplitFunction = scan_lines]():
):
"""Initializes a new Scanner.
Params:
R: The type of io.Reader.
split: The split function to use.
capacity: The capacity of the internal buffer.
Args:
reader: The reader to scan.
max_token_size: The maximum size of a token.
Expand Down Expand Up @@ -131,7 +138,7 @@ struct Scanner[R: io.Reader, split: SplitFunction = scan_lines]():
# a chance to recover any remaining, possibly empty token.
if (self.end > self.start) or self.err:
var advance: Int
var token = List[UInt8, True](capacity=io.BUFFER_SIZE)
var token = List[UInt8, True](capacity=capacity)
var err = Error()
var at_eof = False
if self.err:
Expand Down Expand Up @@ -179,6 +186,7 @@ struct Scanner[R: io.Reader, split: SplitFunction = scan_lines]():
memcpy(self.buf.unsafe_ptr(), self.buf.unsafe_ptr().offset(self.start), self.end - self.start)
self.end -= self.start
self.start = 0
self.buf.size = self.end

# Is the buffer full? If so, resize.
if self.end == len(self.buf):
Expand All @@ -193,8 +201,8 @@ struct Scanner[R: io.Reader, split: SplitFunction = scan_lines]():

# Make a new List[UInt8, True] buffer and copy the elements in
new_size = min(new_size, self.max_token_size)
var new_buf = List[UInt8, True](capacity=new_size)
_ = copy(new_buf, self.buf[self.start : self.end])
var new_buf = self.buf[self.start : self.end] # slicing returns a new list
new_buf.reserve(new_size)
self.buf = new_buf
self.end -= self.start
self.start = 0
Expand All @@ -204,8 +212,8 @@ struct Scanner[R: io.Reader, split: SplitFunction = scan_lines]():
# be extra careful: Scanner is for safe, simple jobs.
var loop = 0
while True:
var dest_ptr = self.buf.unsafe_ptr().offset(self.end)
# Catch any reader errors and set the internal error field to that err instead of bubbling it up.
var dest_ptr = self.buf.unsafe_ptr().offset(self.end)
var bytes_read: Int
var err: Error
bytes_read, err = self.reader._read(dest_ptr, self.buf.capacity - self.buf.size)
Expand Down
3 changes: 2 additions & 1 deletion gojo/bytes/buffer.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,8 @@ struct Buffer(

# Copy the data of the internal buffer from offset to len(buf) into the destination buffer at the given index.
var bytes_to_read = self.as_bytes_slice()[self.offset :]
var bytes_read = copy(dest, bytes_to_read.unsafe_ptr(), source_length=len(bytes_to_read))
var count = min(capacity, len(bytes_to_read))
var bytes_read = copy(dest, bytes_to_read.unsafe_ptr(), count)
self.offset += bytes_read

if bytes_read > 0:
Expand Down

0 comments on commit 7dfe559

Please sign in to comment.