Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Count codepoints instead of bytes, when determining width #136

Merged
merged 2 commits into from
Aug 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 63 additions & 19 deletions clap.zig
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ const testing = std.testing;
pub const args = @import("clap/args.zig");
pub const parsers = @import("clap/parsers.zig");
pub const streaming = @import("clap/streaming.zig");
pub const ccw = @import("clap/codepoint_counting_writer.zig");

test "clap" {
testing.refAllDecls(@This());
Expand Down Expand Up @@ -766,7 +767,7 @@ pub fn parseEx(
// fields to slices and return that.
var result_args = Arguments(Id, params, value_parsers, .slice){};
inline for (meta.fields(@TypeOf(arguments))) |field| {
if (@typeInfo(field.type) == .Struct and
if (@typeInfo(field.type) == .@"struct" and
@hasDecl(field.type, "toOwnedSlice"))
{
const slice = try @field(arguments, field.name).toOwnedSlice(allocator);
Expand Down Expand Up @@ -883,7 +884,7 @@ fn deinitArgs(
// If the multi value field is a struct, we know it is a list and should be deinited.
// Otherwise, it is a slice that should be freed.
switch (@typeInfo(@TypeOf(field))) {
.Struct => @field(arguments, longest.name).deinit(allocator),
.@"struct" => @field(arguments, longest.name).deinit(allocator),
else => allocator.free(@field(arguments, longest.name)),
}
}
Expand Down Expand Up @@ -936,7 +937,7 @@ fn Arguments(
i += 1;
}

return @Type(.{ .Struct = .{
return @Type(.{ .@"struct" = .{
.layout = .auto,
.fields = &fields,
.decls = &.{},
Expand Down Expand Up @@ -1153,10 +1154,10 @@ pub fn help(
const max_spacing = blk: {
var res: usize = 0;
for (params) |param| {
var cs = io.countingWriter(io.null_writer);
var cs = ccw.codepointCountingWriter(io.null_writer);
try printParam(cs.writer(), Id, param);
if (res < cs.bytes_written)
res = @intCast(cs.bytes_written);
if (res < cs.codepoints_written)
res = @intCast(cs.codepoints_written);
}

break :blk res;
Expand All @@ -1166,22 +1167,22 @@ pub fn help(
opt.description_indent +
max_spacing * @intFromBool(!opt.description_on_new_line);

var first_paramter: bool = true;
var first_parameter: bool = true;
for (params) |param| {
if (!first_paramter)
if (!first_parameter)
try writer.writeByteNTimes('\n', opt.spacing_between_parameters);

first_paramter = false;
first_parameter = false;
try writer.writeByteNTimes(' ', opt.indent);

var cw = io.countingWriter(writer);
var cw = ccw.codepointCountingWriter(writer);
try printParam(cw.writer(), Id, param);

const Writer = DescriptionWriter(@TypeOf(writer));
var description_writer = Writer{
.underlying_writer = writer,
.indentation = description_indentation,
.printed_chars = @intCast(cw.bytes_written),
.printed_chars = @intCast(cw.codepoints_written),
.max_width = opt.max_width,
};

Expand Down Expand Up @@ -1260,8 +1261,7 @@ pub fn help(
} else {
// For none markdown like format, we just respect the newlines in the input
// string and output them as is.
var i: usize = 0;
while (i < non_emitted_newlines) : (i += 1)
for (0..non_emitted_newlines) |_|
try description_writer.newline();
}

Expand Down Expand Up @@ -1292,7 +1292,7 @@ fn DescriptionWriter(comptime UnderlyingWriter: type) type {
debug.assert(word.len != 0);

var first_word = writer.printed_chars <= writer.indentation;
const chars_to_write = word.len + @intFromBool(!first_word);
const chars_to_write = try std.unicode.utf8CountCodepoints(word) + @intFromBool(!first_word);
if (chars_to_write + writer.printed_chars > writer.max_width) {
// If the word does not fit on this line, then we insert a new line and print
// it on that line. The only exception to this is if this was the first word.
Expand Down Expand Up @@ -1744,6 +1744,50 @@ test "clap.help" {
\\-d, --dd <V3>... Both repeated option.
\\
);

// Test with multibyte characters.
try testHelp(.{
.indent = 0,
.max_width = 46,
.description_on_new_line = false,
.description_indent = 4,
.spacing_between_parameters = 2,
},
\\-a Shört flåg.
\\
\\
\\-b <V1> Shört öptiön.
\\
\\
\\ --aa Löng fläg.
\\
\\
\\ --bb <V2> Löng öptiön.
\\
\\
\\-c, --cc Bóth fläg.
\\
\\
\\ --complicate Fläg wíth ä cömplǐcätéd
\\ änd vërý löng dèscrıptıön
\\ thät späns mültíplë
\\ lınēs.
\\
\\ Pärägräph number 2:
\\ * Bullet pöint
\\ * Bullet pöint
\\
\\ Exämple:
\\ sömething sömething
\\ sömething
\\
\\
\\-d, --dd <V3> Böth öptiön.
\\
\\
\\-d, --dd <V3>... Böth repeäted öptiön.
\\
);
}

/// Will print a usage message in the following format:
Expand All @@ -1752,18 +1796,18 @@ test "clap.help" {
/// First all none value taking parameters, which have a short name are printed, then non
/// positional parameters and finally the positional.
pub fn usage(stream: anytype, comptime Id: type, params: []const Param(Id)) !void {
var cos = io.countingWriter(stream);
var cos = ccw.codepointCountingWriter(stream);
const cs = cos.writer();
for (params) |param| {
const name = param.names.short orelse continue;
if (param.takes_value != .none)
continue;

if (cos.bytes_written == 0)
if (cos.codepoints_written == 0)
try stream.writeAll("[-");
try cs.writeByte(name);
}
if (cos.bytes_written != 0)
if (cos.codepoints_written != 0)
try cs.writeAll("]");

var has_positionals: bool = false;
Expand All @@ -1782,7 +1826,7 @@ pub fn usage(stream: anytype, comptime Id: type, params: []const Param(Id)) !voi
continue;
};

if (cos.bytes_written != 0)
if (cos.codepoints_written != 0)
try cs.writeAll(" ");

try cs.writeAll("[");
Expand All @@ -1806,7 +1850,7 @@ pub fn usage(stream: anytype, comptime Id: type, params: []const Param(Id)) !voi
if (param.names.short != null or param.names.long != null)
continue;

if (cos.bytes_written != 0)
if (cos.codepoints_written != 0)
try cs.writeAll(" ");

try cs.writeAll("<");
Expand Down
102 changes: 102 additions & 0 deletions clap/codepoint_counting_writer.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
const std = @import("std");
const builtin = @import("builtin");
const native_endian = builtin.cpu.arch.endian();

/// A Writer that counts how many codepoints has been written to it.
/// Expects valid UTF-8 input, and does not validate the input.
pub fn CodepointCountingWriter(comptime WriterType: type) type {
return struct {
codepoints_written: u64,
child_stream: WriterType,

pub const Error = WriterType.Error || error{Utf8InvalidStartByte};
pub const Writer = std.io.Writer(*Self, Error, write);

const Self = @This();

pub fn write(self: *Self, bytes: []const u8) Error!usize {
const bytes_and_codepoints = try utf8CountCodepointsAllowTruncate(bytes);
// Might not be the full input, so the leftover bytes are written on the next call.
const bytes_to_write = bytes[0..bytes_and_codepoints.bytes];
const amt = try self.child_stream.write(bytes_to_write);
const bytes_written = bytes_to_write[0..amt];
self.codepoints_written += (try utf8CountCodepointsAllowTruncate(bytes_written)).codepoints;
return amt;
}

pub fn writer(self: *Self) Writer {
return .{ .context = self };
}
};
}

// Like `std.unicode.utf8CountCodepoints`, but on truncated input, it returns
// the number of codepoints up to that point.
// Does not validate UTF-8 beyond checking the start byte.
fn utf8CountCodepointsAllowTruncate(s: []const u8) !struct { bytes: usize, codepoints: usize } {
var len: usize = 0;

const N = @sizeOf(usize);
const MASK = 0x80 * (std.math.maxInt(usize) / 0xff);

var i: usize = 0;
while (i < s.len) {
// Fast path for ASCII sequences
while (i + N <= s.len) : (i += N) {
const v = std.mem.readInt(usize, s[i..][0..N], native_endian);
if (v & MASK != 0) break;
len += N;
}

if (i < s.len) {
const n = try std.unicode.utf8ByteSequenceLength(s[i]);
// Truncated input; return the current counts.
if (i + n > s.len) return .{ .bytes = i, .codepoints = len };

i += n;
len += 1;
}
}

return .{ .bytes = i, .codepoints = len };
}

pub fn codepointCountingWriter(child_stream: anytype) CodepointCountingWriter(@TypeOf(child_stream)) {
return .{ .codepoints_written = 0, .child_stream = child_stream };
}

const testing = std.testing;

test CodepointCountingWriter {
var counting_stream = codepointCountingWriter(std.io.null_writer);
const stream = counting_stream.writer();

const utf8_text = "blåhaj" ** 100;
Hejsil marked this conversation as resolved.
Show resolved Hide resolved
stream.writeAll(utf8_text) catch unreachable;
const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
try testing.expectEqual(expected_count, counting_stream.codepoints_written);
}

test "handles partial UTF-8 writes" {
var buf: [100]u8 = undefined;
var fbs = std.io.fixedBufferStream(&buf);
var counting_stream = codepointCountingWriter(fbs.writer());
const stream = counting_stream.writer();

const utf8_text = "ååå";
// `å` is represented as `\xC5\xA5`, write 1.5 `å`s.
var wc = try stream.write(utf8_text[0..3]);
// One should have been written fully.
try testing.expectEqual("å".len, wc);
try testing.expectEqual(1, counting_stream.codepoints_written);

// Write the rest, continuing from the reported number of bytes written.
wc = try stream.write(utf8_text[wc..]);
try testing.expectEqual(4, wc);
try testing.expectEqual(3, counting_stream.codepoints_written);

const expected_count = try std.unicode.utf8CountCodepoints(utf8_text);
try testing.expectEqual(expected_count, counting_stream.codepoints_written);

try testing.expectEqualSlices(u8, utf8_text, fbs.getWritten());
}
4 changes: 2 additions & 2 deletions clap/parsers.zig
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,9 @@ test "enumeration" {
}

fn ReturnType(comptime P: type) type {
return @typeInfo(P).Fn.return_type.?;
return @typeInfo(P).@"fn".return_type.?;
}

pub fn Result(comptime P: type) type {
return @typeInfo(ReturnType(P)).ErrorUnion.payload;
return @typeInfo(ReturnType(P)).error_union.payload;
}
Loading