diff --git a/CHANGES.md b/CHANGES.md index bf76f63c..c9621c68 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,14 @@ # Major changes to the IOCCC entry toolkit +## Release 1.5.24 2024-10-09 + +Synced `jparse` from [jparse repo](https://github.com/xexyl/jparse/). This +cleans up some code, removes code that's unnecessary and syncs the versions of +all tools to the same version: `1.2.0 2024-10-09`. A new version string was added as +well, `JPARSE_UTF8_VERSION`. A bug in a script was also fixed. + + ## Release 1.5.23 2024-10-08 Noted dependency of `MAX_SUBMIT_SLOT` and `MAX_TARBALL_LEN` defines diff --git a/jparse/CHANGES.md b/jparse/CHANGES.md index 71f5d6fc..2031e939 100644 --- a/jparse/CHANGES.md +++ b/jparse/CHANGES.md @@ -1,5 +1,24 @@ # Significant changes in the JSON parser repo +## Release 1.2.0 2024-10-09 + +Remove `has_nul` in `struct json_string` as UTF-8 should, it is my +understanding, not have a NUL byte. + +Fix path in `jsemcgen.sh`. + +Update all tools and the release to be the same version after issue #13 was +resolved: `1.2.0 2024-10-09`. `1.2.0` was chosen because it was the first version greater than +some of the existing versions, and the others could be bumped up to it without any harm. + +Do a final clean up of `json_utf8.[ch]`: remove all unnecessary code and macros +as well as clean up comments. + +Added a `JPARSE_UTF8_VERSION` in order to keep track of the current UTF-8 code. +Set this version to the same as the other versions at this time: `1.2.0 +2024-10-09`. + + ## Release 1.0.23 2024-10-08 Fix surrogate pair decoding in `json_decode()` / `decode_json_string()`.
Now one diff --git a/jparse/jparse_bug_report.sh b/jparse/jparse_bug_report.sh index a14bfda3..700ce3ae 100755 --- a/jparse/jparse_bug_report.sh +++ b/jparse/jparse_bug_report.sh @@ -75,7 +75,7 @@ if [[ -z "$MAKE" ]]; then MAKE="$(type -P make)" fi export MAKE -export BUG_REPORT_VERSION="1.0.4 2024-06-26" +export BUG_REPORT_VERSION="1.2.0 2024-10-09" export FAILURE_SUMMARY= export NOTICE_SUMMARY= export DBG_LEVEL="0" diff --git a/jparse/jsemcgen.sh b/jparse/jsemcgen.sh index d99bf524..16149095 100755 --- a/jparse/jsemcgen.sh +++ b/jparse/jsemcgen.sh @@ -36,10 +36,10 @@ export MEMBER_FUNC= export OBJECT_FUNC= export ARRAY_FUNC= export UNKNOWN_FUNC= -export JSEMTBLGEN="../jparse/jsemtblgen" +export JSEMTBLGEN="./jsemtblgen" export JSEMTBLGEN_ARGS= export PATCH_TOOL= -export JSEMCGEN_VERSION="1.3 2023-02-04" +export JSEMCGEN_VERSION="1.2.0 2024-10-09" # attempt to fetch system specific paths to tools we need # diff --git a/jparse/jsemtblgen.h b/jparse/jsemtblgen.h index 65bf8f6a..2893b9b8 100644 --- a/jparse/jsemtblgen.h +++ b/jparse/jsemtblgen.h @@ -75,7 +75,7 @@ /* * official jsemtblgen version */ -#define JSEMTBLGEN_VERSION "1.0.1 2024-03-02" /* format: major.minor YYYY-MM-DD */ +#define JSEMTBLGEN_VERSION "1.2.0 2024-10-09" /* format: major.minor YYYY-MM-DD */ /* * jsemtblgen tool basename diff --git a/jparse/json_parse.c b/jparse/json_parse.c index 1993443c..66895bc0 100644 --- a/jparse/json_parse.c +++ b/jparse/json_parse.c @@ -169,7 +169,7 @@ struct byte2asciistr byte2asciistr[JSON_BYTE_VALUES] = { /* for json string decoding */ -static char *decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen, bool *has_nul); +static char *decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen); /* for json number strings */ static bool json_process_decimal(struct json_number *item, char const *str, size_t len); static bool json_process_floating(struct json_number *item, char const *str, size_t len); @@ -808,7 +808,6 @@ 
chkbyte2asciistr(void) * len length of block * mlen length of decoded bytes to allocate * retlen address of where to store allocated length, if retlen != NULL - * has_nul if != NULL and we find an encoded NUL byte we will do *has_nul = true * * returns: * allocated JSON decoding of a block, or NULL ==> error @@ -817,7 +816,7 @@ chkbyte2asciistr(void) * NOTE: this function is used by json_decode(). */ char * -decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen, bool *has_nul) +decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen) { char *ret = NULL; /* allocated encoding string or NULL */ char *beyond = NULL; /* beyond the end of the allocated encoding string */ @@ -1017,6 +1016,10 @@ decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen, boo * however, for p we need to update the entire amount */ p += bytes; + /* + * we increment by 5 because LITLEN("uxxxx") is 5: the for() loop + * increments by 1 at the increment/update phase. + */ i += 5; } else if (scanned == 2) { /* @@ -1047,12 +1050,12 @@ decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen, boo free(ret); ret = NULL; } - /* utf8encode warns on error */ + /* utf8encode() warns on error */ return NULL; } /* - * we skip 11 forwards because 5 (like above) + + * we increment by 11 because LITLEN("uxxxx") + * LITLEN("\\uxxxx") is 11. */ i += 11; @@ -1099,7 +1102,7 @@ decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen, boo */ - dbg(DBG_VVVHIGH, "returning from decode_json_string(ptr, %ju, %ju, *%ju, %s)", + dbg(DBG_VVVHIGH, "returning from decode_json_string(ptr, %ju, %ju, *%ju)", - (uintmax_t)len, (uintmax_t)mlen, retlen != NULL ? *retlen : 0, has_nul != NULL ? booltostr(*has_nul) : "false"); + (uintmax_t)len, (uintmax_t)mlen, retlen != NULL ? 
*retlen : 0); if (retlen != NULL) { *retlen = mlen; } @@ -1114,14 +1117,13 @@ decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen, boo * ptr start of memory block to decode * len length of block to decode in bytes * retlen address of where to store allocated length, if retlen != NULL - * has_nul if != NULL and we find an encoded NUL byte we will do *has_nul = true * * returns: * allocated JSON decoding of a block, or NULL ==> error * NOTE: retlen, if non-NULL, is set to 0 on error */ char * -json_decode(char const *ptr, size_t len, size_t *retlen, bool *has_nul) +json_decode(char const *ptr, size_t len, size_t *retlen) { char *ret = NULL; /* allocated encoding string or NULL */ size_t mlen = 0; /* length of allocated encoded string */ @@ -1339,17 +1341,17 @@ json_decode(char const *ptr, size_t len, size_t *retlen, bool *has_nul) /* * decode JSON string */ - ret = decode_json_string(ptr, (uintmax_t)len, (uintmax_t)mlen, retlen, has_nul); + ret = decode_json_string(ptr, (uintmax_t)len, (uintmax_t)mlen, retlen); /* * return result, if not NULL */ if (ret != NULL) { - dbg(DBG_VVVHIGH, "returning from json_decode(ptr, %ju, *%ju, %s)", + dbg(DBG_VVVHIGH, "returning from json_decode(ptr, %ju, *%ju)", - (uintmax_t)len, (uintmax_t)mlen, has_nul != NULL ? booltostr(*has_nul) : "false"); + (uintmax_t)len, (uintmax_t)mlen); } else { - dbg(DBG_VVVHIGH, "in json_decode(): decode_json_string(ptr, %ju, *%ju, %s) returned NULL", + dbg(DBG_VVVHIGH, "in json_decode(): decode_json_string(ptr, %ju, *%ju) returned NULL", - (uintmax_t)len, (uintmax_t)mlen, has_nul != NULL ? 
booltostr(*has_nul) : "false"); + (uintmax_t)len, (uintmax_t)mlen); if (retlen != NULL) { *retlen = 0; } @@ -1410,7 +1412,7 @@ json_decode_str(char const *str, size_t *retlen) /* * convert to json_decode() call */ - ret = json_decode(str, len, retlen, NULL); + ret = json_decode(str, len, retlen); if (ret == NULL) { dbg(DBG_VVHIGH, "returning NULL for decoding of: <%s>", str); } else { @@ -2946,7 +2948,6 @@ json_conv_string(char const *ptr, size_t len, bool quote) item->parsed = false; item->quote = false; item->same = false; - item->has_nul = false; item->slash = false; item->posix_safe = false; item->first_alphanum = false; @@ -3007,7 +3008,7 @@ json_conv_string(char const *ptr, size_t len, bool quote) * decode the JSON encoded string */ /* decode the entire string */ - item->str = json_decode(item->as_str, len, &(item->str_len), &(item->has_nul)); + item->str = json_decode(item->as_str, len, &(item->str_len)); if (item->str == NULL) { warn(__func__, "quote === %s: JSON string decode failed for: <%s>", booltostr(quote), item->as_str); diff --git a/jparse/json_parse.h b/jparse/json_parse.h index 40270bf9..aeb7692a 100644 --- a/jparse/json_parse.h +++ b/jparse/json_parse.h @@ -239,7 +239,6 @@ struct json_string bool quote; /* The original JSON string included surrounding '"'s */ bool same; /* true => as_str same as str, JSON decoding not required */ - bool has_nul; /* true ==> decoded JSON string has a NUL byte inside it */ bool slash; /* true ==> / was found after decoding */ bool posix_safe; /* true ==> all chars are POSIX portable safe plus + and maybe / after decoding */ @@ -491,7 +490,7 @@ extern struct byte2asciistr byte2asciistr[]; extern char *json_encode(char const *ptr, size_t len, size_t *retlen, bool skip_quote); extern char *json_encode_str(char const *str, size_t *retlen, bool skip_quote); extern void chkbyte2asciistr(void); -extern char *json_decode(char const *ptr, size_t len, size_t *retlen, bool *has_nul); +extern char *json_decode(char const *ptr, 
size_t len, size_t *retlen); extern char *json_decode_str(char const *str, size_t *retlen); extern struct json *parse_json_string(char const *string, size_t len); extern struct json *parse_json_bool(char const *string); diff --git a/jparse/json_utf8.c b/jparse/json_utf8.c index febefcbc..943e120b 100644 --- a/jparse/json_utf8.c +++ b/jparse/json_utf8.c @@ -153,16 +153,9 @@ count_utf8_bytes(const char *str, int32_t surrogate, size_t *bytes) } /* - * NOTE: until the bug documented at https://github.com/xexyl/jparse/issues/13 - * is resolved fully, we have code here that comes from a number of locations. - * Once the bug is resolved this file will be cleaned up. There are two - * different locations at this time (29 Sep 2024). - */ - -/* - * The below is based on code from + * The below function is based on code from * https://lxr.missinglinkelectronics.com/linux+v5.19/fs/unicode/mkutf8data.c, - * with additional code added. + * with a number of changes. */ /* @@ -276,202 +269,30 @@ utf8encode(char *str, unsigned int val) str[0] |= UTF8_4_BITS; len = 4; } else { - err(11, __func__, "%#X: illegal val\n", val); - not_reached(); + /* val is too big to encode: warn and return an error code */ + warn(__func__, "illegal value: %#X too big\n", val); + len = UNICODE_TOO_BIG; } return len; } - /* - * The above is based on code from + * The above function is based on code from * https://lxr.missinglinkelectronics.com/linux+v5.19/fs/unicode/mkutf8data.c, - * with additional code added. + * with a number of changes. */ /* - * The below table and code is from - * https://github.com/benkasminbullock/unicode-c/, which is 'a Unicode library - * in the programming language C which deals with conversions to and from the - * UTF-8 format', and was written by: - * - * Ben Bullock , + * -=-=-=---=-=-=-=-=-=-=-=-=-=-=-=---=-=-=-=-=-=-=-=-=-=-=-=---=-=-=-=-=-=-=-=-= */ - /* - * This is a Unicode library in the programming language C which deals with - * conversions to and from the UTF-8 format. 
- */ - -/* - Author: - - Ben Bullock , - - Repository: - - https://github.com/benkasminbullock/unicode-c -*/ - - -/* - * This table contains the length of a sequence which begins with the byte - * given. A value of zero indicates that the byte can not begin a UTF-8 - * sequence. This comes from: - * https://metacpan.org/source/CHANSEN/Unicode-UTF8-0.60/UTF8.xs#L8. - */ -const uint8_t utf8_sequence_len[0x100] = -{ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */ - 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */ - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */ - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */ - 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */ -}; - -/* - * This function converts UTF-8 encoded bytes in "input" into the equivalent - * Unicode code point. The return value is the Unicode code point corresponding - * to the UTF-8 character in "input" if successful, and a negative number if not - * successful. Nul bytes are rejected. - * - * "*end_ptr" is set to the next character after the read character on success. - * "*end_ptr" is set to the start of input on all failures. "end_ptr" may not - * be NULL. 
+ * The below function is from https://github.com/benkasminbullock/unicode-c/, + * which is 'a Unicode library in the programming language C which deals with + * conversions to and from the UTF-8 format', and was written by: * - * If the first byte of "input" is zero, in other words a NUL or '\0', - * UNICODE_EMPTY_INPUT is returned. - * - * If the first byte of "input" is not valid UTF-8, UTF8_BAD_LEADING_BYTE is - * returned. - * - * If the second or later bytes of "input" are not valid UTF-8, including NUL, - * UTF8_BAD_CONTINUATION_BYTE is returned. - * - * If the value extrapolated from "input" is greater than UNICODE_MAXIMUM, - * UNICODE_TOO_BIG is returned. - * - * If the value extrapolated from "input" ends in 0xFFFF or 0xFFFE, - * UNICODE_NOT_CHARACTER is returned. - * - * If the value extrapolated from "input" is between 0xFDD0 and 0xFDEF, - * UNICODE_NOT_CHARACTER is returned. - * - * If the value is within the range of surrogate pairs, the error - * UNICODE_SURROGATE_PAIR is returned. + * Ben Bullock , */ -int32_t -utf8_to_ucs2 (const uint8_t * input, const uint8_t ** end_ptr) -{ - uint8_t c; - uint8_t l; - - *end_ptr = input; - c = input[0]; - if (c == 0) { - return UNICODE_EMPTY_INPUT; - } - l = utf8_sequence_len[c]; - if (l == 1) { - * end_ptr = input + 1; - return (int32_t) c; - } - if (l == 2) { - uint8_t d; - d = input[1]; - /* Two byte case. */ - if (d < 0x80 || d > 0xBF) { - return UTF8_BAD_CONTINUATION_BYTE; - } - if (c <= 0xC1) { - return UTF8_BAD_CONTINUATION_BYTE; - } - * end_ptr = input + 2; - return - ((int32_t) (c & 0x1F) << 6) | - ((int32_t) (d & 0x3F)); - } - if (l == 3) { - uint8_t d; - uint8_t e; - int32_t r; - - d = input[1]; - e = input[2]; - /* Three byte case. */ - if (d < 0x80 || d > 0xBF || - e < 0x80 || e > 0xBF) { - return UTF8_BAD_CONTINUATION_BYTE; - } - if (c == 0xe0 && d < 0xa0) { - /* We don't need to check the value of input[2], because - the if statement above this one already guarantees that - it is 10xxxxxx. 
*/ - return UTF8_BAD_CONTINUATION_BYTE; - } - r = ((int32_t) (c & 0x0F)) << 12 | - ((int32_t) (d & 0x3F)) << 6 | - ((int32_t) (e & 0x3F)); - REJECT_SURROGATE(r); - REJECT_FFFF(r); - REJECT_NOT_CHAR(r); - * end_ptr = input + 3; - return r; - } - else if (l == 4) { - /* Four byte case. */ - uint8_t d; - uint8_t e; - uint8_t f; - int32_t v; - - d = input[1]; - e = input[2]; - f = input[3]; - - if (/* c must be 11110xxx. */ - c >= 0xf8 || - /* d, e, f must be 10xxxxxx. */ - d < 0x80 || d >= 0xC0 || - e < 0x80 || e >= 0xC0 || - f < 0x80 || f >= 0xC0) { - return UTF8_BAD_CONTINUATION_BYTE; - } - - if (c == 0xf0 && d < 0x90) { - /* We don't need to check the values of e and f, because - the if statement above this one already guarantees that - e and f are 10xxxxxx. */ - return UTF8_BAD_CONTINUATION_BYTE; - } - /* Calculate the code point. */ - v = FOUR (input); - /* Greater than U+10FFFF */ - if (v > UNICODE_MAXIMUM) { - return UNICODE_TOO_BIG; - } - /* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */ - REJECT_FFFF(v); - /* We don't need to check for surrogate pairs here, since the - minimum value of UCS2 if there are four bytes of UTF-8 is - 0x10000. */ - * end_ptr = input + 4; - return v; - } - return UTF8_BAD_LEADING_BYTE; -} /* @@ -496,34 +317,13 @@ surrogates_to_unicode (int32_t hi, int32_t lo) } /* - * Given a nul-terminated string "utf8", return the total number of Unicode - * characters it contains. + * The above function is from https://github.com/benkasminbullock/unicode-c/, + * which is 'a Unicode library in the programming language C which deals with + * conversions to and from the UTF-8 format', and was written by: * - * If an error occurs, this may return UTF8_BAD_LEADING_BYTE or any of the - * errors of "utf8_to_ucs2". 
+ * Ben Bullock , + */ + +/* + * -=-=-=---=-=-=-=-=-=-=-=-=-=-=-=---=-=-=-=-=-=-=-=-=-=-=-=---=-=-=-=-=-=-=-=-= */ -int32_t -unicode_count_chars (const uint8_t * utf8) -{ - int32_t chars = 0; - const uint8_t * p = utf8; - int32_t len = strlen ((const char *) utf8); - if (len == 0) { - return 0; - } - while (p - utf8 < len) { - int32_t ucs2; - ucs2 = utf8_to_ucs2 (p, & p); - if (ucs2 < 0) { - /* Return the error from utf8_to_ucs2. */ - return ucs2; - } - chars++; - if (*p == '\0') { - return chars; - } - } - /* Cannot be reached in practice, since strlen indicates the null - byte. */ - return UTF8_BAD_LEADING_BYTE; -} diff --git a/jparse/json_utf8.h b/jparse/json_utf8.h index 16956692..f79c67e5 100644 --- a/jparse/json_utf8.h +++ b/jparse/json_utf8.h @@ -33,69 +33,20 @@ */ #include "util.h" -extern bool count_utf8_bytes(const char *str, int32_t surrogate, size_t *bytes); - /* - * NOTE: until the bug documented at https://github.com/xexyl/jparse/issues/13 - * is resolved fully, we have code here that comes from a number of locations. - * Once the bug is resolved this file will be cleaned up. There are two - * different locations at this time (29 Sep 2024). + * official jparse UTF-8 version */ +#define JPARSE_UTF8_VERSION "1.2.0 2024-10-09" /* format: major.minor YYYY-MM-DD */ + + +extern bool count_utf8_bytes(const char *str, int32_t surrogate, size_t *bytes); /* - * The below comes from + * The below function and macros are based on code from * https://lxr.missinglinkelectronics.com/linux+v5.19/fs/unicode/mkutf8data.c, - * with pointer checks added to the functions. + * with a number of changes. */ -/* - * UTF8 valid ranges. - * - * The UTF-8 encoding spreads the bits of a 32bit word over several - * bytes. This table gives the ranges that can be held and how they'd - * be represented. 
- * - * 0x00000000 0x0000007F: 0xxxxxxx - * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx - * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx - * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - * - * There is an additional requirement on UTF-8, in that only the - * shortest representation of a 32bit value is to be used. A decoder - * must not decode sequences that do not satisfy this requirement. - * Thus the allowed ranges have a lower bound. - * - * 0x00000000 0x0000007F: 0xxxxxxx - * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx - * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx - * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - * - * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, - * 17 planes of 65536 values. This limits the sequences actually seen - * even more, to just the following. - * - * 0 - 0x7f: 0 0x7f - * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf - * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf - * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf - * - * Even within those ranges not all values are allowed: the surrogates - * 0xd800 - 0xdfff should never be seen. - * - * Note that the longest sequence seen with valid usage is 4 bytes, - * the same a single UTF-32 character. This makes the UTF-8 - * representation of Unicode strictly smaller than UTF-32. 
- * - * The shortest sequence requirement was introduced by: - * Corrigendum #1: UTF-8 Shortest Form - * It can be found here: - * http://www.unicode.org/versions/corrigendum1.html - * - */ #define UTF8_2_BITS 0xC0 #define UTF8_3_BITS 0xE0 #define UTF8_4_BITS 0xF0 @@ -110,44 +61,23 @@ extern bool count_utf8_bytes(const char *str, int32_t surrogate, size_t *bytes); extern int utf8encode(char *str, unsigned int val); /* - * The above comes from + * The above function and macros are based on code from * https://lxr.missinglinkelectronics.com/linux+v5.19/fs/unicode/mkutf8data.c, - * with pointer checks added to the functions. + * with a number of changes. */ - /* - * The below is from https://github.com/benkasminbullock/unicode-c/, which is 'a - * Unicode library in the programming language C which deals with conversions to - * and from the UTF-8 format', and was written by: - * - * Ben Bullock , + * -=-=-=---=-=-=-=-=-=-=-=-=-=-=-=---=-=-=-=-=-=-=-=-=-=-=-=---=-=-=-=-=-=-=-=-= */ /* - * This macro converts four bytes of UTF-8 into the corresponding code point. + * The below function and macros come from + * https://github.com/benkasminbullock/unicode-c/, which is 'a Unicode library + * in the programming language C which deals with conversions to and from the + * UTF-8 format', and was written by: + * + * Ben Bullock , */ -#define FOUR(x) \ - (((int32_t) (x[0] & 0x07)) << 18) \ - | (((int32_t) (x[1] & 0x3F)) << 12) \ - | (((int32_t) (x[2] & 0x3F)) << 6) \ - | (((int32_t) (x[3] & 0x3F))) - -/* Reject code points which end in either FFFE or FFFF. */ -#define REJECT_FFFF(x) \ - if ((x & 0xFFFF) >= 0xFFFE) { \ - return UNICODE_NOT_CHARACTER; \ - } -/* Reject code points in a certain range. */ -#define REJECT_NOT_CHAR(r) \ - if (r >= UNI_NOT_CHAR_MIN && r <= UNI_NOT_CHAR_MAX) { \ - return UNICODE_NOT_CHARACTER; \ - } - -#define REJECT_FE_FF(c) \ - if (c == 0xFF || c == 0xFE) { \ - return UNICODE_NOT_CHARACTER; \ - } /* Surrogate pair zone. 
*/ #define UNI_SUR_HIGH_START 0xD800 @@ -155,13 +85,6 @@ extern int utf8encode(char *str, unsigned int val); #define UNI_SUR_LOW_START 0xDC00 #define UNI_SUR_LOW_END 0xDFFF -/* Reject surrogates. */ -#define REJECT_SURROGATE(ucs2) \ - if (ucs2 >= UNI_SUR_HIGH_START && ucs2 <= UNI_SUR_LOW_END) { \ - /* Ill-formed. */ \ - return UNICODE_SURROGATE_PAIR; \ - } - /* Start of the "not character" range. */ #define UNI_NOT_CHAR_MIN 0xFDD0 /* End of the "not character" range. */ @@ -171,30 +94,6 @@ extern int utf8encode(char *str, unsigned int val); #define TEN_BITS 10 #define HALF_BASE 0x0010000UL -/* - * The maximum number of bytes we need to contain any Unicode code point as - * UTF-8 as a C string. This length includes one trailing nul byte. - */ -#define UTF8_MAX_LENGTH 5 -/* - * The maximum possible value of a Unicode code point. See - * http://www.cl.cam.ac.uk/~mgk25/unicode.html#ucs. - */ -#define UNICODE_MAXIMUM 0x10ffff -/* The maximum possible value which will fit into four bytes of UTF-8. This is - * larger than UNICODE_MAXIMUM. - */ -#define UNICODE_UTF8_4 0x1fffff -/* - * This return value indicates the successful completion of a routine which - * doesn't use the return value to communicate data back to the caller. - */ -#define UNICODE_OK 0 -/* - * This return value means that the leading byte of a UTF-8 sequence was not - * valid. - */ -#define UTF8_BAD_LEADING_BYTE -1 /* * This return value means the caller attempted to turn a code point for a * surrogate pair to or from UTF-8. @@ -206,25 +105,7 @@ extern int utf8encode(char *str, unsigned int val); * pair. */ #define UNICODE_NOT_SURROGATE_PAIR -3 -/* - * This return value means that input which was supposed to be UTF-8 encoded - * contained an invalid continuation byte. If the leading byte of a UTF-8 - * sequence is not valid, UTF8_BAD_LEADING_BYTE is returned instead of this. 
- */ -#define UTF8_BAD_CONTINUATION_BYTE -4 -/* - * This return value indicates a zero byte was found in a string which was - * supposed to contain UTF-8 bytes. It is returned only by the functions which - * are documented as not allowing zero bytes. - */ -#define UNICODE_EMPTY_INPUT -5 -/* - * This return value indicates that UTF-8 bytes were not in the shortest - * possible form. See http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8. This - * return value is currently unused. If a character is not in the shortest form, - * the error UTF8_BAD_CONTINUATION_BYTE is returned. - */ -#define UTF8_NON_SHORTEST -6 + /* * This return value indicates that there was an attempt to convert a code point * which was greater than UNICODE_MAXIMUM or UNICODE_UTF8_4 into UTF-8 bytes. @@ -237,22 +118,19 @@ extern int utf8encode(char *str, unsigned int val); */ #define UNICODE_NOT_CHARACTER -8 -extern const uint8_t utf8_sequence_len[]; - -/* - * All of the functions in this library return an "int32_t". Negative values are - * used to indicate errors. 
- */ -extern int32_t utf8_to_ucs2 (const uint8_t* input, const uint8_t** end_ptr); extern int32_t surrogates_to_unicode (int32_t hi, int32_t lo); -extern int32_t unicode_count_chars (const uint8_t* utf8); /* - * The above is from https://github.com/benkasminbullock/unicode-c/, which is 'a - * Unicode library in the programming language C which deals with conversions to - * and from the UTF-8 format', and was written by: + * The above macros and function come from + * https://github.com/benkasminbullock/unicode-c/, which is 'a Unicode library + * in the programming language C which deals with conversions to and from the + * UTF-8 format', and was written by: * * Ben Bullock , */ +/* + * -=-=-=---=-=-=-=-=-=-=-=-=-=-=-=---=-=-=-=-=-=-=-=-=-=-=-=---=-=-=-=-=-=-=-=-= + */ + #endif /* INCLUDE_JSON_UTF8_H */ diff --git a/jparse/json_util.c b/jparse/json_util.c index 8e7e38fe..5ee24fd5 100644 --- a/jparse/json_util.c +++ b/jparse/json_util.c @@ -1668,7 +1668,6 @@ vjson_fprint(struct json *node, unsigned int depth, va_list ap) CONVERTED_PARSED_JSON_NODE(item)?"c:":"", item->quote ? "q" : "", item->same ? "=" : "", - item->has_nul ? "0" : "", item->slash ? "/" : "", item->posix_safe ? "P" : "", item->first_alphanum ? 
"a" : "", diff --git a/jparse/jstrdecode.c b/jparse/jstrdecode.c index c83b44a9..c22627b5 100644 --- a/jparse/jstrdecode.c +++ b/jparse/jstrdecode.c @@ -40,11 +40,6 @@ */ #define REQUIRED_ARGS (0) /* number of required arguments on the command line */ -/* - * official jstrdecode version - */ -#define JSTRDECODE_VERSION "1.0.6 2024-10-08" /* format: major.minor YYYY-MM-DD */ - /* * usage message * @@ -221,7 +216,7 @@ jstrdecode_stream(FILE *in_stream) /* * decode data read from input stream */ - buf = json_decode(input, inputlen, &bufsiz, NULL); + buf = json_decode(input, inputlen, &bufsiz); if (buf == NULL) { /* free input */ if (input != NULL) { diff --git a/jparse/jstrdecode.h b/jparse/jstrdecode.h index d2035096..1fc2aa06 100644 --- a/jparse/jstrdecode.h +++ b/jparse/jstrdecode.h @@ -66,6 +66,11 @@ */ #include "version.h" +/* + * official jstrdecode version + */ +#define JSTRDECODE_VERSION "1.2.0 2024-10-09" /* format: major.minor YYYY-MM-DD */ + /* * jstrdecode tool basename diff --git a/jparse/jstrencode.c b/jparse/jstrencode.c index c178ab03..fa18490f 100644 --- a/jparse/jstrencode.c +++ b/jparse/jstrencode.c @@ -40,11 +40,6 @@ */ #define REQUIRED_ARGS (0) /* number of required arguments on the command line */ -/* - * official jstrencode version - */ -#define JSTRENCODE_VERSION "1.1.3 2024-10-08" /* format: major.minor YYYY-MM-DD */ - /* * usage message * diff --git a/jparse/jstrencode.h b/jparse/jstrencode.h index 588830c2..ed5f27c8 100644 --- a/jparse/jstrencode.h +++ b/jparse/jstrencode.h @@ -66,6 +66,11 @@ */ #include "version.h" +/* + * official jstrencode version + */ +#define JSTRENCODE_VERSION "1.2.0 2024-10-09" /* format: major.minor YYYY-MM-DD */ + /* * jstrencode tool basename diff --git a/jparse/run_bison.sh b/jparse/run_bison.sh index 3f7adca2..74eec7c9 100755 --- a/jparse/run_bison.sh +++ b/jparse/run_bison.sh @@ -20,7 +20,7 @@ # setup # -export RUN_BISON_VERSION="1.0.1 2024-03-02" +export RUN_BISON_VERSION="1.2.0 2024-10-09" export 
BISON_BASENAME="bison" export PREFIX="jparse" export SORRY_H="sorry.tm.ca.h" diff --git a/jparse/run_flex.sh b/jparse/run_flex.sh index 8fc2197e..058756ed 100755 --- a/jparse/run_flex.sh +++ b/jparse/run_flex.sh @@ -20,7 +20,7 @@ # setup # -export RUN_FLEX_VERSION="1.0.1 2024-03-02" +export RUN_FLEX_VERSION="1.2.0 2024-10-09" export FLEX_BASENAME="flex" export PREFIX="jparse" export SORRY_H="sorry.tm.ca.h" diff --git a/jparse/test_jparse/is_available.sh b/jparse/test_jparse/is_available.sh index 482ed659..4190e794 100755 --- a/jparse/test_jparse/is_available.sh +++ b/jparse/test_jparse/is_available.sh @@ -16,7 +16,7 @@ # # Share and enjoy! :-) -export VERSION="1.1.1 2024-10-01" +export VERSION="1.2.0 2024-10-09" NAME=$(basename "$0") export NAME export PRINT_WHERE="" diff --git a/jparse/test_jparse/jnum_chk.c b/jparse/test_jparse/jnum_chk.c index 352fdc8b..d4a130e9 100644 --- a/jparse/test_jparse/jnum_chk.c +++ b/jparse/test_jparse/jnum_chk.c @@ -41,11 +41,6 @@ */ #define REQUIRED_ARGS (0) /* number of required arguments on the command line */ -/* - * official jnum_chk version - */ -#define JNUM_CHK_VERSION "1.0.1 2024-03-02" /* format: major.minor YYYY-MM-DD */ - /* * usage message * diff --git a/jparse/test_jparse/jnum_chk.h b/jparse/test_jparse/jnum_chk.h index f88393b3..8714f9aa 100644 --- a/jparse/test_jparse/jnum_chk.h +++ b/jparse/test_jparse/jnum_chk.h @@ -56,7 +56,10 @@ */ #include "../version.h" - +/* + * official jnum_chk version + */ +#define JNUM_CHK_VERSION "1.0.1 2024-03-02" /* format: major.minor YYYY-MM-DD */ /* diff --git a/jparse/test_jparse/jnum_gen.c b/jparse/test_jparse/jnum_gen.c index 0dc32597..db136d67 100644 --- a/jparse/test_jparse/jnum_gen.c +++ b/jparse/test_jparse/jnum_gen.c @@ -42,11 +42,6 @@ #define REQUIRED_ARGS (1) /* number of required arguments on the command readline_buf */ #define CHUNK (16) /* allocate CHUNK elements at a time */ -/* - * official jnum_gen version - */ -#define JNUM_GEN_VERSION "1.0.1 2024-03-02" /* format: 
major.minor YYYY-MM-DD */ - /* * usage message * diff --git a/jparse/test_jparse/jnum_gen.h b/jparse/test_jparse/jnum_gen.h index 04134c6d..b0348bf4 100644 --- a/jparse/test_jparse/jnum_gen.h +++ b/jparse/test_jparse/jnum_gen.h @@ -56,6 +56,10 @@ */ #include "../version.h" +/* + * official jnum_gen version + */ +#define JNUM_GEN_VERSION "1.0.1 2024-03-02" /* format: major.minor YYYY-MM-DD */ /* * jnum_gen tool basename diff --git a/jparse/test_jparse/jparse_test.sh b/jparse/test_jparse/jparse_test.sh index bc331993..d0cda305 100755 --- a/jparse/test_jparse/jparse_test.sh +++ b/jparse/test_jparse/jparse_test.sh @@ -70,7 +70,7 @@ # setup # -export JPARSE_TEST_VERSION="1.0.8 2024-10-08" # version format: major.minor YYYY-MM-DD */ +export JPARSE_TEST_VERSION="1.2.0 2024-10-09" # version format: major.minor YYYY-MM-DD */ export CHK_TEST_FILE="./test_jparse/json_teststr.txt" export CHK_INVALID_TEST_FILE="./test_jparse/json_teststr_fail.txt" export JPARSE_JSON="./jparse.json" diff --git a/jparse/test_jparse/jstr_test.sh b/jparse/test_jparse/jstr_test.sh index 9bc84cca..d631b9aa 100755 --- a/jparse/test_jparse/jstr_test.sh +++ b/jparse/test_jparse/jstr_test.sh @@ -24,7 +24,7 @@ export JSTRDECODE="./jstrdecode" export TEST_FILE="./test_jparse/jstr_test.out" export TEST_FILE2="./test_jparse/jstr_test2.out" export JSTR_TEST_TXT="./test_jparse/jstr_test.txt" -export JSTR_TEST_VERSION="1.0.4 2024-10-08" # version format: major.minor YYYY-MM-DD +export JSTR_TEST_VERSION="1.2.0 2024-10-09" # version format: major.minor YYYY-MM-DD export TOPDIR= export USAGE="usage: $0 [-h] [-V] [-v level] [-e jstrencode] [-d jstrdecode] [-Z topdir] diff --git a/jparse/test_jparse/pr_jparse_test.c b/jparse/test_jparse/pr_jparse_test.c index c981b2a1..14d9443b 100644 --- a/jparse/test_jparse/pr_jparse_test.c +++ b/jparse/test_jparse/pr_jparse_test.c @@ -43,13 +43,6 @@ */ #include "pr_jparse_test.h" - - -/* - * official pr_jparse_test version - */ -#define PR_JPARSE_TEST_VERSION "1.0.3 2024-09-12" 
/* format: major.minor YYYY-MM-DD */ - /* * definitions */ diff --git a/jparse/test_jparse/pr_jparse_test.h b/jparse/test_jparse/pr_jparse_test.h index 0dc49f3b..54e4cad9 100644 --- a/jparse/test_jparse/pr_jparse_test.h +++ b/jparse/test_jparse/pr_jparse_test.h @@ -56,6 +56,12 @@ */ #include "../version.h" +/* + * official pr_jparse_test version + */ +#define PR_JPARSE_TEST_VERSION "1.2.0 2024-10-09" /* format: major.minor YYYY-MM-DD */ + + /* * pr_jparse_test tool basename */ diff --git a/jparse/test_jparse/prep.sh b/jparse/test_jparse/prep.sh index 0ed87c7f..ee3c0f01 100755 --- a/jparse/test_jparse/prep.sh +++ b/jparse/test_jparse/prep.sh @@ -20,7 +20,7 @@ export FAILURE_SUMMARY= export SKIPPED_SUMMARY= export LOGFILE= -export PREP_VERSION="1.0.3 2024-09-30" +export PREP_VERSION="1.2.0 2024-10-09" export NOTICE_COUNT="0" export USAGE="usage: $0 [-h] [-v level] [-V] [-e] [-o] [-m make] [-M Makefile] [-l logfile] diff --git a/jparse/verge.c b/jparse/verge.c index 245e3e69..8b441577 100644 --- a/jparse/verge.c +++ b/jparse/verge.c @@ -41,11 +41,6 @@ */ #define REQUIRED_ARGS (2) /* number of required arguments on the command line */ -/* - * official verge tool version - */ -#define VERGE_VERSION "1.0.1 2024-03-02" /* format: major.minor YYYY-MM-DD */ - /* * usage message * diff --git a/jparse/verge.h b/jparse/verge.h index c6143680..5634ca74 100644 --- a/jparse/verge.h +++ b/jparse/verge.h @@ -40,6 +40,11 @@ */ #include "version.h" +/* + * official verge tool version + */ +#define VERGE_VERSION "1.2.0 2024-10-09" /* format: major.minor YYYY-MM-DD */ + /* * verge tool basename diff --git a/jparse/version.h b/jparse/version.h index a224a25e..a9897ee0 100644 --- a/jparse/version.h +++ b/jparse/version.h @@ -22,7 +22,7 @@ /* * NOTE: only the repo release version, the jparse tool and the JSON parser - * versions are here. For the version of the other tools, see their source file. + * versions are here. For the version of the other tools, see their header file. 
*/ /* @@ -30,17 +30,17 @@ * * NOTE: this should match the latest Release string in CHANGES.md */ -#define JPARSE_REPO_VERSION "1.0.23 2024-10-08" /* format: major.minor YYYY-MM-DD */ +#define JPARSE_REPO_VERSION "1.2.0 2024-10-09" /* format: major.minor YYYY-MM-DD */ /* * official jparse version */ -#define JPARSE_VERSION "1.1.6 2024-09-07" /* format: major.minor YYYY-MM-DD */ +#define JPARSE_VERSION "1.2.0 2024-10-09" /* format: major.minor YYYY-MM-DD */ /* * official JSON parser version */ -#define JSON_PARSER_VERSION "1.1.7 2024-10-08" /* library version format: major.minor YYYY-MM-DD */ +#define JSON_PARSER_VERSION "1.2.0 2024-10-09" /* library version format: major.minor YYYY-MM-DD */ #endif /* INCLUDE_JPARSE_VERSION_H */