BLI_string: add BLI_str_utf8_invalid_substitute

Similar to BLI_str_utf8_invalid_strip, except that each invalid byte is
replaced with a caller-supplied substitute character instead of being
removed, so the string length doesn't change.

Useful for displaying strings that include invalid UTF8 code-points.
This commit is contained in:
Campbell Barton
2025-06-26 07:48:05 +00:00
parent 40eaaf089f
commit c5bae85893
3 changed files with 78 additions and 2 deletions

View File

@@ -31,13 +31,23 @@ size_t BLI_strncpy_utf8_rlen_unterminated(char *__restrict dst,
*/
ptrdiff_t BLI_str_utf8_invalid_byte(const char *str, size_t str_len) ATTR_NONNULL(1);
/**
* Remove any invalid UTF8 byte (taking into account multi-byte sequences).
*
* \param str: a null terminated string.
* \param str_len: the result of `strlen(str)`.
* \return number of stripped bytes.
*/
int BLI_str_utf8_invalid_strip(char *str, size_t str_len) ATTR_NONNULL(1);
/**
* Substitute any invalid UTF8 byte with `substitute` (taking into account multi-byte sequences).
* The string is modified in-place and its length remains unchanged.
*
* \param str: a null terminated string.
* \param str_len: the result of `strlen(str)`.
* \param substitute: the byte written over each invalid byte, must be non-zero.
* \return number of bytes replaced.
*/
int BLI_str_utf8_invalid_substitute(char *str, size_t str_len, const char substitute)
ATTR_NONNULL(1);
/**
* \return The size (in bytes) of a single UTF8 char.

View File

@@ -307,6 +307,25 @@ int BLI_str_utf8_invalid_strip(char *str, size_t str_len)
return tot;
}
/* Overwrite every byte reported by #BLI_str_utf8_invalid_byte with `substitute`,
 * returning how many bytes were overwritten. The string length is not changed. */
int BLI_str_utf8_invalid_substitute(char *str, size_t str_len, const char substitute)
{
  /* A zero substitute is rejected (debug builds only). */
  BLI_assert(substitute);
  /* `str_len` must address the string's null terminator. */
  BLI_assert(str[str_len] == '\0');

  int replaced_num = 0;
  for (;;) {
    const ptrdiff_t offset = BLI_str_utf8_invalid_byte(str, str_len);
    if (offset == -1) {
      /* Nothing invalid reported in the remaining range. */
      break;
    }
    str[offset] = substitute;

    /* Resume scanning on the byte immediately after the one just replaced. */
    const size_t step = size_t(offset) + 1;
    str += step;
    str_len -= step;
    replaced_num++;
  }
  return replaced_num;
}
/**
* Internal utility for implementing #BLI_strncpy_utf8 / #BLI_strncpy_utf8_rlen.
*

View File

@@ -319,7 +319,7 @@ static const char *utf8_invalid_tests[][3] = {
/* clang-format on */
/* BLI_str_utf8_invalid_strip (and indirectly, BLI_str_utf8_invalid_byte). */
TEST(string, Utf8InvalidBytes)
TEST(string, Utf8InvalidBytesStrip)
{
for (int i = 0; utf8_invalid_tests[i][0] != nullptr; i++) {
const char *tst = utf8_invalid_tests[i][0];
@@ -337,6 +337,53 @@ TEST(string, Utf8InvalidBytes)
}
}
/* Run #BLI_str_utf8_invalid_substitute over every entry of `utf8_invalid_tests`
 * (this indirectly exercises #BLI_str_utf8_invalid_byte as well). */
TEST(string, Utf8InvalidBytesSubstitute)
{
  /* The table is terminated by an entry whose first string is null. */
  for (int test_index = 0; utf8_invalid_tests[test_index][0] != nullptr; test_index++) {
    const char *const *test_row = utf8_invalid_tests[test_index];
    const char *src = test_row[0];
    /* The expected error count is stored as the first byte of the third string. */
    const int expected_num = int(test_row[2][0]);

    /* Work on a mutable copy, the substitution is done in-place.
     * NOTE(review): assumes every source string spans at least `sizeof(work)` bytes
     * including its terminator - confirm against the table. */
    char work[80];
    const size_t work_len = sizeof(work) - 1;
    memcpy(work, src, sizeof(work));

    const int replaced_num = BLI_str_utf8_invalid_substitute(work, work_len, '?');

    /* Every expected error was replaced. */
    EXPECT_EQ(replaced_num, expected_num);
    /* No invalid byte is reported afterwards. */
    EXPECT_EQ(BLI_str_utf8_invalid_byte(work, work_len), -1);
    /* The length is untouched. */
    EXPECT_EQ(strlen(work), work_len);
  }
}
/* Check the exact output of #BLI_str_utf8_invalid_substitute for small hand-written
 * byte patterns, using '?' as the substitute. */
TEST(string, Utf8InvalidBytesSubstitutePatterns)
{
/* Substitute in a mutable copy of `src_chars` (a braced, null terminated list of bytes),
 * then compare both the returned count and the resulting string. */
#define TEST_SIMPLE(src_chars, expected_error_count, expected_str) \
  { \
    char buff[] = src_chars; \
    EXPECT_EQ(BLI_str_utf8_invalid_substitute(buff, strlen(buff), '?'), expected_error_count); \
    EXPECT_STREQ(buff, expected_str); \
  } \
  ((void)0)

/* Lets a braced initializer that contains commas be passed as a single macro argument. */
#define ARRAY_ARG(...) __VA_ARGS__

  /* NOTE: bytes above 0x7f are written as `char(...)`. A bare constant such as `0xff` is an
   * `int` that doesn't fit a signed `char`, which makes it a narrowing conversion inside a
   * braced initializer (ill-formed, rejected by compilers where `char` is signed). */

  /* Empty string. */
  TEST_SIMPLE(ARRAY_ARG({0x0}), 0, "");

  /* One good. */
  TEST_SIMPLE(ARRAY_ARG({'A', 0x0}), 0, "A");

  /* One bad. */
  TEST_SIMPLE(ARRAY_ARG({char(0xff), 0x0}), 1, "?");

  /* Additional patterns. */
  TEST_SIMPLE(ARRAY_ARG({char(0xe0), char(0xef), 0x0}), 2, "??");
  TEST_SIMPLE(ARRAY_ARG({char(0xe0), 'A', 0x0}), 1, "?A");
  TEST_SIMPLE(ARRAY_ARG({'A', char(0xef), 0x0}), 1, "A?");
  TEST_SIMPLE(ARRAY_ARG({char(0xe0), 'A', char(0xed), 0x0}), 2, "?A?");

#undef ARRAY_ARG
#undef TEST_SIMPLE
}
/** \} */
/* -------------------------------------------------------------------- */