BLI_string: add BLI_str_utf8_invalid_substitute

Similar to BLI_str_utf8_invalid_strip except that it substitutes invalid characters and doesn't change the string length. Useful for displaying strings that include invalid UTF8 code-points.
2025-06-26 07:48:05 +00:00
parent 40eaaf089f
commit c5bae85893
3 changed files with 78 additions and 2 deletions
--- a/source/blender/blenlib/BLI_string_utf8.h
+++ b/source/blender/blenlib/BLI_string_utf8.h
@@ -31,13 +31,23 @@ size_t BLI_strncpy_utf8_rlen_unterminated(char *__restrict dst,
 */
 ptrdiff_t BLI_str_utf8_invalid_byte(const char *str, size_t str_len) ATTR_NONNULL(1);
 /**
- * Remove any invalid UTF8 byte (taking into account multi-bytes sequence of course).
+ * Remove any invalid UTF8 byte (taking into account multi-byte sequences).
 *
 * \param str: a null terminated string.
 * \param str_len: the result of `strlen(str)`.
 * \return number of stripped bytes.
 */
 int BLI_str_utf8_invalid_strip(char *str, size_t str_len) ATTR_NONNULL(1);
+/**
+ * Substitute any invalid UTF8 byte with `substitute` (taking into account multi-byte sequences),
+ * the string is modified in-place and its length remains unchanged.
+ * \param str: a null terminated string.
+ * \param str_len: the result of `strlen(str)`.
+ * \param substitute: the replacement byte, asserted to be non-zero.
+ * \return number of bytes replaced.
+ */
+int BLI_str_utf8_invalid_substitute(char *str, size_t str_len, const char substitute)
+    ATTR_NONNULL(1);

 /**
 * \return The size (in bytes) of a single UTF8 char.
--- a/source/blender/blenlib/intern/string_utf8.cc
+++ b/source/blender/blenlib/intern/string_utf8.cc
@@ -307,6 +307,25 @@ int BLI_str_utf8_invalid_strip(char *str, size_t str_len)
  return tot;
 }

+int BLI_str_utf8_invalid_substitute(char *str, size_t str_len, const char substitute)
+{
+  BLI_assert(substitute);
+  BLI_assert(str[str_len] == '\0');
+  int substituted_num = 0;
+  for (;;) {
+    const ptrdiff_t invalid_offset = BLI_str_utf8_invalid_byte(str, str_len);
+    if (invalid_offset == -1) {
+      break;
+    }
+    str[invalid_offset] = substitute;
+    /* Resume the search just past the byte that was replaced. */
+    str += invalid_offset + 1;
+    str_len -= size_t(invalid_offset + 1);
+    substituted_num++;
+  }
+  return substituted_num;
+}
+
 /**
 * Internal utility for implementing #BLI_strncpy_utf8 / #BLI_strncpy_utf8_rlen.
 *
--- a/source/blender/blenlib/tests/BLI_string_utf8_test.cc
+++ b/source/blender/blenlib/tests/BLI_string_utf8_test.cc
@@ -319,7 +319,7 @@ static const char *utf8_invalid_tests[][3] = {
 /* clang-format on */

 /* BLI_str_utf8_invalid_strip (and indirectly, BLI_str_utf8_invalid_byte). */
-TEST(string, Utf8InvalidBytes)
+TEST(string, Utf8InvalidBytesStrip)
 {
  for (int i = 0; utf8_invalid_tests[i][0] != nullptr; i++) {
    const char *tst = utf8_invalid_tests[i][0];
@@ -337,6 +337,53 @@ TEST(string, Utf8InvalidBytes)
  }
 }

+/* Exercise #BLI_str_utf8_invalid_substitute (and with it #BLI_str_utf8_invalid_byte). */
+TEST(string, Utf8InvalidBytesSubstitute)
+{
+  int index = 0;
+  while (const char *src = utf8_invalid_tests[index][0]) {
+    /* The first byte of the third column stores the expected number of invalid bytes. */
+    const int expected_num = int(utf8_invalid_tests[index][2][0]);
+    char dst[80];
+    const size_t dst_len = sizeof(dst) - 1;
+    memcpy(dst, src, sizeof(dst));
+    const int replaced_num = BLI_str_utf8_invalid_substitute(dst, dst_len, '?');
+    EXPECT_EQ(replaced_num, expected_num);
+    EXPECT_EQ(BLI_str_utf8_invalid_byte(dst, dst_len), -1);
+    EXPECT_EQ(strlen(dst), dst_len);
+    index++;
+  }
+}
+
+TEST(string, Utf8InvalidBytesSubstitutePatterns)
+{
+/* Pass a brace-enclosed initializer (which contains commas) as a single macro argument. */
+#define ARRAY_ARG(...) __VA_ARGS__
+/* Substitute with '?' in a copy of `bytes`, then check the returned count & resulting string. */
+#define TEST_SIMPLE(bytes, errors_num, result_str) \
+  { \
+    char str[] = bytes; \
+    const int errors_found_num = BLI_str_utf8_invalid_substitute(str, strlen(str), '?'); \
+    EXPECT_EQ(errors_found_num, errors_num); \
+    EXPECT_STREQ(str, result_str); \
+  } \
+  ((void)0)
+
+  /* NOTE(review): bytes above 0x7f narrow when `char` is signed, confirm compilers accept this. */
+  /* Nothing to substitute: an empty string & a single valid character. */
+  TEST_SIMPLE(ARRAY_ARG({0x0}), 0, "");
+  TEST_SIMPLE(ARRAY_ARG({'A', 0x0}), 0, "A");
+  /* A single invalid byte. */
+  TEST_SIMPLE(ARRAY_ARG({0xff, 0x0}), 1, "?");
+  /* Invalid bytes next to each other and on either side of a valid character. */
+  TEST_SIMPLE(ARRAY_ARG({0xe0, 0xef, 0x0}), 2, "??");
+  TEST_SIMPLE(ARRAY_ARG({0xe0, 'A', 0x0}), 1, "?A");
+  TEST_SIMPLE(ARRAY_ARG({'A', 0xef, 0x0}), 1, "A?");
+  TEST_SIMPLE(ARRAY_ARG({0xe0, 'A', 0xed, 0x0}), 2, "?A?");
+#undef TEST_SIMPLE
+#undef ARRAY_ARG
+}
+
 /** \} */

 /* -------------------------------------------------------------------- */