Listing the "Blender Foundation" as copyright holder implied the Blender Foundation holds copyright to files which may include work from many developers. While keeping copyright on headers makes sense for isolated libraries, Blender's own code may be refactored or moved between files in a way that makes the per-file copyright holders less meaningful. Copyright references to the "Blender Foundation" have been replaced with "Blender Authors", with the exception of `./extern/`, since this contains libraries which are more isolated; any changes to license headers there can be handled on a case-by-case basis. Some directories in `./intern/` have also been excluded: - `./intern/cycles/`, its own `AUTHORS` file is planned. - `./intern/opensubdiv/`. An "AUTHORS" file has been added, using the Chromium project's authors file as a template. Design task: #110784 Ref !110783.
458 lines
16 KiB
C++
458 lines
16 KiB
C++
/* SPDX-FileCopyrightText: 2023 Blender Authors
|
|
*
|
|
* SPDX-License-Identifier: GPL-2.0-or-later */
|
|
|
|
#include "BLI_array.hh"
|
|
#include "BLI_linear_allocator.hh"
|
|
#include "BLI_multi_value_map.hh"
|
|
#include "BLI_span.hh"
|
|
#include "BLI_string.h"
|
|
#include "BLI_string_ref.hh"
|
|
#include "BLI_string_search.hh"
|
|
#include "BLI_string_utf8.h"
|
|
#include "BLI_string_utf8_symbols.h"
|
|
#include "BLI_timeit.hh"
|
|
|
|
/* Right-arrow separator, once as UTF-8 string and once as Unicode code point.
 * Both have to stay in sync with #UI_MENU_ARROW_SEP in `UI_interface.hh`. */
#define UI_MENU_ARROW_SEP BLI_STR_UTF8_BLACK_RIGHT_POINTING_SMALL_TRIANGLE
#define UI_MENU_ARROW_SEP_UNICODE 0x25b8
|
|
|
|
namespace blender::string_search {
|
|
|
|
/**
 * Count the UTF-8 code points in \a str.
 *
 * Forwards to #BLI_strnlen_utf8 and takes care of the casts between the size type of
 * #StringRef and the `size_t` based C string API.
 */
static int64_t count_utf8_code_points(StringRef str)
{
  const size_t size_in_bytes = size_t(str.size());
  const size_t code_point_count = BLI_strnlen_utf8(str.data(), size_in_bytes);
  return int64_t(code_point_count);
}
|
|
|
|
/**
 * Compute the edit distance between \a a and \a b, measured in Unicode code points.
 *
 * Deleting, inserting or substituting a single code point, as well as swapping two adjacent
 * code points, costs 1 each.
 *
 * \return The minimal total cost of the edits that turn one string into the other.
 */
int damerau_levenshtein_distance(StringRef a, StringRef b)
{
  constexpr int deletion_cost = 1;
  constexpr int insertion_cost = 1;
  constexpr int substitution_cost = 1;
  constexpr int transposition_cost = 1;

  /* The table below is indexed with `int`, so make the narrowing explicit. */
  const int size_a = int(count_utf8_code_points(a));
  const int size_b = int(count_utf8_code_points(b));

  /* Instead of keeping the entire table in memory, only keep three rows. The algorithm only
   * accesses these rows and nothing older.
   * All three rows are usually allocated on the stack. At most a single heap allocation is done,
   * if the reserved stack space is too small. */
  const int row_length = size_b + 1;
  Array<int, 64> rows(row_length * 3);

  /* Store rows as spans so that it is cheap to swap them. */
  MutableSpan<int> v0(rows.data() + row_length * 0, row_length);
  MutableSpan<int> v1(rows.data() + row_length * 1, row_length);
  MutableSpan<int> v2(rows.data() + row_length * 2, row_length);

  /* Only v1 needs to be initialized: it is the row for the empty prefix of `a`. */
  for (const int i : IndexRange(row_length)) {
    v1[i] = i * insertion_cost;
  }

  /* The "previous" code points are only read once a previous code point exists (`i > 0` and
   * `j > 0`). They are still initialized so that no indeterminate value can ever be read. */
  uint32_t prev_unicode_a = 0;
  size_t offset_a = 0;
  for (const int i : IndexRange(size_a)) {
    v2[0] = (i + 1) * deletion_cost;

    const uint32_t unicode_a = BLI_str_utf8_as_unicode_step(a.data(), a.size(), &offset_a);

    uint32_t prev_unicode_b = 0;
    size_t offset_b = 0;
    for (const int j : IndexRange(size_b)) {
      const uint32_t unicode_b = BLI_str_utf8_as_unicode_step(b.data(), b.size(), &offset_b);

      /* Check how costly the different operations would be and pick the cheapest - the one with
       * minimal cost. */
      const int current_substitution_cost = (unicode_a != unicode_b) ? substitution_cost : 0;
      int new_cost = std::min({v1[j + 1] + deletion_cost,
                               v2[j] + insertion_cost,
                               v1[j] + current_substitution_cost});

      /* A transposition is possible when the last two code points of both prefixes are the same
       * pair in swapped order. */
      const bool has_previous_code_points = i > 0 && j > 0;
      if (has_previous_code_points && unicode_a == prev_unicode_b && prev_unicode_a == unicode_b)
      {
        new_cost = std::min(new_cost, v0[j - 1] + transposition_cost);
      }

      v2[j + 1] = new_cost;
      prev_unicode_b = unicode_b;
    }

    /* Rotate the three rows, so that the next row can be computed: the oldest row is recycled
     * as storage for the next one. */
    const MutableSpan<int> recycled_row = v0;
    v0 = v1;
    v1 = v2;
    v2 = recycled_row;

    prev_unicode_a = unicode_a;
  }

  return v1.last();
}
|
|
|
|
/**
 * Check whether \a query approximately occurs somewhere inside \a full.
 *
 * A window of code points is moved over \a full and compared against the query with
 * #damerau_levenshtein_distance.
 *
 * \return 0 when \a query is a sub-string of \a full, the distance of the first window that is
 * close enough otherwise, or -1 when no acceptable window exists.
 */
int get_fuzzy_match_errors(StringRef query, StringRef full)
{
  /* An exact sub-string match has no errors, no further work is necessary. */
  if (full.find(query) != StringRef::not_found) {
    return 0;
  }

  const int query_size = int(count_utf8_code_points(query));
  const int full_size = int(count_utf8_code_points(full));

  /* A single code point that is not contained in the full string can never match. */
  if (query_size == 1) {
    return -1;
  }
  BLI_assert(query.size() >= 2);

  /* Longer queries are allowed to contain more errors. */
  int max_errors = 0;
  if (query_size > 1) {
    max_errors = query_size / 8 + 1;
  }

  /* A query that is much longer than the full string cannot match. */
  if (query_size - full_size > max_errors) {
    return -1;
  }

  const char *query_chars = query.data();
  const uint32_t query_first_unicode = BLI_str_utf8_as_unicode(query_chars);
  const uint32_t query_second_unicode = BLI_str_utf8_as_unicode(query_chars +
                                                                BLI_str_utf8_size(query_chars));

  const char *full_end = full.end();

  const int window_size = std::min(query_size + max_errors, full_size);
  const int extra_chars = window_size - query_size;
  const int max_acceptable_distance = max_errors + extra_chars;

  /* The first window starts at the beginning of the full string and spans `window_size` code
   * points. */
  const char *window_begin = full.begin();
  const char *window_end = window_begin;
  for (int remaining = window_size; remaining > 0; remaining--) {
    window_end += BLI_str_utf8_size(window_end);
  }

  for (;;) {
    const StringRef window{window_begin, window_end};
    const uint32_t window_begin_unicode = BLI_str_utf8_as_unicode(window_begin);

    /* The expensive distance is only computed for windows that start with the first or second
     * code point of the query. For all other windows the distance stays zero, which results in
     * the smallest possible step below. */
    int distance = 0;
    if (ELEM(window_begin_unicode, query_first_unicode, query_second_unicode)) {
      distance = damerau_levenshtein_distance(query, window);
      if (distance <= max_acceptable_distance) {
        return distance;
      }
    }

    /* The window reached the end of the full string without finding a match. */
    if (window_end == full_end) {
      return -1;
    }

    /* With a very large distance, a couple of code points can be skipped at once, because the
     * distance cannot shrink to an acceptable value that quickly. */
    int steps_left = std::max(1, distance / 2);
    while (steps_left > 0 && window_end < full_end) {
      window_begin += BLI_str_utf8_size(window_begin);
      window_end += BLI_str_utf8_size(window_end);
      steps_left--;
    }
  }
}
|
|
|
|
/* Value stored in a word match map for result words that have not been matched to any query
 * word yet. */
static constexpr int unused_word = -1;
|
|
|
|
/**
|
|
* Takes a query and tries to match it with the first characters of some words. For example, "msfv"
|
|
* matches "Mark Sharp from Vertices". Multiple letters of the beginning of a word can be matched
|
|
* as well. For example, "seboulo" matches "select boundary loop". The order of words is important.
|
|
* So "bose" does not match "select boundary". However, individual words can be skipped. For
|
|
* example, "rocc" matches "rotate edge ccw".
|
|
*
|
|
* \return true when the match was successful.
|
|
* If it was successful, the used words are tagged in \a r_word_is_matched.
|
|
*/
|
|
static bool match_word_initials(StringRef query,
|
|
Span<StringRef> words,
|
|
Span<int> word_match_map,
|
|
MutableSpan<bool> r_word_is_matched,
|
|
int start = 0)
|
|
{
|
|
if (start >= words.size()) {
|
|
return false;
|
|
}
|
|
|
|
r_word_is_matched.fill(false);
|
|
|
|
size_t query_index = 0;
|
|
int word_index = start;
|
|
size_t char_index = 0;
|
|
|
|
int first_found_word_index = -1;
|
|
|
|
while (query_index < query.size()) {
|
|
const uint query_unicode = BLI_str_utf8_as_unicode_step(
|
|
query.data(), query.size(), &query_index);
|
|
while (true) {
|
|
/* We are at the end of words, no complete match has been found yet. */
|
|
if (word_index >= words.size()) {
|
|
if (first_found_word_index >= 0) {
|
|
/* Try starting to match at another word. In some cases one can still find matches this
|
|
* way. */
|
|
return match_word_initials(
|
|
query, words, word_match_map, r_word_is_matched, first_found_word_index + 1);
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/* Skip words that the caller does not want us to use. */
|
|
if (word_match_map[word_index] != unused_word) {
|
|
word_index++;
|
|
BLI_assert(char_index == 0);
|
|
continue;
|
|
}
|
|
|
|
StringRef word = words[word_index];
|
|
/* Try to match the current character with the current word. */
|
|
if (int(char_index) < word.size()) {
|
|
const uint32_t char_unicode = BLI_str_utf8_as_unicode_step(
|
|
word.data(), word.size(), &char_index);
|
|
if (query_unicode == char_unicode) {
|
|
r_word_is_matched[word_index] = true;
|
|
if (first_found_word_index == -1) {
|
|
first_found_word_index = word_index;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Could not find a match in the current word, go to the beginning of the next word. */
|
|
word_index += 1;
|
|
char_index = 0;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/**
 * Find the shortest word that begins with \a query. Words whose entry in \a word_match_map is
 * not #unused_word are ignored. If multiple words have the same minimal size, the first one
 * wins.
 *
 * \return Index into \a words, or -1 when no word starts with the query.
 */
static int get_shortest_word_index_that_startswith(StringRef query,
                                                   Span<StringRef> words,
                                                   Span<int> word_match_map)
{
  int shortest_size = INT32_MAX;
  int shortest_index = -1;
  for (const int candidate_index : words.index_range()) {
    const bool is_available = word_match_map[candidate_index] == unused_word;
    if (!is_available) {
      continue;
    }
    const StringRef candidate = words[candidate_index];
    if (!candidate.startswith(query)) {
      continue;
    }
    /* Strict comparison, so that an earlier word of the same size is kept. */
    if (candidate.size() < shortest_size) {
      shortest_index = candidate_index;
      shortest_size = int(candidate.size());
    }
  }
  return shortest_index;
}
|
|
|
|
/**
 * Find the first word that fuzzy matches \a query according to #get_fuzzy_match_errors. Words
 * whose entry in \a word_match_map is not #unused_word are ignored.
 *
 * \param r_error_count: Receives the number of errors of the found match. It is left untouched
 * when nothing matches.
 * \return Index into \a words, or -1 when no word matches.
 */
static int get_word_index_that_fuzzy_matches(StringRef query,
                                             Span<StringRef> words,
                                             Span<int> word_match_map,
                                             int *r_error_count)
{
  for (const int candidate_index : words.index_range()) {
    if (word_match_map[candidate_index] == unused_word) {
      const StringRef candidate = words[candidate_index];
      const int errors = get_fuzzy_match_errors(query, candidate);
      /* A negative value means that the candidate does not match at all. */
      if (errors >= 0) {
        *r_error_count = errors;
        return candidate_index;
      }
    }
  }
  return -1;
}
|
|
|
|
/**
|
|
* Checks how well the query matches a result. If it does not match, -1 is returned. A positive
|
|
* return value indicates how good the match is. The higher the value, the better the match.
|
|
*/
|
|
static int score_query_against_words(Span<StringRef> query_words, Span<StringRef> result_words)
|
|
{
|
|
/* A mapping from #result_words to #query_words. It's mainly used to determine if a word has been
|
|
* matched already to avoid matching it again. */
|
|
Array<int, 64> word_match_map(result_words.size(), unused_word);
|
|
|
|
/* Start with some high score, because otherwise the final score might become negative. */
|
|
int total_match_score = 1000;
|
|
|
|
for (const int query_word_index : query_words.index_range()) {
|
|
const StringRef query_word = query_words[query_word_index];
|
|
{
|
|
/* Check if any result word begins with the query word. */
|
|
const int word_index = get_shortest_word_index_that_startswith(
|
|
query_word, result_words, word_match_map);
|
|
if (word_index >= 0) {
|
|
total_match_score += 10;
|
|
word_match_map[word_index] = query_word_index;
|
|
continue;
|
|
}
|
|
}
|
|
{
|
|
/* Try to match against word initials. */
|
|
Array<bool, 64> matched_words(result_words.size());
|
|
const bool success = match_word_initials(
|
|
query_word, result_words, word_match_map, matched_words);
|
|
if (success) {
|
|
total_match_score += 3;
|
|
for (const int i : result_words.index_range()) {
|
|
if (matched_words[i]) {
|
|
word_match_map[i] = query_word_index;
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
{
|
|
/* Fuzzy match against words. */
|
|
int error_count = 0;
|
|
const int word_index = get_word_index_that_fuzzy_matches(
|
|
query_word, result_words, word_match_map, &error_count);
|
|
if (word_index >= 0) {
|
|
total_match_score += 3 - error_count;
|
|
word_match_map[word_index] = query_word_index;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/* Couldn't match query word with anything. */
|
|
return -1;
|
|
}
|
|
|
|
{
|
|
/* Add penalty when query words are not in the correct order. */
|
|
Vector<int> match_indices;
|
|
for (const int index : word_match_map) {
|
|
if (index != unused_word) {
|
|
match_indices.append(index);
|
|
}
|
|
}
|
|
if (!match_indices.is_empty()) {
|
|
for (const int i : IndexRange(match_indices.size() - 1)) {
|
|
if (match_indices[i] > match_indices[i + 1]) {
|
|
total_match_score -= 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return total_match_score;
|
|
}
|
|
|
|
/**
 * Splits \a str into words and appends them to \a r_words.
 *
 * Words are separated by spaces, slashes and the menu arrow separator. The appended words
 * reference a copy of the string that is owned by \a allocator and that has been passed
 * through #BLI_str_tolower_ascii.
 */
void extract_normalized_words(StringRef str,
                              LinearAllocator<> &allocator,
                              Vector<StringRef, 64> &r_words)
{
  const uint32_t unicode_space = uint32_t(' ');
  const uint32_t unicode_slash = uint32_t('/');
  const uint32_t unicode_right_triangle = UI_MENU_ARROW_SEP_UNICODE;

  BLI_assert(unicode_space == BLI_str_utf8_as_unicode(" "));
  BLI_assert(unicode_slash == BLI_str_utf8_as_unicode("/"));
  BLI_assert(unicode_right_triangle == BLI_str_utf8_as_unicode(UI_MENU_ARROW_SEP));

  auto is_separator = [&](const uint32_t unicode) -> bool {
    return unicode == unicode_space || unicode == unicode_slash ||
           unicode == unicode_right_triangle;
  };

  /* The words reference an editable copy of the string, so that it can be normalized. */
  const size_t str_size_in_bytes = size_t(str.size());
  StringRef str_copy = allocator.copy_string(str);
  char *mutable_copy = const_cast<char *>(str_copy.data());
  BLI_str_tolower_ascii(mutable_copy, str_size_in_bytes);

  /* Walk over all unicode code points and emit a word whenever a separator ends one. */
  bool is_in_word = false;
  size_t word_start = 0;
  size_t offset = 0;
  while (offset < str_size_in_bytes) {
    /* Decoding advances `next_offset` to the start of the following code point. */
    size_t next_offset = offset;
    const uint32_t unicode = BLI_str_utf8_as_unicode_step(str.data(), str.size(), &next_offset);
    const bool at_separator = is_separator(unicode);

    if (at_separator && is_in_word) {
      /* The current word ends right before this separator. */
      r_words.append(str_copy.substr(int(word_start), int(offset - word_start)));
      is_in_word = false;
    }
    else if (!at_separator && !is_in_word) {
      /* A new word begins at this code point. */
      word_start = offset;
      is_in_word = true;
    }

    offset = next_offset;
  }

  /* A word at the very end is not terminated by a separator and is added here. */
  if (is_in_word) {
    r_words.append(str_copy.drop_prefix(int(word_start)));
  }
}
|
|
|
|
void StringSearchBase::add_impl(const StringRef str, void *user_data, const int weight)
|
|
{
|
|
Vector<StringRef, 64> words;
|
|
string_search::extract_normalized_words(str, allocator_, words);
|
|
items_.append(
|
|
{allocator_.construct_array_copy(words.as_span()), int(str.size()), user_data, weight});
|
|
}
|
|
|
|
/**
 * Filters and sorts all added items based on \a query.
 *
 * \return The user data of all items that match the query. Items with a higher score come
 * first.
 */
Vector<void *> StringSearchBase::query_impl(const StringRef query) const
{
  LinearAllocator<> allocator;
  Vector<StringRef, 64> query_words;
  string_search::extract_normalized_words(query, allocator, query_words);

  /* Group the indices of all matching items by their score. */
  MultiValueMap<int, int> result_indices_by_score;
  for (const int item_index : items_.index_range()) {
    const int score = string_search::score_query_against_words(
        query_words, items_[item_index].normalized_words);
    if (score < 0) {
      /* A negative score means that the item does not match. */
      continue;
    }
    result_indices_by_score.add(score, item_index);
  }

  /* Gather the scores that occurred, best score first. */
  Vector<int> found_scores;
  for (const int score : result_indices_by_score.keys()) {
    found_scores.append(score);
  }
  std::sort(found_scores.begin(), found_scores.end(), [](const int a, const int b) {
    return a > b;
  });

  /* Concatenate the groups in score order. Items that share a score stay in the order in which
   * they have been added to the search, except for the group with the best score. */
  Vector<int> sorted_result_indices;
  for (const int score : found_scores) {
    MutableSpan<int> indices = result_indices_by_score.lookup(score);
    const bool is_best_score = score == found_scores[0];
    if (is_best_score && !query.is_empty()) {
      /* Shorter items are more likely the ones that are searched for. Sorting by length also
       * moves exact matches to the top, even if the query is a sub-string of another item. */
      const auto is_shorter = [&](const int a, const int b) {
        return items_[a].length < items_[b].length;
      };
      std::sort(indices.begin(), indices.end(), is_shorter);

      /* Items with larger weights are preferred. `stable_sort` keeps the length based order of
       * items that have the same weight. */
      const auto is_heavier = [&](const int a, const int b) {
        return items_[a].weight > items_[b].weight;
      };
      std::stable_sort(indices.begin(), indices.end(), is_heavier);
    }
    sorted_result_indices.extend(indices);
  }

  /* Map the sorted item indices to the user data that is returned to the caller. */
  Vector<void *> sorted_data(sorted_result_indices.size());
  for (const int i : sorted_result_indices.index_range()) {
    const SearchItem &item = items_[sorted_result_indices[i]];
    sorted_data[i] = item.user_data;
  }
  return sorted_data;
}
|
|
|
|
} // namespace blender::string_search
|