Refactor: path normalize now collapses multiple '..' directories at once

- Avoid a separate memmove call for each `..`.
- Avoid ambiguous path stepping, where separator literals
  needed to be checked to avoid fence post errors.
- Correct & update the doc-string.
This commit is contained in:
Campbell Barton
2023-04-28 11:54:26 +10:00
parent d36af92e5f
commit 3ad82ec5a4
3 changed files with 148 additions and 71 deletions

View File

@@ -321,12 +321,25 @@ void BLI_path_sequence_encode(
char *string, const char *head, const char *tail, unsigned short numlen, int pic);
/**
* Remove redundant characters from \a path and optionally make absolute.
* Remove redundant characters from \a path.
*
* \param path: Can be any input, and this function converts it to a regular full path.
* Also removes garbage from directory paths, like `/../` or double slashes etc.
* The following operations are performed:
* - Redundant path components such as `//`, `/./` & `./` (prefix) are stripped.
* (with the exception of `//` prefix used for blend-file relative paths).
* - `..` are resolved so `<parent>/../<child>/` resolves to `<child>/`.
* Note that the resulting path may begin with `..` if it's relative.
*
* \note \a path isn't protected for max string names.
* Details:
* - The slash direction is expected to be native (see #SEP).
* When calculating canonical paths you may need to run #BLI_path_slash_native first.
* #BLI_path_cmp_normalized can be used for canonical path comparison.
* - Trailing slashes are left intact (unlike Python which strips them).
* - Handling paths beginning with `..` depends on them being absolute or relative.
* For absolute paths they are removed (e.g. `/../path` becomes `/path`).
* For relative paths they are kept as it's valid to reference paths above a relative location
* such as `//../parent` or `../parent`.
*
* \param path: The path to a file or directory which can be absolute or relative.
*/
void BLI_path_normalize(char *path) ATTR_NONNULL(1);
/**

View File

@@ -115,13 +115,12 @@ void BLI_path_sequence_encode(
void BLI_path_normalize(char *path)
{
const char *path_orig = path;
int path_len;
ptrdiff_t a;
char *start, *eind;
path_len = strlen(path);
int path_len = strlen(path);
/*
* Skip absolute prefix.
* ---------------------
*/
if (path[0] == '/' && path[1] == '/') {
path = path + 2; /* Leave the initial `//` untouched. */
path_len -= 2;
@@ -157,10 +156,14 @@ void BLI_path_normalize(char *path)
}
}
#endif /* WIN32 */
/* Works on WIN32 as well, because the drive component is skipped. */
const bool is_relative = path[0] && (path[0] != SEP);
/*
* Strip redundant path components.
* --------------------------------
*/
/* NOTE(@ideasman42):
* `memmove(start, eind, strlen(eind) + 1);`
* is the same as
@@ -189,7 +192,6 @@ void BLI_path_normalize(char *path)
else {
break;
}
} while (i > 0);
if (i < i_end) {
@@ -200,8 +202,7 @@ void BLI_path_normalize(char *path)
}
}
}
/* Remove redundant `./` prefix, while it could be kept, it confuses the loop below. */
/* Remove the `./` prefix as it's redundant & complicates collapsing directories. */
if (is_relative) {
if ((path_len > 2) && (path[0] == '.') && (path[1] == SEP)) {
memmove(path, path + 2, (path_len - 2) + 1);
@@ -209,69 +210,127 @@ void BLI_path_normalize(char *path)
}
}
const ptrdiff_t a_start = is_relative ? 0 : 1;
start = path;
while ((start = strstr(start, SEP_STR ".."))) {
if (!ELEM(start[3], SEP, '\0')) {
start += 3;
continue;
}
/*
* Collapse Parent Directories.
* ----------------------------
*
* Example: `<parent>/<child>/../` -> `<parent>/`
*
* Notes:
* - Leading `../` are skipped as they cannot be collapsed (see `start_base`).
* - Multiple parent directories are handled at once to reduce the number of `memmove` calls.
*/
a = (start - path) - 1;
if (a >= a_start) {
/* `<prefix>/<parent>/../<postfix> => <prefix>/<postfix>`. */
eind = start + (4 - 1) /* `strlen("/../") - 1` */; /* Strip "/.." and keep the char after. */
while (a > 0 && path[a] != SEP) { /* Find start of `<parent>`. */
a--;
}
#define IS_PARENT_DIR(p) ((p)[0] == '.' && (p)[1] == '.' && ELEM((p)[2], SEP, '\0'))
if (is_relative && (a == 0) && *eind) {
/* When the path does not start with a slash, don't copy the first `/` to the destination
* as it will make a relative path into an absolute path. */
eind += 1;
}
const size_t eind_len = path_len - (eind - path);
BLI_assert(eind_len == strlen(eind));
/* Only remove the parent if it's not also a `..`. */
if (is_relative && STRPREFIX(path + ((path[a] == SEP) ? a + 1 : a), ".." SEP_STR)) {
start += 3 /* `strlen("/..")` */;
}
else {
start = path + a;
BLI_assert(start < eind);
memmove(start, eind, eind_len + 1);
path_len -= (eind - start);
BLI_assert(strlen(path) == path_len);
BLI_assert(!is_relative || (path[0] != SEP));
}
}
else {
/* Support for odd paths: eg `/../home/me` --> `/home/me`
* this is a valid path in blender but we can't handle this the usual way below
* simply strip this prefix then evaluate the path as usual.
* Python's `os.path.normpath()` does this. */
/* NOTE: previous version of following call used an offset of 3 instead of 4,
* which meant that the `/../home/me` example actually became `home/me`.
* Using offset of 3 gives behavior consistent with the aforementioned
* Python routine. */
eind = start + 3;
const size_t eind_len = path_len - (eind - path);
memmove(start, eind, eind_len + 1);
path_len -= 3;
BLI_assert(strlen(path) == path_len);
BLI_assert(!is_relative || (path[0] != SEP));
}
/* First non prefix path component. */
char *path_first_non_slash_part = path;
while (*path_first_non_slash_part && *path_first_non_slash_part == SEP) {
path_first_non_slash_part++;
}
if (is_relative && path_len == 0 && (path == path_orig)) {
path[0] = '.';
path[1] = '\0';
path_len += 1;
/* Maintain a pointer to the end of the leading `..` components.
* Skip leading parent directories because logically they cannot be collapsed. */
char *start_base = path_first_non_slash_part;
while (IS_PARENT_DIR(start_base)) {
start_base += 3;
}
/* It's possible the entire path is made up of `../`,
* in this case there is nothing to do. */
if (start_base < path + path_len) {
/* Step over directories, always starting out on the character after the slash. */
char *start = start_base;
char *start_temp;
while (((start_temp = strstr(start, SEP_STR ".." SEP_STR)) ||
/* Check if the string ends with `/..` & assign when found, else NULL. */
(start_temp = ((start <= &path[path_len - 3]) &&
STREQ(&path[path_len - 3], SEP_STR "..")) ?
&path[path_len - 3] :
NULL))) {
start = start_temp + 1; /* Skip the `/`. */
BLI_assert(start_base != start);
/* Step `end_all` forwards (over all `..`). */
char *end_all = start;
do {
BLI_assert(IS_PARENT_DIR(end_all));
end_all += 3;
BLI_assert(end_all <= path + path_len + 1);
} while (IS_PARENT_DIR(end_all));
/* Step `start` backwards (until `end` meets `end_all` or `start` meets `start_base`). */
char *end = start;
do {
BLI_assert(start_base < start);
BLI_assert(*(start - 1) == SEP);
/* Step `start` backwards one. */
do {
start--;
} while (start_base < start && *(start - 1) != SEP);
BLI_assert(*start != SEP); /* Ensure the loop ran at least once. */
BLI_assert(!IS_PARENT_DIR(start)); /* Clamping by `start_base` prevents this. */
end += 3;
} while ((start != start_base) && (end < end_all));
if (end > path + path_len) {
BLI_assert(*(end - 1) == '\0');
end--;
end_all--;
}
BLI_assert(start < end && start >= start_base);
const size_t start_len = path_len - (end - path);
memmove(start, end, start_len + 1);
path_len -= end - start;
BLI_assert(strlen(path) == path_len);
/* Other `..` directories may have been moved to the front, step `start_base` past them. */
if (UNLIKELY(start == start_base && (end != end_all))) {
start_base += (end_all - end);
start = (start_base < path + path_len) ? start_base : start_base - 1;
}
}
}
BLI_assert(strlen(path) == path_len);
/* Characters before the `start_base` must *only* be `../../../` (multiples of 3). */
BLI_assert((start_base - path_first_non_slash_part) % 3 == 0);
/* All `..` ahead of `start_base` were collapsed (including trailing `/..`). */
BLI_assert(!(start_base < path + path_len) ||
(!strstr(start_base, SEP_STR ".." SEP_STR) &&
!(path_len >= 3 && STREQ(&path[path_len - 3], SEP_STR ".."))));
/*
* Final Prefix Cleanup.
* ---------------------
*/
if (is_relative) {
if (path_len == 0 && (path == path_orig)) {
path[0] = '.';
path[1] = '\0';
path_len = 1;
}
}
else {
/* Support for odd paths: e.g. `/../home/me` --> `/home/me`.
* This is a valid path in Blender, but we can't handle this the usual way below;
* simply strip this prefix, then evaluate the path as usual.
* Python's `os.path.normpath()` does this. */
if (start_base != path_first_non_slash_part) {
char *start = start_base > path + path_len ? start_base - 1 : start_base;
/* As long as `start` is set correctly, it should never begin with `../`
* as these directories are expected to be skipped. */
BLI_assert(!IS_PARENT_DIR(start));
const size_t start_len = path_len - (start - path);
memmove(path_first_non_slash_part, start, start_len + 1);
BLI_assert(strlen(start) == start_len);
path_len -= start - path_first_non_slash_part;
BLI_assert(strlen(path) == path_len);
}
}
BLI_assert(strlen(path) == path_len);
#undef IS_PARENT_DIR
}
void BLI_path_normalize_dir(char *dir, size_t dir_maxlen)

View File

@@ -84,11 +84,16 @@ TEST(path_util, Normalize_Dot)
NORMALIZE("/a/./././b/", "/a/b/");
}
/* #BLI_path_normalize: complex "/./" -> "/", "//" -> "/", "./path/../" -> "./". */
TEST(path_util, Normalize_Complex)
TEST(path_util, Normalize_ComplexAbsolute)
{
/* NOTE(review): `NORMALIZE` is defined outside this view; presumably it checks that
 * normalizing the first argument yields the second - confirm against the macro. */
/* Absolute path mixing `/./` and `..` components, expected to reduce to `/a/`. */
NORMALIZE("/a/./b/./c/./.././.././", "/a/");
/* The same components with the inner separators doubled, expecting the same result. */
NORMALIZE("/a//.//b//.//c//.//..//.//..//.//", "/a/");
}
TEST(path_util, Normalize_ComplexRelative)
{
/* Relative paths with single and consecutive runs of `..`, arranged so that every
 * directory is cancelled by a later `..`; the expected result for both is `.`. */
NORMALIZE("a/b/c/d/e/f/g/../a/../b/../../c/../../../d/../../../..", ".");
NORMALIZE("a/b/c/d/e/f/g/../a/../../../../b/../../../c/../../d/..", ".");
}
/* #BLI_path_normalize: "//" -> "/" */
TEST(path_util, Normalize_DoubleSlash)
{