From 633ea5cf470e93622c5f9b1edb30c690944d42e7 Mon Sep 17 00:00:00 2001 From: "Dustin J. Mitchell" Date: Thu, 27 Jan 2022 01:54:00 +0000 Subject: [PATCH] correctly handle invalid utf-8 --- integration-tests/src/bindings_tests/string.c | 39 +++++++++++- integration-tests/src/bindings_tests/uuid.c | 7 +++ lib/src/string.rs | 62 ++++++++++++------- lib/taskchampion.h | 31 ++++++---- 4 files changed, 104 insertions(+), 35 deletions(-) diff --git a/integration-tests/src/bindings_tests/string.c b/integration-tests/src/bindings_tests/string.c index 9926d79bd..0eebec8f7 100644 --- a/integration-tests/src/bindings_tests/string.c +++ b/integration-tests/src/bindings_tests/string.c @@ -20,6 +20,24 @@ static void test_string_cloning(void) { tc_string_free(s); } +// creating cloned strings with invalid utf-8 does not crash +// ..but content is NULL and content_and_len returns the value +static void test_string_cloning_invalid_utf8(void) { + TCString *s = tc_string_clone("\xf0\x28\x8c\x28"); + TEST_ASSERT_NOT_NULL(s); + + // NOTE: this is not one of the cases where invalid UTF-8 results in NULL, + // but that may change. + + size_t len; + const char *buf = tc_string_content_with_len(s, &len); + TEST_ASSERT_NOT_NULL(buf); + TEST_ASSERT_EQUAL(4, len); + TEST_ASSERT_EQUAL_MEMORY("\xf0\x28\x8c\x28", buf, len); + + tc_string_free(s); +} + // borrowed strings echo back their content static void test_string_borrowed_strings_echo(void) { TCString *s = tc_string_borrow("abcdef"); @@ -54,7 +72,8 @@ static void test_string_cloned_strings_echo(void) { tc_string_free(s); } -// tc_string_content returns NULL for strings containing embedded NULs +// tc_clone_with_len can have NULs, and tc_string_content returns NULL for +// strings containing embedded NULs static void test_string_content_null_for_embedded_nuls(void) { TCString *s = tc_string_clone_with_len("ab\0de", 5); TEST_ASSERT_NOT_NULL(s); @@ -69,13 +88,31 @@ static void test_string_content_null_for_embedded_nuls(void) { tc_string_free(s); } +// tc_string_clone_with_len will accept invalid utf-8, but then tc_string_content +// returns NULL. +static void test_string_clone_with_len_invalid_utf8(void) { + TCString *s = tc_string_clone_with_len("\xf0\x28\x8c\x28", 4); + TEST_ASSERT_NOT_NULL(s); + + TEST_ASSERT_NULL(tc_string_content(s)); + + size_t len; + const char *buf = tc_string_content_with_len(s, &len); + TEST_ASSERT_NOT_NULL(buf); + TEST_ASSERT_EQUAL(4, len); + TEST_ASSERT_EQUAL_MEMORY("\xf0\x28\x8c\x28", buf, len); + tc_string_free(s); +} + int string_tests(void) { UNITY_BEGIN(); // each test case above should be named here, in order. RUN_TEST(test_string_creation); RUN_TEST(test_string_cloning); + RUN_TEST(test_string_cloning_invalid_utf8); RUN_TEST(test_string_borrowed_strings_echo); RUN_TEST(test_string_cloned_strings_echo); RUN_TEST(test_string_content_null_for_embedded_nuls); + RUN_TEST(test_string_clone_with_len_invalid_utf8); return UNITY_END(); } diff --git a/integration-tests/src/bindings_tests/uuid.c b/integration-tests/src/bindings_tests/uuid.c index 2e8d0c806..572a85322 100644 --- a/integration-tests/src/bindings_tests/uuid.c +++ b/integration-tests/src/bindings_tests/uuid.c @@ -52,6 +52,12 @@ static void test_uuid_bad_utf8(void) { TEST_ASSERT_FALSE(tc_uuid_from_str(tc_string_borrow(ustr), &u)); } +// converting a string with embedded NUL fails as expected +static void test_uuid_embedded_nul(void) { + TCUuid u; + TEST_ASSERT_FALSE(tc_uuid_from_str(tc_string_clone_with_len("ab\0de", 5), &u)); +} + int uuid_tests(void) { UNITY_BEGIN(); // each test case above should be named here, in order. @@ -61,5 +67,6 @@ int uuid_tests(void) { RUN_TEST(test_uuid_to_str); RUN_TEST(test_uuid_invalid_string_fails); RUN_TEST(test_uuid_bad_utf8); + RUN_TEST(test_uuid_embedded_nul); return UNITY_END(); } diff --git a/lib/src/string.rs b/lib/src/string.rs index 1e3968004..1614a0c02 100644 --- a/lib/src/string.rs +++ b/lib/src/string.rs @@ -1,18 +1,20 @@ use std::ffi::{CStr, CString, OsStr}; use std::os::unix::ffi::OsStrExt; use std::path::PathBuf; +use std::str::Utf8Error; -// TODO: is utf-8-ness always checked? (no) when? - -/// TCString supports passing strings into and out of the TaskChampion API. A string must contain -/// valid UTF-8, and can contain embedded NUL characters. Strings containing such embedded NULs -/// cannot be represented as a "C string" and must be accessed using `tc_string_content_and_len` -/// and `tc_string_clone_with_len`. In general, these two functions should be used for handling -/// arbitrary data, while more convenient forms may be used where embedded NUL characters are -/// impossible, such as in static strings. +/// TCString supports passing strings into and out of the TaskChampion API. A string can contain +/// embedded NUL characters. Strings containing such embedded NULs cannot be represented as a "C +/// string" and must be accessed using `tc_string_content_and_len` and `tc_string_clone_with_len`. +/// In general, these two functions should be used for handling arbitrary data, while more +/// convenient forms may be used where embedded NUL characters are impossible, such as in static +/// strings. +/// +/// Rust expects all strings to be UTF-8, and API functions will fail if given a TCString +/// containing invalid UTF-8. /// /// Unless specified otherwise, functions in this API take ownership of a TCString when it is given -/// as a function argument, and free the string before returning. Thus the following is valid: +/// as a function argument, and free the string before returning. /// /// When a TCString appears as a return value or output argument, it is the responsibility of the /// caller to free the string. @@ -21,6 +23,10 @@ pub enum TCString<'a> { CStr(&'a CStr), String(String), + /// This variant denotes an input string that was not valid UTF-8. This allows reporting this + /// error when the string is read, with the constructor remaining infallible. + InvalidUtf8(Utf8Error, Vec), + /// None is the default value for TCString, but this variant is never seen by C code or by Rust /// code outside of this module. None, @@ -52,15 +58,17 @@ impl<'a> TCString<'a> { TCString::CString(cstring) => cstring.as_c_str().to_str(), TCString::CStr(cstr) => cstr.to_str(), TCString::String(string) => Ok(string.as_ref()), + TCString::InvalidUtf8(e, _) => Err(*e), TCString::None => unreachable!(), } } - pub(crate) fn as_bytes(&self) -> &[u8] { + fn as_bytes(&self) -> &[u8] { match self { TCString::CString(cstring) => cstring.as_bytes(), TCString::CStr(cstr) => cstr.to_bytes(), TCString::String(string) => string.as_bytes(), + TCString::InvalidUtf8(_, data) => data.as_ref(), TCString::None => unreachable!(), } } @@ -118,8 +126,7 @@ pub extern "C" fn tc_string_clone(cstr: *const libc::c_char) -> *mut TCString<'s /// Create a new TCString containing the given string with the given length. This allows creation /// of strings containing embedded NUL characters. As with `tc_string_clone`, the resulting -/// TCString is independent of the passed buffer, which may be reused or freed immediately. If the -/// given string is not valid UTF-8, this function will return NULL. +/// TCString is independent of the passed buffer, which may be reused or freed immediately. #[no_mangle] pub extern "C" fn tc_string_clone_with_len( buf: *const libc::c_char, @@ -127,21 +134,30 @@ pub extern "C" fn tc_string_clone_with_len( ) -> *mut TCString<'static> { let slice = unsafe { std::slice::from_raw_parts(buf as *const u8, len) }; let vec = slice.to_vec(); - if let Ok(string) = String::from_utf8(vec) { - TCString::String(string).return_val() - } else { - std::ptr::null_mut() + // try converting to a string, which is the only variant that can contain embedded NULs. If + // the bytes are not valid utf-8, store that information for reporting later. + match String::from_utf8(vec) { + Ok(string) => TCString::String(string), + Err(e) => { + let (e, vec) = (e.utf8_error(), e.into_bytes()); + TCString::InvalidUtf8(e, vec) + } } + .return_val() } /// Get the content of the string as a regular C string. The given string must not be NULL. The -/// returned value is NULL if the string contains NUL bytes. The returned string is valid until -/// the TCString is freed or passed to another TC API function. +/// returned value is NULL if the string contains NUL bytes or (in some cases) invalid UTF-8. The +/// returned C string is valid until the TCString is freed or passed to another TC API function. +/// +/// In general, prefer [`tc_string_content_with_len`] except when it's certain that the string is +/// valid and NUL-free. /// /// This function does _not_ take ownership of the TCString. #[no_mangle] pub extern "C" fn tc_string_content(tcstring: *mut TCString) -> *const libc::c_char { let tcstring = TCString::from_arg_ref(tcstring); + // if we have a String, we need to consume it and turn it into // a CString. if matches!(tcstring, TCString::String(_)) { @@ -153,7 +169,7 @@ pub extern "C" fn tc_string_content(tcstring: *mut TCString) -> *const libc::c_c Err(nul_err) => { // recover the underlying String from the NulError let original_bytes = nul_err.into_vec(); - // SAFETY: original_bytes just came from a String, so must be valid utf8 + // SAFETY: original_bytes came from a String moments ago, so still valid utf8 let string = unsafe { String::from_utf8_unchecked(original_bytes) }; *tcstring = TCString::String(string); @@ -170,13 +186,14 @@ pub extern "C" fn tc_string_content(tcstring: *mut TCString) -> *const libc::c_c TCString::CString(cstring) => cstring.as_ptr(), TCString::String(_) => unreachable!(), // just converted to CString TCString::CStr(cstr) => cstr.as_ptr(), + TCString::InvalidUtf8(_, _) => std::ptr::null(), TCString::None => unreachable!(), } } /// Get the content of the string as a pointer and length. The given string must not be NULL. -/// This function can return any string, even one including NUL bytes. The returned string is -/// valid until the TCString is freed or passed to another TC API function. +/// This function can return any string, even one including NUL bytes or invalid UTF-8. The +/// returned buffer is valid until the TCString is freed or passed to another TC API function. /// /// This function does _not_ take ownership of the TCString. #[no_mangle] @@ -184,11 +201,14 @@ pub extern "C" fn tc_string_content_with_len( tcstring: *mut TCString, len_out: *mut usize, ) -> *const libc::c_char { + debug_assert!(!tcstring.is_null()); + debug_assert!(!len_out.is_null()); let tcstring = TCString::from_arg_ref(tcstring); let bytes = match tcstring { TCString::CString(cstring) => cstring.as_bytes(), TCString::String(string) => string.as_bytes(), TCString::CStr(cstr) => cstr.to_bytes(), + TCString::InvalidUtf8(_, ref v) => v.as_ref(), TCString::None => unreachable!(), }; unsafe { *len_out = bytes.len() }; diff --git a/lib/taskchampion.h b/lib/taskchampion.h index 466e98994..bf746289c 100644 --- a/lib/taskchampion.h +++ b/lib/taskchampion.h @@ -34,15 +34,18 @@ typedef enum TCStatus { typedef struct TCReplica TCReplica; /** - * TCString supports passing strings into and out of the TaskChampion API. A string must contain - * valid UTF-8, and can contain embedded NUL characters. Strings containing such embedded NULs - * cannot be represented as a "C string" and must be accessed using `tc_string_content_and_len` - * and `tc_string_clone_with_len`. In general, these two functions should be used for handling - * arbitrary data, while more convenient forms may be used where embedded NUL characters are - * impossible, such as in static strings. + * TCString supports passing strings into and out of the TaskChampion API. A string can contain + * embedded NUL characters. Strings containing such embedded NULs cannot be represented as a "C + * string" and must be accessed using `tc_string_content_and_len` and `tc_string_clone_with_len`. + * In general, these two functions should be used for handling arbitrary data, while more + * convenient forms may be used where embedded NUL characters are impossible, such as in static + * strings. + * + * Rust expects all strings to be UTF-8, and API functions will fail if given a TCString + * containing invalid UTF-8. * * Unless specified otherwise, functions in this API take ownership of a TCString when it is given - * as a function argument, and free the string before returning. Thus the following is valid: + * as a function argument, and free the string before returning. * * When a TCString appears as a return value or output argument, it is the responsibility of the * caller to free the string. @@ -154,15 +157,17 @@ struct TCString *tc_string_clone(const char *cstr); /** * Create a new TCString containing the given string with the given length. This allows creation * of strings containing embedded NUL characters. As with `tc_string_clone`, the resulting - * TCString is independent of the passed buffer, which may be reused or freed immediately. If the - * given string is not valid UTF-8, this function will return NULL. + * TCString is independent of the passed buffer, which may be reused or freed immediately. */ struct TCString *tc_string_clone_with_len(const char *buf, size_t len); /** * Get the content of the string as a regular C string. The given string must not be NULL. The - * returned value is NULL if the string contains NUL bytes. The returned string is valid until - * the TCString is freed or passed to another TC API function. + * returned value is NULL if the string contains NUL bytes or (in some cases) invalid UTF-8. The + * returned C string is valid until the TCString is freed or passed to another TC API function. + * + * In general, prefer [`tc_string_content_with_len`] except when it's certain that the string is + * valid and NUL-free. * * This function does _not_ take ownership of the TCString. */ @@ -170,8 +175,8 @@ const char *tc_string_content(struct TCString *tcstring); /** * Get the content of the string as a pointer and length. The given string must not be NULL. - * This function can return any string, even one including NUL bytes. The returned string is - * valid until the TCString is freed or passed to another TC API function. + * This function can return any string, even one including NUL bytes or invalid UTF-8. The + * returned buffer is valid until the TCString is freed or passed to another TC API function. * * This function does _not_ take ownership of the TCString. */