correctly handle invalid utf-8

This commit is contained in:
Dustin J. Mitchell
2022-01-27 01:54:00 +00:00
parent b5201a28c3
commit 633ea5cf47
4 changed files with 104 additions and 35 deletions

View File

@@ -20,6 +20,24 @@ static void test_string_cloning(void) {
tc_string_free(s); tc_string_free(s);
} }
// creating cloned strings with invalid utf-8 does not crash
// ..but content is NULL and content_and_len returns the value
static void test_string_cloning_invalid_utf8(void) {
TCString *s = tc_string_clone("\xf0\x28\x8c\x28");
TEST_ASSERT_NOT_NULL(s);
// NOTE: this is not one of the cases where invalid UTF-8 results in NULL,
// but that may change.
size_t len;
const char *buf = tc_string_content_with_len(s, &len);
TEST_ASSERT_NOT_NULL(buf);
TEST_ASSERT_EQUAL(4, len);
TEST_ASSERT_EQUAL_MEMORY("\xf0\x28\x8c\x28", buf, len);
tc_string_free(s);
}
// borrowed strings echo back their content // borrowed strings echo back their content
static void test_string_borrowed_strings_echo(void) { static void test_string_borrowed_strings_echo(void) {
TCString *s = tc_string_borrow("abcdef"); TCString *s = tc_string_borrow("abcdef");
@@ -54,7 +72,8 @@ static void test_string_cloned_strings_echo(void) {
tc_string_free(s); tc_string_free(s);
} }
// tc_string_content returns NULL for strings containing embedded NULs // tc_clone_with_len can have NULs, and tc_string_content returns NULL for
// strings containing embedded NULs
static void test_string_content_null_for_embedded_nuls(void) { static void test_string_content_null_for_embedded_nuls(void) {
TCString *s = tc_string_clone_with_len("ab\0de", 5); TCString *s = tc_string_clone_with_len("ab\0de", 5);
TEST_ASSERT_NOT_NULL(s); TEST_ASSERT_NOT_NULL(s);
@@ -69,13 +88,31 @@ static void test_string_content_null_for_embedded_nuls(void) {
tc_string_free(s); tc_string_free(s);
} }
// tc_string_clone_with_len will accept invalid utf-8, but then tc_string_content
// returns NULL.
static void test_string_clone_with_len_invalid_utf8(void) {
TCString *s = tc_string_clone_with_len("\xf0\x28\x8c\x28", 4);
TEST_ASSERT_NOT_NULL(s);
TEST_ASSERT_NULL(tc_string_content(s));
size_t len;
const char *buf = tc_string_content_with_len(s, &len);
TEST_ASSERT_NOT_NULL(buf);
TEST_ASSERT_EQUAL(4, len);
TEST_ASSERT_EQUAL_MEMORY("\xf0\x28\x8c\x28", buf, len);
tc_string_free(s);
}
int string_tests(void) { int string_tests(void) {
UNITY_BEGIN(); UNITY_BEGIN();
// each test case above should be named here, in order. // each test case above should be named here, in order.
RUN_TEST(test_string_creation); RUN_TEST(test_string_creation);
RUN_TEST(test_string_cloning); RUN_TEST(test_string_cloning);
RUN_TEST(test_string_cloning_invalid_utf8);
RUN_TEST(test_string_borrowed_strings_echo); RUN_TEST(test_string_borrowed_strings_echo);
RUN_TEST(test_string_cloned_strings_echo); RUN_TEST(test_string_cloned_strings_echo);
RUN_TEST(test_string_content_null_for_embedded_nuls); RUN_TEST(test_string_content_null_for_embedded_nuls);
RUN_TEST(test_string_clone_with_len_invalid_utf8);
return UNITY_END(); return UNITY_END();
} }

View File

@@ -52,6 +52,12 @@ static void test_uuid_bad_utf8(void) {
TEST_ASSERT_FALSE(tc_uuid_from_str(tc_string_borrow(ustr), &u)); TEST_ASSERT_FALSE(tc_uuid_from_str(tc_string_borrow(ustr), &u));
} }
// converting a string with embedded NUL fails as expected
static void test_uuid_embedded_nul(void) {
TCUuid u;
TEST_ASSERT_FALSE(tc_uuid_from_str(tc_string_clone_with_len("ab\0de", 5), &u));
}
int uuid_tests(void) { int uuid_tests(void) {
UNITY_BEGIN(); UNITY_BEGIN();
// each test case above should be named here, in order. // each test case above should be named here, in order.
@@ -61,5 +67,6 @@ int uuid_tests(void) {
RUN_TEST(test_uuid_to_str); RUN_TEST(test_uuid_to_str);
RUN_TEST(test_uuid_invalid_string_fails); RUN_TEST(test_uuid_invalid_string_fails);
RUN_TEST(test_uuid_bad_utf8); RUN_TEST(test_uuid_bad_utf8);
RUN_TEST(test_uuid_embedded_nul);
return UNITY_END(); return UNITY_END();
} }

View File

@@ -1,18 +1,20 @@
use std::ffi::{CStr, CString, OsStr}; use std::ffi::{CStr, CString, OsStr};
use std::os::unix::ffi::OsStrExt; use std::os::unix::ffi::OsStrExt;
use std::path::PathBuf; use std::path::PathBuf;
use std::str::Utf8Error;
// TODO: is utf-8-ness always checked? (no) when? /// TCString supports passing strings into and out of the TaskChampion API. A string can contain
/// embedded NUL characters. Strings containing such embedded NULs cannot be represented as a "C
/// TCString supports passing strings into and out of the TaskChampion API. A string must contain /// string" and must be accessed using `tc_string_content_and_len` and `tc_string_clone_with_len`.
/// valid UTF-8, and can contain embedded NUL characters. Strings containing such embedded NULs /// In general, these two functions should be used for handling arbitrary data, while more
/// cannot be represented as a "C string" and must be accessed using `tc_string_content_and_len` /// convenient forms may be used where embedded NUL characters are impossible, such as in static
/// and `tc_string_clone_with_len`. In general, these two functions should be used for handling /// strings.
/// arbitrary data, while more convenient forms may be used where embedded NUL characters are ///
/// impossible, such as in static strings. /// Rust expects all strings to be UTF-8, and API functions will fail if given a TCString
/// containing invalid UTF-8.
/// ///
/// Unless specified otherwise, functions in this API take ownership of a TCString when it is given /// Unless specified otherwise, functions in this API take ownership of a TCString when it is given
/// as a function argument, and free the string before returning. Thus the following is valid: /// as a function argument, and free the string before returning.
/// ///
/// When a TCString appears as a return value or output argument, it is the responsibility of the /// When a TCString appears as a return value or output argument, it is the responsibility of the
/// caller to free the string. /// caller to free the string.
@@ -21,6 +23,10 @@ pub enum TCString<'a> {
CStr(&'a CStr), CStr(&'a CStr),
String(String), String(String),
/// This variant denotes an input string that was not valid UTF-8. This allows reporting this
/// error when the string is read, with the constructor remaining infallible.
InvalidUtf8(Utf8Error, Vec<u8>),
/// None is the default value for TCString, but this variant is never seen by C code or by Rust /// None is the default value for TCString, but this variant is never seen by C code or by Rust
/// code outside of this module. /// code outside of this module.
None, None,
@@ -52,15 +58,17 @@ impl<'a> TCString<'a> {
TCString::CString(cstring) => cstring.as_c_str().to_str(), TCString::CString(cstring) => cstring.as_c_str().to_str(),
TCString::CStr(cstr) => cstr.to_str(), TCString::CStr(cstr) => cstr.to_str(),
TCString::String(string) => Ok(string.as_ref()), TCString::String(string) => Ok(string.as_ref()),
TCString::InvalidUtf8(e, _) => Err(*e),
TCString::None => unreachable!(), TCString::None => unreachable!(),
} }
} }
pub(crate) fn as_bytes(&self) -> &[u8] { fn as_bytes(&self) -> &[u8] {
match self { match self {
TCString::CString(cstring) => cstring.as_bytes(), TCString::CString(cstring) => cstring.as_bytes(),
TCString::CStr(cstr) => cstr.to_bytes(), TCString::CStr(cstr) => cstr.to_bytes(),
TCString::String(string) => string.as_bytes(), TCString::String(string) => string.as_bytes(),
TCString::InvalidUtf8(_, data) => data.as_ref(),
TCString::None => unreachable!(), TCString::None => unreachable!(),
} }
} }
@@ -118,8 +126,7 @@ pub extern "C" fn tc_string_clone(cstr: *const libc::c_char) -> *mut TCString<'s
/// Create a new TCString containing the given string with the given length. This allows creation /// Create a new TCString containing the given string with the given length. This allows creation
/// of strings containing embedded NUL characters. As with `tc_string_clone`, the resulting /// of strings containing embedded NUL characters. As with `tc_string_clone`, the resulting
/// TCString is independent of the passed buffer, which may be reused or freed immediately. If the /// TCString is independent of the passed buffer, which may be reused or freed immediately.
/// given string is not valid UTF-8, this function will return NULL.
#[no_mangle] #[no_mangle]
pub extern "C" fn tc_string_clone_with_len( pub extern "C" fn tc_string_clone_with_len(
buf: *const libc::c_char, buf: *const libc::c_char,
@@ -127,21 +134,30 @@ pub extern "C" fn tc_string_clone_with_len(
) -> *mut TCString<'static> { ) -> *mut TCString<'static> {
let slice = unsafe { std::slice::from_raw_parts(buf as *const u8, len) }; let slice = unsafe { std::slice::from_raw_parts(buf as *const u8, len) };
let vec = slice.to_vec(); let vec = slice.to_vec();
if let Ok(string) = String::from_utf8(vec) { // try converting to a string, which is the only variant that can contain embedded NULs. If
TCString::String(string).return_val() // the bytes are not valid utf-8, store that information for reporting later.
} else { match String::from_utf8(vec) {
std::ptr::null_mut() Ok(string) => TCString::String(string),
Err(e) => {
let (e, vec) = (e.utf8_error(), e.into_bytes());
TCString::InvalidUtf8(e, vec)
}
} }
.return_val()
} }
/// Get the content of the string as a regular C string. The given string must not be NULL. The /// Get the content of the string as a regular C string. The given string must not be NULL. The
/// returned value is NULL if the string contains NUL bytes. The returned string is valid until /// returned value is NULL if the string contains NUL bytes or (in some cases) invalid UTF-8. The
/// the TCString is freed or passed to another TC API function. /// returned C string is valid until the TCString is freed or passed to another TC API function.
///
/// In general, prefer [`tc_string_content_with_len`] except when it's certain that the string is
/// valid and NUL-free.
/// ///
/// This function does _not_ take ownership of the TCString. /// This function does _not_ take ownership of the TCString.
#[no_mangle] #[no_mangle]
pub extern "C" fn tc_string_content(tcstring: *mut TCString) -> *const libc::c_char { pub extern "C" fn tc_string_content(tcstring: *mut TCString) -> *const libc::c_char {
let tcstring = TCString::from_arg_ref(tcstring); let tcstring = TCString::from_arg_ref(tcstring);
// if we have a String, we need to consume it and turn it into // if we have a String, we need to consume it and turn it into
// a CString. // a CString.
if matches!(tcstring, TCString::String(_)) { if matches!(tcstring, TCString::String(_)) {
@@ -153,7 +169,7 @@ pub extern "C" fn tc_string_content(tcstring: *mut TCString) -> *const libc::c_c
Err(nul_err) => { Err(nul_err) => {
// recover the underlying String from the NulError // recover the underlying String from the NulError
let original_bytes = nul_err.into_vec(); let original_bytes = nul_err.into_vec();
// SAFETY: original_bytes just came from a String, so must be valid utf8 // SAFETY: original_bytes came from a String moments ago, so still valid utf8
let string = unsafe { String::from_utf8_unchecked(original_bytes) }; let string = unsafe { String::from_utf8_unchecked(original_bytes) };
*tcstring = TCString::String(string); *tcstring = TCString::String(string);
@@ -170,13 +186,14 @@ pub extern "C" fn tc_string_content(tcstring: *mut TCString) -> *const libc::c_c
TCString::CString(cstring) => cstring.as_ptr(), TCString::CString(cstring) => cstring.as_ptr(),
TCString::String(_) => unreachable!(), // just converted to CString TCString::String(_) => unreachable!(), // just converted to CString
TCString::CStr(cstr) => cstr.as_ptr(), TCString::CStr(cstr) => cstr.as_ptr(),
TCString::InvalidUtf8(_, _) => std::ptr::null(),
TCString::None => unreachable!(), TCString::None => unreachable!(),
} }
} }
/// Get the content of the string as a pointer and length. The given string must not be NULL. /// Get the content of the string as a pointer and length. The given string must not be NULL.
/// This function can return any string, even one including NUL bytes. The returned string is /// This function can return any string, even one including NUL bytes or invalid UTF-8. The
/// valid until the TCString is freed or passed to another TC API function. /// returned buffer is valid until the TCString is freed or passed to another TC API function.
/// ///
/// This function does _not_ take ownership of the TCString. /// This function does _not_ take ownership of the TCString.
#[no_mangle] #[no_mangle]
@@ -184,11 +201,14 @@ pub extern "C" fn tc_string_content_with_len(
tcstring: *mut TCString, tcstring: *mut TCString,
len_out: *mut usize, len_out: *mut usize,
) -> *const libc::c_char { ) -> *const libc::c_char {
debug_assert!(!tcstring.is_null());
debug_assert!(!len_out.is_null());
let tcstring = TCString::from_arg_ref(tcstring); let tcstring = TCString::from_arg_ref(tcstring);
let bytes = match tcstring { let bytes = match tcstring {
TCString::CString(cstring) => cstring.as_bytes(), TCString::CString(cstring) => cstring.as_bytes(),
TCString::String(string) => string.as_bytes(), TCString::String(string) => string.as_bytes(),
TCString::CStr(cstr) => cstr.to_bytes(), TCString::CStr(cstr) => cstr.to_bytes(),
TCString::InvalidUtf8(_, ref v) => v.as_ref(),
TCString::None => unreachable!(), TCString::None => unreachable!(),
}; };
unsafe { *len_out = bytes.len() }; unsafe { *len_out = bytes.len() };

View File

@@ -34,15 +34,18 @@ typedef enum TCStatus {
typedef struct TCReplica TCReplica; typedef struct TCReplica TCReplica;
/** /**
* TCString supports passing strings into and out of the TaskChampion API. A string must contain * TCString supports passing strings into and out of the TaskChampion API. A string can contain
* valid UTF-8, and can contain embedded NUL characters. Strings containing such embedded NULs * embedded NUL characters. Strings containing such embedded NULs cannot be represented as a "C
* cannot be represented as a "C string" and must be accessed using `tc_string_content_and_len` * string" and must be accessed using `tc_string_content_and_len` and `tc_string_clone_with_len`.
* and `tc_string_clone_with_len`. In general, these two functions should be used for handling * In general, these two functions should be used for handling arbitrary data, while more
* arbitrary data, while more convenient forms may be used where embedded NUL characters are * convenient forms may be used where embedded NUL characters are impossible, such as in static
* impossible, such as in static strings. * strings.
*
* Rust expects all strings to be UTF-8, and API functions will fail if given a TCString
* containing invalid UTF-8.
* *
* Unless specified otherwise, functions in this API take ownership of a TCString when it is given * Unless specified otherwise, functions in this API take ownership of a TCString when it is given
* as a function argument, and free the string before returning. Thus the following is valid: * as a function argument, and free the string before returning.
* *
* When a TCString appears as a return value or output argument, it is the responsibility of the * When a TCString appears as a return value or output argument, it is the responsibility of the
* caller to free the string. * caller to free the string.
@@ -154,15 +157,17 @@ struct TCString *tc_string_clone(const char *cstr);
/** /**
* Create a new TCString containing the given string with the given length. This allows creation * Create a new TCString containing the given string with the given length. This allows creation
* of strings containing embedded NUL characters. As with `tc_string_clone`, the resulting * of strings containing embedded NUL characters. As with `tc_string_clone`, the resulting
* TCString is independent of the passed buffer, which may be reused or freed immediately. If the * TCString is independent of the passed buffer, which may be reused or freed immediately.
* given string is not valid UTF-8, this function will return NULL.
*/ */
struct TCString *tc_string_clone_with_len(const char *buf, size_t len); struct TCString *tc_string_clone_with_len(const char *buf, size_t len);
/** /**
* Get the content of the string as a regular C string. The given string must not be NULL. The * Get the content of the string as a regular C string. The given string must not be NULL. The
* returned value is NULL if the string contains NUL bytes. The returned string is valid until * returned value is NULL if the string contains NUL bytes or (in some cases) invalid UTF-8. The
* the TCString is freed or passed to another TC API function. * returned C string is valid until the TCString is freed or passed to another TC API function.
*
* In general, prefer [`tc_string_content_with_len`] except when it's certain that the string is
* valid and NUL-free.
* *
* This function does _not_ take ownership of the TCString. * This function does _not_ take ownership of the TCString.
*/ */
@@ -170,8 +175,8 @@ const char *tc_string_content(struct TCString *tcstring);
/** /**
* Get the content of the string as a pointer and length. The given string must not be NULL. * Get the content of the string as a pointer and length. The given string must not be NULL.
* This function can return any string, even one including NUL bytes. The returned string is * This function can return any string, even one including NUL bytes or invalid UTF-8. The
* valid until the TCString is freed or passed to another TC API function. * returned buffer is valid until the TCString is freed or passed to another TC API function.
* *
* This function does _not_ take ownership of the TCString. * This function does _not_ take ownership of the TCString.
*/ */