[Asterisk-code-review] res_pjsip: Mask invalid UTF-8 sequences from callerid name (asterisk[18])

Fri Feb 17 10:59:00 CST 2023

George Joseph has uploaded this change for review. ( https://gerrit.asterisk.org/c/asterisk/+/19898 )


Change subject: res_pjsip: Mask invalid UTF-8 sequences from callerid name
......................................................................

res_pjsip: Mask invalid UTF-8 sequences from callerid name

* Added a new function ast_utf8_mask_invalid_chars() to
  json.c that copies a string replacing any invalid UTF-8
  sequences with a specified mask character.  For example:
  "abc\xffdef" becomes "abc?def".

* Updated res_pjsip:set_id_from_hdr() to use
  ast_utf8_mask_invalid_chars and print a warning if any
  invalid sequences were found during the copy.

* Updated stasis_channels:ast_channel_publish_varset to use
  ast_utf8_mask_invalid_chars and print a warning if any
  invalid sequences were found during the copy.

ASTERISK-27830

Change-Id: I4ffbdb19c80bf0efc675d40078a3ca4f85c567d8
---
M include/asterisk/json.h
M include/asterisk/utf8.h
M main/json.c
M main/stasis_channels.c
M res/res_pjsip.c
M tests/test_json.c
6 files changed, 162 insertions(+), 7 deletions(-)



  git pull ssh://gerrit.asterisk.org:29418/asterisk refs/changes/98/19898/1

diff --git a/include/asterisk/json.h b/include/asterisk/json.h
index 5edc3a9..5f41e69 100644
--- a/include/asterisk/json.h
+++ b/include/asterisk/json.h
@@ -20,6 +20,7 @@
 #define _ASTERISK_JSON_H
 
 #include "asterisk/netsock2.h"
+#include "utf8.h"
 
 /*! \file
  *
@@ -193,6 +194,26 @@
 /*!@{*/
 
 /*!
+ * \brief Copy a string safely masking any invalid UTF-8 sequences
+ *
+ * This is similar to \ref ast_copy_string, but it will only copy valid UTF-8
+ * sequences from the source string into the destination buffer. Unlike
+ * \ref ast_utf8_copy_string however, if an invalid sequence is encountered,
+ * it's masked with the supplied character and copying continues.
+ *
+ * \param dst       The destination buffer.
+ * \param dst_size  The size of the dst buffer including space for the NULL terminator
+ * \param src       The source string
+ * \param src_len   The number of characters to copy
+ * \param mask      The charcter to use to mask the invalid ones
+ *
+ * \return The \ref ast_utf8_validation_result indicating whether there
+ *         were any invalid characters in the string.
+ */
+enum ast_utf8_validation_result ast_utf8_mask_invalid_chars(char *dst,
+	size_t dst_size, const char *str, size_t src_len, const char mask);
+
+/*!
  * \brief Check the string of the given length for UTF-8 format.
  * \since 13.12.0
  *
diff --git a/include/asterisk/utf8.h b/include/asterisk/utf8.h
index 02ec800..7638637 100644
--- a/include/asterisk/utf8.h
+++ b/include/asterisk/utf8.h
@@ -93,6 +93,14 @@
 	 * to feed into the validator the UTF-8 sequence is invalid.
 	 */
 	AST_UTF8_UNKNOWN,
+
+	/*! \brief Outright failure
+	 *
+	 * Some condition prevented the validator or copy function
+	 * from operating all.  For instance, it was passed a NULL
+	 * pointer or the output buffer was too small.
+	 */
+	AST_UTF8_FAIL,
 };
 
 /*!
diff --git a/main/json.c b/main/json.c
index 616b12e..66676a2 100644
--- a/main/json.c
+++ b/main/json.c
@@ -230,6 +230,46 @@
 	return str ? ast_json_utf8_check_len(str, strlen(str)) : 0;
 }
 
+enum ast_utf8_validation_result ast_utf8_mask_invalid_chars(char *dst,
+	size_t dst_size, const char *src, size_t src_count, const char mask)
+{
+	size_t pos;
+	size_t count;
+	enum ast_utf8_validation_result result = AST_UTF8_VALID;
+
+	if (!src || ! dst || dst_size < src_count + 1) {
+		return AST_UTF8_FAIL;
+	}
+
+	for (pos = 0; pos < src_count; ) {
+		count = json_utf8_check_first(src[pos]);
+		ast_debug(9, "first pos: %2ld count: %2ld char: '%c' 0x%02x", pos, count, src[pos], src[pos] & 0xFF);
+		if (count == 0) {
+			dst[pos] = mask;
+			ast_debug(9, " not good  dst: '%c' 0x%02x\n", dst[pos], dst[pos]);
+			pos++;
+			result = AST_UTF8_INVALID;
+		} else if (count > 1) {
+			if (!json_utf8_check_full(&src[pos], count)) {
+				dst[pos] = mask;
+				ast_debug(9, " fail full dst: '%c' 0x%02x\n", dst[pos], dst[pos]);
+				pos++;
+				result = AST_UTF8_INVALID;
+			} else {
+				strncpy(&dst[pos], &src[pos], count);
+				ast_debug(9, " good seq  dst: '%.*s'\n", (int)count, &dst[pos]);
+				pos+=count;
+			}
+		} else {
+			dst[pos] = src[pos];
+			ast_debug(9, " good char dst: '%c' 0x%02x\n", dst[pos], dst[pos]);
+			pos++;
+		}
+	}
+	dst[pos] = '\0';
+	return result;
+}
+
 struct ast_json *ast_json_true(void)
 {
 	return (struct ast_json *)json_true();
@@ -637,7 +677,7 @@
 		"exten", exten,
 		"priority", priority != -1 ? ast_json_integer_create(priority) : ast_json_null(),
 		"app_name", app_name,
-		"app_data", app_data
+		"app_data", AST_JSON_UTF8_VALIDATE(app_data)
 		);
 }
 
diff --git a/main/stasis_channels.c b/main/stasis_channels.c
index d373f6a..696e68f 100644
--- a/main/stasis_channels.c
+++ b/main/stasis_channels.c
@@ -1154,13 +1154,23 @@
 void ast_channel_publish_varset(struct ast_channel *chan, const char *name, const char *value)
 {
 	struct ast_json *blob;
+	enum ast_utf8_validation_result result;
+	char *new_value;
 
 	ast_assert(name != NULL);
 	ast_assert(value != NULL);
 
+	new_value = ast_strdupa(value);
+	result = ast_utf8_mask_invalid_chars(new_value, strlen(new_value) + 1,
+		value, strlen(value), '?');
+
+	if (result != AST_UTF8_VALID) {
+		ast_log(LOG_WARNING, "%s: Variable '%s' has invalid UTF-8 value '%s'. "
+			" Replacing with '%s'", ast_channel_name(chan), name, value, new_value);
+	}
 	blob = ast_json_pack("{s: s, s: s}",
 			     "variable", name,
-			     "value", value);
+			     "value", new_value);
 	if (!blob) {
 		ast_log(LOG_ERROR, "Error creating message\n");
 		return;
diff --git a/res/res_pjsip.c b/res/res_pjsip.c
index 8273847..7374a4e 100644
--- a/res/res_pjsip.c
+++ b/res/res_pjsip.c
@@ -47,6 +47,7 @@
 #include "asterisk/test.h"
 #include "asterisk/res_pjsip_presence_xml.h"
 #include "asterisk/res_pjproject.h"
+#include "asterisk/json.h"
 
 /*** MODULEINFO
 	<depend>pjproject</depend>
@@ -2463,10 +2464,9 @@
 	char cid_num[AST_CHANNEL_NAME];
 	pjsip_name_addr *id_name_addr = (pjsip_name_addr *) hdr->uri;
 	char *semi;
+	enum ast_utf8_validation_result result;
 
-	ast_copy_pj_str(cid_name, &id_name_addr->display, sizeof(cid_name));
 	ast_copy_pj_str(cid_num, ast_sip_pjsip_uri_get_username(hdr->uri), sizeof(cid_num));
-
 	/* Always truncate caller-id number at a semicolon. */
 	semi = strchr(cid_num, ';');
 	if (semi) {
@@ -2484,6 +2484,16 @@
 		*semi = '\0';
 	}
 
+	result = ast_utf8_mask_invalid_chars(cid_name, sizeof(cid_name),
+		id_name_addr->display.ptr, id_name_addr->display.slen, '?');
+
+	if (result != AST_UTF8_VALID) {
+		ast_log(LOG_WARNING, "CallerID Name '" PJSTR_PRINTF_SPEC "' for number '%s' has invalid UTF-8 characters. "
+			" Replaced with '%s'",
+			PJSTR_PRINTF_VAR(id_name_addr->display), cid_num,
+			cid_name);
+	}
+
 	ast_free(id->name.str);
 	id->name.str = ast_strdup(cid_name);
 	if (!ast_strlen_zero(cid_name)) {
diff --git a/tests/test_json.c b/tests/test_json.c
index e1fc0ba..297bc7c 100644
--- a/tests/test_json.c
+++ b/tests/test_json.c
@@ -1718,14 +1718,14 @@
 		break;
 	}
 
-	expected = ast_json_pack("{s: o, s: o, s: o, s: o, s: o}",
+	expected = ast_json_pack("{s: o, s: o, s: o, s: o, s: s}",
 				 "context", ast_json_null(),
 				 "exten", ast_json_null(),
 				 "priority", ast_json_null(),
 				 "app_name", ast_json_null(),
-				 "app_data", ast_json_null()
+				 "app_data", ""
 				 );
-	uut = ast_json_dialplan_cep_app(NULL, NULL, -1, NULL, NULL);
+	uut = ast_json_dialplan_cep_app(NULL, NULL, -1, NULL, "");
 	ast_test_validate(test, ast_json_equal(expected, uut));
 
 	ast_json_unref(expected);
@@ -1743,6 +1743,46 @@
 	return AST_TEST_PASS;
 }
 
+static int test_copy_and_mask(const char *src, const char *cmp)
+{
+	char *dst = ast_strdupa(src);
+	ast_utf8_mask_invalid_chars(dst, strlen(dst) + 1,
+		src, strlen(src), '?');
+	if (strcmp(dst, cmp) != 0) {
+		ast_log(LOG_ERROR, "Invalid result. In: '%s', Out: '%s', Expected: '%s'\n",
+			src, dst, cmp);
+		return 0;
+	}
+	return 1;
+}
+
+AST_TEST_DEFINE(test_utf8_mask_invalid_chars)
+{
+	switch (cmd) {
+	case TEST_INIT:
+		info->name = "mask_string";
+		info->category = CATEGORY;
+		info->summary = "Test ast_utf8_mask_invalid_chars";
+		info->description =
+			"Tests UTF-8 string copying/masking code.";
+		return AST_TEST_NOT_RUN;
+	case TEST_EXECUTE:
+		break;
+	}
+
+	ast_test_validate(test, test_copy_and_mask("Asterisk", "Asterisk"));
+	ast_test_validate(test, test_copy_and_mask("Asterisk \xc2", "Asterisk ?"));
+	ast_test_validate(test, test_copy_and_mask("Asterisk \xc2\xae", "Asterisk \xc2\xae"));
+	ast_test_validate(test, test_copy_and_mask("Asterisk \xc0\x8a",  "Asterisk ??"));
+	ast_test_validate(test, test_copy_and_mask("\xce\xbb xyz", "\xce\xbb xyz"));
+	ast_test_validate(test, test_copy_and_mask("\xe0\xc2\xb0xyz", "?\xc2\xb0xyz"));
+	ast_test_validate(test, test_copy_and_mask("\xe0\xc2\xf4\xb0xyz", "????xyz"));
+	ast_test_validate(test, test_copy_and_mask("\xe0\xc2\xb0xyz\xc2", "?\xc2\xb0xyz?"));
+
+	return AST_TEST_PASS;
+}
+
+
 static int unload_module(void)
 {
 	AST_TEST_UNREGISTER(json_test_false);
@@ -1797,6 +1837,7 @@
 	AST_TEST_UNREGISTER(json_test_name_number);
 	AST_TEST_UNREGISTER(json_test_timeval);
 	AST_TEST_UNREGISTER(json_test_cep);
+	AST_TEST_UNREGISTER(test_utf8_mask_invalid_chars);
 	return 0;
 }
 
@@ -1854,6 +1895,7 @@
 	AST_TEST_REGISTER(json_test_name_number);
 	AST_TEST_REGISTER(json_test_timeval);
 	AST_TEST_REGISTER(json_test_cep);
+	AST_TEST_REGISTER(test_utf8_mask_invalid_chars);
 
 	ast_test_register_init(CATEGORY, json_test_init);
 	ast_test_register_cleanup(CATEGORY, json_test_cleanup);

-- 
To view, visit https://gerrit.asterisk.org/c/asterisk/+/19898
To unsubscribe, or for help writing mail filters, visit https://gerrit.asterisk.org/settings

Gerrit-Project: asterisk
Gerrit-Branch: 18
Gerrit-Change-Id: I4ffbdb19c80bf0efc675d40078a3ca4f85c567d8
Gerrit-Change-Number: 19898
Gerrit-PatchSet: 1
Gerrit-Owner: George Joseph <gjoseph at digium.com>
Gerrit-MessageType: newchange
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.digium.com/pipermail/asterisk-code-review/attachments/20230217/c09c8c26/attachment-0001.html>