[asterisk-commits] murf: branch murf/utf8-whatif r89703 - in /team/murf/utf8-whatif: include/ast...

Tue Nov 27 13:45:41 CST 2007

Author: murf
Date: Tue Nov 27 13:45:40 2007
New Revision: 89703

URL: http://svn.digium.com/view/asterisk?view=rev&rev=89703
Log:
Eh, haven't tested any of this. Wrote it Saturday, I think

Added:
    team/murf/utf8-whatif/include/asterisk/unicode.h   (with props)
    team/murf/utf8-whatif/main/unicode.c   (with props)
Modified:
    team/murf/utf8-whatif/main/Makefile

Added: team/murf/utf8-whatif/include/asterisk/unicode.h
URL: http://svn.digium.com/view/asterisk/team/murf/utf8-whatif/include/asterisk/unicode.h?view=auto&rev=89703
==============================================================================

--- team/murf/utf8-whatif/include/asterisk/unicode.h (added)
+++ team/murf/utf8-whatif/include/asterisk/unicode.h Tue Nov 27 13:45:40 2007
@@ -1,0 +1,51 @@
+/*
+ * Asterisk -- An open source telephony toolkit.
+ *
+ * Copyright (C) 2007, Digium, Inc.
+ *
+ * Steve Murphy <murf at digium.com>
+ *
+ * See http://www.asterisk.org for more information about
+ * the Asterisk project. Please do not directly contact
+ * any of the maintainers of this project for assistance;
+ * the project provides a web site, mailing lists and IRC
+ * channels for your use.
+ *
+ * This program is free software, distributed under the terms of
+ * the GNU General Public License Version 2. See the LICENSE file
+ * at the top of the source tree.
+ */
+
+#ifndef ASTERISK_UNICODE_H
+#define ASTERISK_UNICODE_H
+
+#include <sys/types.h>
+#include <stdint.h>
+
+/* One Unicode code point, stored as a 32-bit unsigned value (UCS-4).
+ * uint32_t is used rather than the BSD-only u_int32_t so the header
+ * also compiles in strict ISO C mode.
+ */
+typedef uint32_t  ucs4_t;
+
+
+/* Convert a single (possibly multi-byte) UTF-8 character to UCS-4.
+ *
+ * utf8   points at the first byte of the character to decode
+ * next   if non-NULL, receives a pointer to the byte following the
+ *        decoded character
+ * error  if non-NULL, is set to 0 on success and to 1 if a problem is
+ *        found in the UTF-8 encoding
+ *
+ * Returns the decoded character. On error the first byte is returned
+ * as-is and next is advanced by a single byte.
+ */
+
+ucs4_t ast_utf8_to_ucs4(unsigned char *utf8, unsigned char **next, int *error);
+
+/* Convert a zero-terminated string of UCS-4 chars into a zero-terminated
+ * string of UTF-8 chars.
+ *
+ * ucs     the zero-terminated input string
+ * out     the output buffer
+ * outlen  size of out in bytes, including room for the terminating 0
+ * next    receives the position in ucs of the first character that did
+ *         not fit when out runs out of room
+ *
+ * Returns the start of the output buffer.
+ */
+
+unsigned char *ast_ucs4_to_utf8(ucs4_t *ucs, unsigned char *out, int outlen, ucs4_t **next);
+
+
+/* Return a pointer to the first place in ustr where theChar can be found,
+ * or 0 if nothing is found (or if ustr itself is 0).
+ */
+
+ucs4_t *ucs4_strchr(ucs4_t *ustr, ucs4_t theChar);
+
+
+/* Convert a zero-terminated string of 8859-1 chars into a zero-terminated
+ * string of UCS-4 chars -- mainly just by turning it from 8 bits to
+ * 32 bits/char.
+ *
+ * in      the zero-terminated input string
+ * ucs     the output buffer
+ * outlen  size of ucs in ucs4_t elements, including the terminating 0
+ * next    if non-NULL, receives the position in the input where
+ *         conversion stopped
+ *
+ * Returns the start of the output buffer.
+ */
+
+ucs4_t *ast_8859_1_to_ucs4(unsigned char *in, ucs4_t *ucs, int outlen, unsigned char **next);
+
+
+/* Convert a zero-terminated string of UCS-4 chars into a zero-terminated
+ * string of 8859-1 chars -- mainly just by turning it from 32 bits to
+ * 8 bits/char. Code points above 0xFF have no 8859-1 representation.
+ *
+ * ucs     the zero-terminated input string
+ * out     the output buffer
+ * outlen  size of out in bytes, including room for the terminating 0
+ * next    if non-NULL, receives the position in the input where
+ *         conversion stopped
+ *
+ * Returns the start of the output buffer.
+ */
+
+unsigned char *ast_ucs4_to_8859_1(ucs4_t *ucs, unsigned char *out, int outlen, ucs4_t **next);
+
+#endif /* ASTERISK_UNICODE_H */
+
+

Propchange: team/murf/utf8-whatif/include/asterisk/unicode.h
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: team/murf/utf8-whatif/include/asterisk/unicode.h
------------------------------------------------------------------------------
    svn:keywords = Author Id Date Revision

Propchange: team/murf/utf8-whatif/include/asterisk/unicode.h
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: team/murf/utf8-whatif/main/Makefile
URL: http://svn.digium.com/view/asterisk/team/murf/utf8-whatif/main/Makefile?view=diff&rev=89703&r1=89702&r2=89703
==============================================================================
--- team/murf/utf8-whatif/main/Makefile (original)
+++ team/murf/utf8-whatif/main/Makefile Tue Nov 27 13:45:40 2007
@@ -27,7 +27,7 @@
 	netsock.o slinfactory.o ast_expr2.o ast_expr2f.o \
 	cryptostub.o sha1.o http.o fixedjitterbuf.o abstract_jb.o \
 	strcompat.o threadstorage.o dial.o event.o adsistub.o audiohook.o \
-	astobj2.o hashtab.o
+	astobj2.o hashtab.o unicode.o
 
 # we need to link in the objects statically, not as a library, because
 # otherwise modules will not have them available if none of the static

Added: team/murf/utf8-whatif/main/unicode.c
URL: http://svn.digium.com/view/asterisk/team/murf/utf8-whatif/main/unicode.c?view=auto&rev=89703
==============================================================================
--- team/murf/utf8-whatif/main/unicode.c (added)
+++ team/murf/utf8-whatif/main/unicode.c Tue Nov 27 13:45:40 2007
@@ -1,0 +1,232 @@
+
+/*
+ * Asterisk -- An open source telephony toolkit.
+ *
+ * Copyright (C) 2007, Digium, Inc.
+ *
+ * Steve Murphy <murf at digium.com>
+ *
+ * See http://www.asterisk.org for more information about
+ * the Asterisk project. Please do not directly contact
+ * any of the maintainers of this project for assistance;
+ * the project provides a web site, mailing lists and IRC
+ * channels for your use.
+ *
+ * This program is free software, distributed under the terms of
+ * the GNU General Public License Version 2. See the LICENSE file
+ * at the top of the source tree.
+ */
+
+#include "asterisk/unicode.h"
+/* Fallback only: any system header pulled in later (or earlier) may
+ * already provide NULL, and redefining it with different tokens is a
+ * diagnosable conflict. The pointer-typed form keeps NULL from being
+ * silently usable as an integer.
+ */
+#ifndef NULL
+#define NULL ((void *)0)
+#endif
+
+/* Tell whether a byte has the 10xxxxxx form of a UTF-8 continuation byte. */
+static int utf8_is_continuation(unsigned char byte)
+{
+	return (byte & 0xC0) == 0x80;
+}
+
+/* Convert a single (possibly multi-byte) UTF-8 char to UCS-4.
+ *
+ * utf8   points at the first byte of the character; must not be NULL
+ * next   if non-NULL, receives a pointer to the byte following the
+ *        decoded character (or utf8 + 1 on error)
+ * error  if non-NULL, set to 0 on success and to 1 when the encoding
+ *        is bad
+ *
+ * Returns the decoded code point. On error the first byte is returned
+ * as-is, so a caller can skip one byte and resynchronise.
+ *
+ * Treated as errors: a stray continuation byte, a lead byte of the form
+ * 11111xxx, a missing or malformed continuation byte, an overlong
+ * encoding, a UTF-16 surrogate (U+D800..U+DFFF) and any value above
+ * U+10FFFF.
+ */
+
+ucs4_t ast_utf8_to_ucs4(unsigned char *utf8, unsigned char **next, int *error)
+{
+	ucs4_t value = 0;
+	ucs4_t minimum = 0;	/* smallest code point that really needs 'length' bytes */
+	int length = 0;		/* bytes in the sequence; 0 means "invalid" */
+	int i;
+
+	/* The lead byte fixes the sequence length and carries the top payload bits. */
+	if ((utf8[0] & 0x80) == 0) {		/* 0xxxxxxx: single byte value */
+		length = 1;
+		value = utf8[0];
+	} else if ((utf8[0] & 0xE0) == 0xC0) {	/* 110xxxxx: two byte value */
+		length = 2;
+		value = utf8[0] & 0x1F;
+		minimum = 0x80;
+	} else if ((utf8[0] & 0xF0) == 0xE0) {	/* 1110xxxx: three byte value */
+		length = 3;
+		value = utf8[0] & 0x0F;
+		minimum = 0x800;
+	} else if ((utf8[0] & 0xF8) == 0xF0) {	/* 11110xxx: four byte value */
+		length = 4;
+		value = utf8[0] & 0x07;
+		minimum = 0x10000;
+	}
+
+	/* Each continuation byte adds six payload bits. Stop at the first
+	 * byte that is not a continuation, so a sequence cut short by the
+	 * string terminator is never read past.
+	 */
+	for (i = 1; i < length; i++) {
+		if (!utf8_is_continuation(utf8[i])) {
+			length = 0;
+			break;
+		}
+		value = (value << 6) | (ucs4_t)(utf8[i] & 0x3F);
+	}
+
+	if (length == 0
+		|| value < minimum			/* overlong encoding */
+		|| value > 0x10FFFF			/* beyond the Unicode range */
+		|| (value >= 0xD800 && value <= 0xDFFF)) { /* surrogate */
+		/* something is wrong! hand back the first byte untouched */
+		if (next)
+			*next = utf8 + 1;
+		if (error)
+			*error = 1;
+		return (ucs4_t)utf8[0];
+	}
+
+	if (next)
+		*next = utf8 + length;
+	if (error)
+		*error = 0;
+	return value;
+}
+
+
+/* Convert a zero-terminated string of UCS-4 chars into a zero-terminated
+ * string of UTF-8 chars.
+ *
+ * ucs     the zero-terminated input string
+ * out     the output buffer
+ * outlen  size of out in bytes, including room for the terminating 0
+ * next    if non-NULL, receives the position in ucs where conversion
+ *         stopped: the input terminator when everything fit, otherwise
+ *         the first character that did not fit
+ *
+ * Returns out. A character is only emitted when all of its bytes plus
+ * the terminator fit, so the output never ends in a partial sequence.
+ * Values that are not Unicode scalar values (surrogates, anything above
+ * U+10FFFF) are emitted as U+FFFD REPLACEMENT CHARACTER. If out is NULL
+ * or outlen is less than 1, nothing at all is written.
+ */
+
+unsigned char *ast_ucs4_to_utf8(ucs4_t *ucs, unsigned char *out, int outlen, ucs4_t **next)
+{
+	unsigned char *buf = out;
+
+	if (!out || outlen < 1) {
+		/* not even room for the terminator */
+		if (next)
+			*next = ucs;
+		return buf;
+	}
+
+	while (*ucs) {
+		ucs4_t cp = *ucs;
+		int need;	/* bytes this character takes in UTF-8 */
+
+		if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF))
+			cp = 0xFFFD;	/* not encodable as well-formed UTF-8 */
+
+		if (cp < 0x80)
+			need = 1;
+		else if (cp < 0x800)
+			need = 2;
+		else if (cp < 0x10000)
+			need = 3;
+		else
+			need = 4;
+
+		if (outlen <= need)
+			break;	/* no room left for the char plus the terminator */
+
+		switch (need) {
+		case 1:
+			*out++ = (unsigned char)cp;
+			break;
+		case 2:
+			*out++ = (unsigned char)(0xC0 | (cp >> 6));
+			*out++ = (unsigned char)(0x80 | (cp & 0x3F));
+			break;
+		case 3:
+			*out++ = (unsigned char)(0xE0 | (cp >> 12));
+			*out++ = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
+			*out++ = (unsigned char)(0x80 | (cp & 0x3F));
+			break;
+		default:
+			*out++ = (unsigned char)(0xF0 | (cp >> 18));
+			*out++ = (unsigned char)(0x80 | ((cp >> 12) & 0x3F));
+			*out++ = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
+			*out++ = (unsigned char)(0x80 | (cp & 0x3F));
+			break;
+		}
+		outlen -= need;
+		ucs++;
+	}
+
+	if (next)
+		*next = ucs;
+	*out = 0; /* end the output string */
+	return buf;
+}
+
+/* Convert a zero-terminated string of UCS-4 chars into a zero-terminated
+ * string of 8859-1 chars.
+ *
+ * ucs     the zero-terminated input string
+ * out     the output buffer
+ * outlen  size of out in bytes, including room for the terminating 0
+ * next    if non-NULL, receives the position in ucs where conversion
+ *         stopped (the input terminator, or the first char that did
+ *         not fit)
+ *
+ * Returns out. Code points up to 0xFF map straight onto 8859-1. Anything
+ * larger is written as '?': simply dropping the upper bits would turn
+ * U+0100 into a premature terminator and U+0141 into 'A'. If out is NULL
+ * or outlen is less than 1, nothing at all is written.
+ */
+
+unsigned char *ast_ucs4_to_8859_1(ucs4_t *ucs, unsigned char *out, int outlen, ucs4_t **next)
+{
+	unsigned char *buf = out;
+
+	if (!out || outlen < 1) {
+		/* not even room for the terminator */
+		if (next)
+			*next = ucs;
+		return out;
+	}
+
+	while (*ucs && outlen > 1) { /* always keep one byte for the terminator */
+		*buf++ = (*ucs > 0xFF) ? (unsigned char)'?' : (unsigned char)*ucs;
+		ucs++;
+		outlen--;
+	}
+	if (next)
+		*next = ucs;
+	*buf = 0; /* end the output string */
+	return out;
+}
+
+/* Convert a zero-terminated string of 8859-1 chars into a zero-terminated
+ * string of UCS-4 chars -- mainly just by turning it from 8 bits to
+ * 32 bits/char.
+ *
+ * in      the zero-terminated input string
+ * ucs     the output buffer
+ * outlen  size of ucs in ucs4_t elements (not bytes), including room
+ *         for the terminating 0
+ * next    if non-NULL, receives the position in the input where
+ *         conversion stopped (the input terminator, or the first char
+ *         that did not fit)
+ *
+ * Returns ucs. If ucs is NULL or outlen is less than 1, nothing at all
+ * is written.
+ */
+
+ucs4_t *ast_8859_1_to_ucs4(unsigned char *in, ucs4_t *ucs, int outlen, unsigned char **next)
+{
+	ucs4_t *buf = ucs;
+
+	if (!ucs || outlen < 1) {
+		/* not even room for the terminator */
+		if (next)
+			*next = in;
+		return ucs;
+	}
+
+	while (*in && outlen > 1) { /* always keep one element for the terminator */
+		/* every 8859-1 value is the same-numbered code point, so
+		 * widening the byte is the whole conversion
+		 */
+		*buf++ = *in++;
+		outlen--;
+	}
+	if (next)
+		*next = in;
+	*buf = 0; /* end the output string */
+	return ucs;
+}
+
+
+
+/* Find the first occurrence of theChar in the zero-terminated string ustr.
+ *
+ * Returns a pointer to the matching element, or NULL when ustr is NULL
+ * or holds no match. The terminator itself is never matched, so looking
+ * for 0 always yields NULL (this differs from the C library's strchr).
+ */
+
+ucs4_t *ucs4_strchr(ucs4_t *ustr, ucs4_t theChar)
+{
+	ucs4_t *cursor;
+
+	if (ustr == NULL)
+		return NULL;
+
+	/* walk up to, but not including, the terminator */
+	for (cursor = ustr; *cursor != 0; cursor++) {
+		if (*cursor == theChar)
+			return cursor; /* the first match to theChar */
+	}
+
+	return NULL;
+}
+
+
+

Propchange: team/murf/utf8-whatif/main/unicode.c
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: team/murf/utf8-whatif/main/unicode.c
------------------------------------------------------------------------------
    svn:keywords = Author Id Date Revision

Propchange: team/murf/utf8-whatif/main/unicode.c
------------------------------------------------------------------------------
    svn:mime-type = text/plain