diff options
author | Werner Koch <[email protected]> | 2010-03-10 12:24:58 +0000 |
---|---|---|
committer | Werner Koch <[email protected]> | 2010-03-10 12:24:58 +0000 |
commit | d8b1099d01ebc1d305d47ec6dcb326980ad56396 (patch) | |
tree | cb92563d8b116872c34ad26dcdacfd02ed3da04a /jnlib/utf8conv.c | |
parent | Fix for latest libgpg-error. (diff) | |
download | gnupg-d8b1099d01ebc1d305d47ec6dcb326980ad56396.tar.gz gnupg-d8b1099d01ebc1d305d47ec6dcb326980ad56396.zip |
Merged jnlib into common.
Diffstat (limited to 'jnlib/utf8conv.c')
-rw-r--r-- | jnlib/utf8conv.c | 813 |
1 files changed, 0 insertions, 813 deletions
diff --git a/jnlib/utf8conv.c b/jnlib/utf8conv.c deleted file mode 100644 index 6cbe4e92c..000000000 --- a/jnlib/utf8conv.c +++ /dev/null @@ -1,813 +0,0 @@ -/* utf8conf.c - UTF8 character set conversion - * Copyright (C) 1994, 1998, 1999, 2000, 2001, 2003, 2006, - * 2008, 2010 Free Software Foundation, Inc. - * - * This file is part of JNLIB. - * - * JNLIB is free software; you can redistribute it and/or modify it - * under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 3 of - * the License, or (at your option) any later version. - * - * JNLIB is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see <http://www.gnu.org/licenses/>. - */ - -#include <config.h> -#include <stdlib.h> -#include <string.h> -#include <stdarg.h> -#include <ctype.h> -#ifdef HAVE_LANGINFO_CODESET -#include <langinfo.h> -#endif -#include <errno.h> -#ifndef HAVE_W32_SYSTEM -# include <iconv.h> -#endif - -#include "libjnlib-config.h" -#include "stringhelp.h" -#include "dynload.h" -#include "utf8conv.h" - -#ifndef MB_LEN_MAX -#define MB_LEN_MAX 16 -#endif - -static const char *active_charset_name = "iso-8859-1"; -static int no_translation; /* Set to true if we let simply pass through. */ -static int use_iconv; /* iconv comversion fucntions required. */ - - -/* Under W32 we dlopen the iconv dll and don't require any iconv - related headers at all. However we need to define some stuff. */ -#ifdef HAVE_W32_SYSTEM -typedef void *iconv_t; -#ifndef ICONV_CONST -#define ICONV_CONST -#endif -static iconv_t (* __stdcall iconv_open) (const char *tocode, - const char *fromcode); -static size_t (* __stdcall iconv) (iconv_t cd, - char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft); -static int (* __stdcall iconv_close) (iconv_t cd); - -static int -load_libiconv (void) -{ - static int done; - - if (!done) - { - void *handle; - - done = 1; /* Do it right now because we might get called recursivly - through gettext. */ - - handle = dlopen ("iconv.dll", RTLD_LAZY); - if (handle) - { - iconv_open = dlsym (handle, "libiconv_open"); - if (iconv_open) - iconv = dlsym (handle, "libiconv"); - if (iconv) - iconv_close = dlsym (handle, "libiconv_close"); - } - if (!handle || !iconv_close) - { - log_info (_("error loading `%s': %s\n"), - "iconv.dll", dlerror ()); - log_info (_("please see %s for more information\n"), - "http://www.gnupg.org/download/iconv.html"); - iconv_open = NULL; - iconv = NULL; - iconv_close = NULL; - if (handle) - dlclose (handle); - } - } - return iconv_open? 0: -1; -} -#endif /*HAVE_W32_SYSTEM*/ - - -/* Error handler for iconv failures. This is needed to not clutter the - output with repeated diagnostics about a missing conversion. */ -static void -handle_iconv_error (const char *to, const char *from, int use_fallback) -{ - if (errno == EINVAL) - { - static int shown1, shown2; - int x; - - if (to && !strcmp (to, "utf-8")) - { - x = shown1; - shown1 = 1; - } - else - { - x = shown2; - shown2 = 1; - } - - if (!x) - log_info (_("conversion from `%s' to `%s' not available\n"), - from, to); - } - else - { - static int shown; - - if (!shown) - log_info (_("iconv_open failed: %s\n"), strerror (errno)); - shown = 1; - } - - if (use_fallback) - { - /* To avoid further error messages we fallback to Latin-1 for the - native encoding. This is justified as one can expect that on a - utf-8 enabled system nl_langinfo() will work and thus we won't - never get to here. Thus Latin-1 seems to be a reasonable - default. */ - active_charset_name = "iso-8859-1"; - no_translation = 0; - use_iconv = 0; - } -} - - - -int -set_native_charset (const char *newset) -{ - const char *full_newset; - - if (!newset) - { -#ifdef HAVE_W32_SYSTEM - static char codepage[30]; - unsigned int cpno; - const char *aliases; - - /* We are a console program thus we need to use the - GetConsoleOutputCP function and not the the GetACP which - would give the codepage for a GUI program. Note this is not - a bulletproof detection because GetConsoleCP might return a - different one for console input. Not sure how to cope with - that. If the console Code page is not known we fall back to - the system code page. */ -#ifndef HAVE_W32CE_SYSTEM - cpno = GetConsoleOutputCP (); - if (!cpno) -#endif - cpno = GetACP (); - sprintf (codepage, "CP%u", cpno ); - /* Resolve alias. We use a long string string and not the usual - array to optimize if the code is taken to a DSO. Taken from - libiconv 1.9.2. */ - newset = codepage; - for (aliases = ("CP936" "\0" "GBK" "\0" - "CP1361" "\0" "JOHAB" "\0" - "CP20127" "\0" "ASCII" "\0" - "CP20866" "\0" "KOI8-R" "\0" - "CP21866" "\0" "KOI8-RU" "\0" - "CP28591" "\0" "ISO-8859-1" "\0" - "CP28592" "\0" "ISO-8859-2" "\0" - "CP28593" "\0" "ISO-8859-3" "\0" - "CP28594" "\0" "ISO-8859-4" "\0" - "CP28595" "\0" "ISO-8859-5" "\0" - "CP28596" "\0" "ISO-8859-6" "\0" - "CP28597" "\0" "ISO-8859-7" "\0" - "CP28598" "\0" "ISO-8859-8" "\0" - "CP28599" "\0" "ISO-8859-9" "\0" - "CP28605" "\0" "ISO-8859-15" "\0" - "CP65001" "\0" "UTF-8" "\0"); - *aliases; - aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) - { - if (!strcmp (codepage, aliases) ||(*aliases == '*' && !aliases[1])) - { - newset = aliases + strlen (aliases) + 1; - break; - } - } - -#else /*!HAVE_W32_SYSTEM*/ - -#ifdef HAVE_LANGINFO_CODESET - newset = nl_langinfo (CODESET); -#else /*!HAVE_LANGINFO_CODESET*/ - /* Try to get the used charset from environment variables. */ - static char codepage[30]; - const char *lc, *dot, *mod; - - strcpy (codepage, "iso-8859-1"); - lc = getenv ("LC_ALL"); - if (!lc || !*lc) - { - lc = getenv ("LC_CTYPE"); - if (!lc || !*lc) - lc = getenv ("LANG"); - } - if (lc && *lc) - { - dot = strchr (lc, '.'); - if (dot) - { - mod = strchr (++dot, '@'); - if (!mod) - mod = dot + strlen (dot); - if (mod - dot < sizeof codepage && dot != mod) - { - memcpy (codepage, dot, mod - dot); - codepage [mod - dot] = 0; - } - } - } - newset = codepage; -#endif /*!HAVE_LANGINFO_CODESET*/ -#endif /*!HAVE_W32_SYSTEM*/ - } - - full_newset = newset; - if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3)) - { - newset += 3; - if (*newset == '-' || *newset == '_') - newset++; - } - - /* Note that we silently assume that plain ASCII is actually meant - as Latin-1. This makes sense because many Unix system don't have - their locale set up properly and thus would get annoying error - messages and we have to handle all the "bug" reports. Latin-1 has - always been the character set used for 8 bit characters on Unix - systems. */ - if ( !*newset - || !ascii_strcasecmp (newset, "8859-1" ) - || !ascii_strcasecmp (newset, "646" ) - || !ascii_strcasecmp (newset, "ASCII" ) - || !ascii_strcasecmp (newset, "ANSI_X3.4-1968" ) - ) - { - active_charset_name = "iso-8859-1"; - no_translation = 0; - use_iconv = 0; - } - else if ( !ascii_strcasecmp (newset, "utf8" ) - || !ascii_strcasecmp(newset, "utf-8") ) - { - active_charset_name = "utf-8"; - no_translation = 1; - use_iconv = 0; - } - else - { - iconv_t cd; - -#ifdef HAVE_W32_SYSTEM - if (load_libiconv ()) - return -1; -#endif /*HAVE_W32_SYSTEM*/ - - cd = iconv_open (full_newset, "utf-8"); - if (cd == (iconv_t)-1) - { - handle_iconv_error (full_newset, "utf-8", 0); - return -1; - } - iconv_close (cd); - cd = iconv_open ("utf-8", full_newset); - if (cd == (iconv_t)-1) - { - handle_iconv_error ("utf-8", full_newset, 0); - return -1; - } - iconv_close (cd); - active_charset_name = full_newset; - no_translation = 0; - use_iconv = 1; - } - return 0; -} - -const char * -get_native_charset () -{ - return active_charset_name; -} - -/* Return true if the native charset is utf-8. */ -int -is_native_utf8 (void) -{ - return no_translation; -} - - -/* Convert string, which is in native encoding to UTF8 and return a - new allocated UTF-8 string. This function terminates the process - on memory shortage. */ -char * -native_to_utf8 (const char *orig_string) -{ - const unsigned char *string = (const unsigned char *)orig_string; - const unsigned char *s; - char *buffer; - unsigned char *p; - size_t length = 0; - - if (no_translation) - { - /* Already utf-8 encoded. */ - buffer = jnlib_xstrdup (orig_string); - } - else if (!use_iconv) - { - /* For Latin-1 we can avoid the iconv overhead. */ - for (s = string; *s; s++) - { - length++; - if (*s & 0x80) - length++; - } - buffer = jnlib_xmalloc (length + 1); - for (p = (unsigned char *)buffer, s = string; *s; s++) - { - if ( (*s & 0x80 )) - { - *p++ = 0xc0 | ((*s >> 6) & 3); - *p++ = 0x80 | (*s & 0x3f); - } - else - *p++ = *s; - } - *p = 0; - } - else - { - /* Need to use iconv. */ - iconv_t cd; - const char *inptr; - char *outptr; - size_t inbytes, outbytes; - - cd = iconv_open ("utf-8", active_charset_name); - if (cd == (iconv_t)-1) - { - handle_iconv_error ("utf-8", active_charset_name, 1); - return native_to_utf8 (string); - } - - for (s=string; *s; s++ ) - { - length++; - if ((*s & 0x80)) - length += 5; /* We may need up to 6 bytes for the utf8 output. */ - } - buffer = jnlib_xmalloc (length + 1); - - inptr = string; - inbytes = strlen (string); - outptr = buffer; - outbytes = length; - if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes, - &outptr, &outbytes) == (size_t)-1) - { - static int shown; - - if (!shown) - log_info (_("conversion from `%s' to `%s' failed: %s\n"), - active_charset_name, "utf-8", strerror (errno)); - shown = 1; - /* We don't do any conversion at all but use the strings as is. */ - strcpy (buffer, string); - } - else /* Success. */ - { - *outptr = 0; - /* We could realloc the buffer now but I doubt that it makes - much sense given that it will get freed anyway soon - after. */ - } - iconv_close (cd); - } - return buffer; -} - - - -static char * -do_utf8_to_native (const char *string, size_t length, int delim, - int with_iconv) -{ - int nleft; - int i; - unsigned char encbuf[8]; - int encidx; - const unsigned char *s; - size_t n; - char *buffer = NULL; - char *p = NULL; - unsigned long val = 0; - size_t slen; - int resync = 0; - - /* First pass (p==NULL): count the extended utf-8 characters. */ - /* Second pass (p!=NULL): create string. */ - for (;;) - { - for (slen = length, nleft = encidx = 0, n = 0, - s = (const unsigned char *)string; - slen; - s++, slen--) - { - if (resync) - { - if (!(*s < 128 || (*s >= 0xc0 && *s <= 0xfd))) - { - /* Still invalid. */ - if (p) - { - sprintf (p, "\\x%02x", *s); - p += 4; - } - n += 4; - continue; - } - resync = 0; - } - if (!nleft) - { - if (!(*s & 0x80)) - { - /* Plain ascii. */ - if ( delim != -1 - && (*s < 0x20 || *s == 0x7f || *s == delim - || (delim && *s == '\\'))) - { - n++; - if (p) - *p++ = '\\'; - switch (*s) - { - case '\n': n++; if ( p ) *p++ = 'n'; break; - case '\r': n++; if ( p ) *p++ = 'r'; break; - case '\f': n++; if ( p ) *p++ = 'f'; break; - case '\v': n++; if ( p ) *p++ = 'v'; break; - case '\b': n++; if ( p ) *p++ = 'b'; break; - case 0: n++; if ( p ) *p++ = '0'; break; - default: - n += 3; - if (p) - { - sprintf (p, "x%02x", *s); - p += 3; - } - break; - } - } - else - { - if (p) - *p++ = *s; - n++; - } - } - else if ((*s & 0xe0) == 0xc0) /* 110x xxxx */ - { - val = *s & 0x1f; - nleft = 1; - encidx = 0; - encbuf[encidx++] = *s; - } - else if ((*s & 0xf0) == 0xe0) /* 1110 xxxx */ - { - val = *s & 0x0f; - nleft = 2; - encidx = 0; - encbuf[encidx++] = *s; - } - else if ((*s & 0xf8) == 0xf0) /* 1111 0xxx */ - { - val = *s & 0x07; - nleft = 3; - encidx = 0; - encbuf[encidx++] = *s; - } - else if ((*s & 0xfc) == 0xf8) /* 1111 10xx */ - { - val = *s & 0x03; - nleft = 4; - encidx = 0; - encbuf[encidx++] = *s; - } - else if ((*s & 0xfe) == 0xfc) /* 1111 110x */ - { - val = *s & 0x01; - nleft = 5; - encidx = 0; - encbuf[encidx++] = *s; - } - else /* Invalid encoding: print as \xNN. */ - { - if (p) - { - sprintf (p, "\\x%02x", *s); - p += 4; - } - n += 4; - resync = 1; - } - } - else if (*s < 0x80 || *s >= 0xc0) /* Invalid utf-8 */ - { - if (p) - { - for (i = 0; i < encidx; i++) - { - sprintf (p, "\\x%02x", encbuf[i]); - p += 4; - } - sprintf (p, "\\x%02x", *s); - p += 4; - } - n += 4 + 4 * encidx; - nleft = 0; - encidx = 0; - resync = 1; - } - else - { - encbuf[encidx++] = *s; - val <<= 6; - val |= *s & 0x3f; - if (!--nleft) /* Ready. */ - { - if (no_translation) - { - if (p) - { - for (i = 0; i < encidx; i++) - *p++ = encbuf[i]; - } - n += encidx; - encidx = 0; - } - else if (with_iconv) - { - /* Our strategy for using iconv is a bit strange - but it better keeps compatibility with - previous versions in regard to how invalid - encodings are displayed. What we do is to - keep the utf-8 as is and have the real - translation step then at the end. Yes, I - know that this is ugly. However we are short - of the 1.4 release and for this branch we - should not mess too much around with iconv - things. One reason for this is that we don't - know enough about non-GNU iconv - implementation and want to minimize the risk - of breaking the code on too many platforms. */ - if ( p ) - { - for (i=0; i < encidx; i++ ) - *p++ = encbuf[i]; - } - n += encidx; - encidx = 0; - } - else /* Latin-1 case. */ - { - if (val >= 0x80 && val < 256) - { - /* We can simply print this character */ - n++; - if (p) - *p++ = val; - } - else - { - /* We do not have a translation: print utf8. */ - if (p) - { - for (i = 0; i < encidx; i++) - { - sprintf (p, "\\x%02x", encbuf[i]); - p += 4; - } - } - n += encidx * 4; - encidx = 0; - } - } - } - - } - } - if (!buffer) - { - /* Allocate the buffer after the first pass. */ - buffer = p = jnlib_xmalloc (n + 1); - } - else if (with_iconv) - { - /* Note: See above for comments. */ - iconv_t cd; - const char *inptr; - char *outbuf, *outptr; - size_t inbytes, outbytes; - - *p = 0; /* Terminate the buffer. */ - - cd = iconv_open (active_charset_name, "utf-8"); - if (cd == (iconv_t)-1) - { - handle_iconv_error (active_charset_name, "utf-8", 1); - jnlib_free (buffer); - return utf8_to_native (string, length, delim); - } - - /* Allocate a new buffer large enough to hold all possible - encodings. */ - n = p - buffer + 1; - inbytes = n - 1;; - inptr = buffer; - outbytes = n * MB_LEN_MAX; - if (outbytes / MB_LEN_MAX != n) - BUG (); /* Actually an overflow. */ - outbuf = outptr = jnlib_xmalloc (outbytes); - if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes, - &outptr, &outbytes) == (size_t)-1) - { - static int shown; - - if (!shown) - log_info (_("conversion from `%s' to `%s' failed: %s\n"), - "utf-8", active_charset_name, strerror (errno)); - shown = 1; - /* Didn't worked out. Try again but without iconv. */ - jnlib_free (buffer); - buffer = NULL; - jnlib_free (outbuf); - outbuf = do_utf8_to_native (string, length, delim, 0); - } - else /* Success. */ - { - *outptr = 0; /* Make sure it is a string. */ - /* We could realloc the buffer now but I doubt that it - makes much sense given that it will get freed - anyway soon after. */ - jnlib_free (buffer); - } - iconv_close (cd); - return outbuf; - } - else /* Not using iconv. */ - { - *p = 0; /* Make sure it is a string. */ - return buffer; - } - } -} - -/* Convert string, which is in UTF-8 to native encoding. Replace - illegal encodings by some "\xnn" and quote all control - characters. A character with value DELIM will always be quoted, it - must be a vanilla ASCII character. A DELIM value of -1 is special: - it disables all quoting of control characters. This function - terminates the process on memory shortage. */ -char * -utf8_to_native (const char *string, size_t length, int delim) -{ - return do_utf8_to_native (string, length, delim, use_iconv); -} - - - - -/* Wrapper function for iconv_open, required for W32 as we dlopen that - library on that system. */ -jnlib_iconv_t -jnlib_iconv_open (const char *tocode, const char *fromcode) -{ -#ifdef HAVE_W32_SYSTEM - if (load_libiconv ()) - return (jnlib_iconv_t)(-1); -#endif /*HAVE_W32_SYSTEM*/ - - return (jnlib_iconv_t)iconv_open (tocode, fromcode); -} - - -/* Wrapper function for iconv, required for W32 as we dlopen that - library on that system. */ -size_t -jnlib_iconv (jnlib_iconv_t cd, - const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft) -{ - -#ifdef HAVE_W32_SYSTEM - if (load_libiconv ()) - return 0; -#endif /*HAVE_W32_SYSTEM*/ - - return iconv ((iconv_t)cd, (char**)inbuf, inbytesleft, outbuf, outbytesleft); -} - -/* Wrapper function for iconv_close, required for W32 as we dlopen that - library on that system. */ -int -jnlib_iconv_close (jnlib_iconv_t cd) -{ -#ifdef HAVE_W32_SYSTEM - if (load_libiconv ()) - return 0; -#endif /*HAVE_W32_SYSTEM*/ - - return iconv_close ((iconv_t)cd); -} - - -#ifdef HAVE_W32_SYSTEM -/* Return a malloced string encoded in UTF-8 from the wide char input - string STRING. Caller must free this value. Returns NULL and sets - ERRNO on failure. Calling this function with STRING set to NULL is - not defined. */ -char * -wchar_to_utf8 (const wchar_t *string) -{ - int n; - char *result; - - n = WideCharToMultiByte (CP_UTF8, 0, string, -1, NULL, 0, NULL, NULL); - if (n < 0) - { - jnlib_set_errno (EINVAL); - return NULL; - } - - result = jnlib_malloc (n+1); - if (!result) - return NULL; - - n = WideCharToMultiByte (CP_UTF8, 0, string, -1, result, n, NULL, NULL); - if (n < 0) - { - jnlib_free (result); - jnlib_set_errno (EINVAL); - result = NULL; - } - return result; -} - - -/* Return a malloced wide char string from an UTF-8 encoded input - string STRING. Caller must free this value. Returns NULL and sets - ERRNO on failure. Calling this function with STRING set to NULL is - not defined. */ -wchar_t * -utf8_to_wchar (const char *string) -{ - int n; - size_t nbytes; - wchar_t *result; - - n = MultiByteToWideChar (CP_UTF8, 0, string, -1, NULL, 0); - if (n < 0) - { - jnlib_set_errno (EINVAL); - return NULL; - } - - nbytes = (size_t)(n+1) * sizeof(*result); - if (nbytes / sizeof(*result) != (n+1)) - { - jnlib_set_errno (ENOMEM); - return NULL; - } - result = malloc (nbytes); - if (!result) - return NULL; - - n = MultiByteToWideChar (CP_UTF8, 0, string, -1, result, n); - if (n < 0) - { - free (result); - jnlib_set_errno (EINVAL); - result = NULL; - } - return result; -} -#endif /*HAVE_W32_SYSTEM*/ |