1
0
mirror of https://https.git.savannah.gnu.org/git/gnulib.git synced 2026-04-28 06:33:36 +00:00
Files
gnulib/lib/lc-charset-unicode.c
Bruno Haible ac19d2a1ff streq: Rename to streq-opt.
* lib/streq-opt.h: Renamed from lib/streq.h.
* modules/streq-opt: Renamed from modules/streq.
* NEWS: Mention the change.
* lib/c32is-impl.h: Include streq-opt.h instead of streq.h.
* lib/c32to-impl.h: Likewise.
* lib/c32rtomb.c: Likewise.
* lib/c32width.c: Likewise.
* lib/lc-charset-dispatch.c: Likewise.
* lib/lc-charset-unicode.c: Likewise.
* lib/mbrtoc32.c: Likewise.
* lib/uniwidth/cjk.h: Likewise.
* lib/wcwidth.c: Likewise.
* modules/c32is* (Depends-on): Add streq-opt. Remove streq.
* modules/c32to* (Depends-on): Likewise.
* modules/c32rtomb (Depends-on): Likewise.
* modules/mbrtoc32 (Depends-on): Likewise.
* modules/mbrtowc (Depends-on): Likewise.
* modules/uchar-h-c23 (Depends-on): Likewise.
* modules/unilbrk/u*-possible-linebreaks (Depends-on): Likewise.
* modules/uniwidth/width (Depends-on): Likewise.
* modules/wcwidth (Depends-on): Likewise.
* MODULES.html.sh: Update.
2026-02-21 08:02:52 +01:00

270 lines
8.0 KiB
C

/* Conversion between the current locale's character encoding and Unicode.
Copyright (C) 2023-2026 Free Software Foundation, Inc.
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of the
License, or (at your option) any later version.
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* Written by Bruno Haible <bruno@clisp.org>, 2023. */
#include <config.h>
/* Specification. */
#include "lc-charset-unicode.h"
#if GL_CHAR32_T_IS_UNICODE && GL_CHAR32_T_VS_WCHAR_T_NEEDS_CONVERSION
/* We use iconv() to convert between a 'wchar_t' value and a Unicode code point.
For performance reasons, we don't allocate an iconv_t for each conversion,
but instead cache it for subsequent conversions. Since an iconv_t descriptor
can only be used in a single thread at a time, this cache must be
per-thread. */
# include <iconv.h>
# include <stdlib.h>
# include <string.h>
# include <wchar.h>
# include "localcharset.h"
# include "streq-opt.h"
# include "glthread/once.h"
# include "glthread/tls.h"
# include "unistr.h"
/* Maximum length of encoding. Attained for "ISO-8859-15". */
# define MAX_ENCODING_LEN 11
struct converters
{
iconv_t cd_locale_to_utf8;
iconv_t cd_utf8_to_locale;
/* NUL-terminated encoding name. */
char encoding[MAX_ENCODING_LEN + 1];
};
static gl_tls_key_t converters_key; /* TLS key for a 'struct converters *' */
/* Frees a 'struct converters *', for example when a thread terminates. */
static void
free_converters (void *p)
{
if (p != NULL)
{
struct converters *conv = p;
iconv_close (conv->cd_locale_to_utf8);
iconv_close (conv->cd_utf8_to_locale);
free (conv);
}
}
static void
key_init (void)
{
gl_tls_key_init (converters_key, free_converters);
/* The per-thread initial value is NULL. */
}
/* Ensure that key_init is called once only. */
gl_once_define(static, key_init_once)
/* Returns the per-thread 'struct converters *' that contains converters for the
given encoding. Returns NULL upon failure. */
static struct converters *
get_converters (const char *encoding)
{
if (strlen (encoding) > MAX_ENCODING_LEN)
/* If this happens, increase MAX_ENCODING_LEN. */
return NULL;
gl_once (key_init_once, key_init);
struct converters *conv = gl_tls_get (converters_key);
if (conv == NULL)
{
conv = (struct converters *) malloc (sizeof (struct converters));
if (conv == NULL)
/* Out of memory. */
return NULL;
conv->cd_locale_to_utf8 = iconv_open ("UTF-8", encoding);
conv->cd_utf8_to_locale = iconv_open (encoding, "UTF-8");
if (conv->cd_locale_to_utf8 == (iconv_t)(-1)
|| conv->cd_utf8_to_locale == (iconv_t)(-1))
{
/* iconv does not support this encoding. */
if (conv->cd_locale_to_utf8 != (iconv_t)(-1))
iconv_close (conv->cd_locale_to_utf8);
if (conv->cd_utf8_to_locale != (iconv_t)(-1))
iconv_close (conv->cd_utf8_to_locale);
free (conv);
return NULL;
}
strcpy (conv->encoding, encoding);
gl_tls_set (converters_key, conv);
}
else if (!streq (conv->encoding, encoding))
{
/* The locale encoding of this thread changed. */
iconv_t new_cd_locale_to_utf8 = iconv_open ("UTF-8", encoding);
iconv_t new_cd_utf8_to_locale = iconv_open (encoding, "UTF-8");
if (new_cd_locale_to_utf8 == (iconv_t)(-1)
|| new_cd_utf8_to_locale == (iconv_t)(-1))
{
/* iconv does not support this encoding. */
if (new_cd_locale_to_utf8 != (iconv_t)(-1))
iconv_close (new_cd_locale_to_utf8);
if (new_cd_utf8_to_locale != (iconv_t)(-1))
iconv_close (new_cd_utf8_to_locale);
return NULL;
}
iconv_close (conv->cd_locale_to_utf8);
iconv_close (conv->cd_utf8_to_locale);
conv->cd_locale_to_utf8 = new_cd_locale_to_utf8;
conv->cd_utf8_to_locale = new_cd_utf8_to_locale;
strcpy (conv->encoding, encoding);
}
return conv;
}
char32_t
locale_encoding_to_unicode (wchar_t wc)
{
/* This function is like a simplified variant of u32_conv_from_encoding,
that uses a cached per-thread iconv_t instead of allocating an iconv_t
at each call. */
if (wc == 0)
/* Invalid argument. */
abort ();
const char *encoding = locale_charset ();
if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
/* Assume that if the locale encoding is UTF-8, the wchar_t encoding is
Unicode. */
return wc;
if (STREQ_OPT (encoding, "ASCII", 'A', 'S', 'C', 'I', 'I', 0, 0, 0, 0))
/* In the POSIX locale, avoid conversion errors. */
return wc;
struct converters *conv = get_converters (encoding);
if (conv == NULL)
return 0;
char mbbuf[64];
size_t mbcnt;
{
mbstate_t state;
mbszero (&state);
mbcnt = wcrtomb (mbbuf, wc, &state);
if (mbcnt > sizeof (mbbuf))
/* wcrtomb did not recognize the wide character wc. */
abort ();
}
char utf8buf[6];
size_t utf8cnt;
{
char *mbptr = mbbuf;
size_t mbsize = mbcnt;
char *utf8ptr = utf8buf;
size_t utf8size = sizeof (utf8buf);
size_t ret = iconv (conv->cd_locale_to_utf8,
(ICONV_CONST char **) &mbptr, &mbsize,
&utf8ptr, &utf8size);
if (ret == (size_t)(-1))
/* Conversion error. */
return 0;
if (mbsize != 0)
/* The input was not entirely converted. */
return 0;
utf8cnt = sizeof (utf8buf) - utf8size; /* = utf8ptr - utf8buf */
if (utf8cnt == 0)
/* The conversion produced no output. */
return 0;
}
ucs4_t uc;
if (u8_mbtouc (&uc, (const uint8_t *) utf8buf, utf8cnt) != utf8cnt)
/* iconv produced an invalid UTF-8 byte sequence. */
abort ();
return uc;
}
wchar_t
unicode_to_locale_encoding (char32_t uc)
{
if (uc == 0)
/* Invalid argument. */
abort ();
/* This function is like a simplified variant of u32_conv_to_encoding
that uses a cached per-thread iconv_t instead of allocating an iconv_t
at each call. */
const char *encoding = locale_charset ();
if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
/* Assume that if the locale encoding is UTF-8, the wchar_t encoding is
Unicode. */
return uc;
if (STREQ_OPT (encoding, "ASCII", 'A', 'S', 'C', 'I', 'I', 0, 0, 0, 0))
/* In the POSIX locale, avoid conversion errors. */
return uc;
struct converters *conv = get_converters (encoding);
if (conv == NULL)
return 0;
char utf8buf[6];
int utf8cnt = u8_uctomb ((uint8_t *) utf8buf, uc, sizeof (utf8buf));
if (utf8cnt < 0)
/* Out-of-range Unicode character. */
return 0;
char mbbuf[64];
size_t mbcnt;
{
char *utf8ptr = utf8buf;
size_t utf8size = utf8cnt;
char *mbptr = mbbuf;
size_t mbsize = sizeof (mbbuf);
size_t ret = iconv (conv->cd_utf8_to_locale,
(ICONV_CONST char **) &utf8ptr, &utf8size,
&mbptr, &mbsize);
if (ret == (size_t)(-1))
/* Conversion error. */
return 0;
if (utf8size != 0)
/* The input was not entirely converted. */
return 0;
mbcnt = sizeof (mbbuf) - mbsize; /* = mbptr - mbbuf */
if (mbcnt == 0)
/* The conversion produced no output. */
return 0;
}
wchar_t wc;
{
mbstate_t state;
mbszero (&state);
if (mbrtowc (&wc, mbbuf, mbcnt, &state) != mbcnt)
/* iconv produced an invalid multibyte sequence. */
return 0;
}
return wc;
}
#else
/* This declaration is solely to ensure that after preprocessing
this file is never empty. */
typedef int dummy;
#endif