ICUの正規化のサンプル
UnicodeのライブラリであるICU (http://site.icu-project.org/) の正規化を試してみました。
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <unicode/umachine.h> #include <unicode/unorm2.h> #include <unicode/ustring.h> #include <unicode/utf.h> #define array_sizeof(a) (sizeof(a) / sizeof(a[0])) static void utf8_to_hex(char* dest, const char* src) { const char* p; char* q = dest; for (p = src; *p != '\0'; p++) { sprintf(q, "\\x%02x", 0xff & (*p)); q += 4; } } static void normalize(const char* name, UNormalization2Mode mode) { UChar32 ch; for (ch = 0; ch < 0x110000; ch++) { if (!U_IS_UNICODE_CHAR(ch)) { continue; } #define CHECK_ERROR(name) do { \ if (U_FAILURE(e)) { \ fprintf(stderr, "%s failed - %s\n", name, u_errorName(e)); \ exit(1); \ } \ } while (0) UChar32 utf32_src[2]; utf32_src[0] = ch; utf32_src[1] = 0; UChar utf16_src[1024]; UErrorCode e = U_ZERO_ERROR; u_strFromUTF32(utf16_src, array_sizeof(utf16_src), NULL, utf32_src, -1, &e); CHECK_ERROR("u_strFromUTF32"); char utf8_src[1024]; u_strToUTF8(utf8_src, array_sizeof(utf8_src), NULL, utf16_src, -1, &e); CHECK_ERROR("u_strToUTF8"); const UNormalizer2* norm2 = unorm2_getInstance(NULL, name, mode, &e); CHECK_ERROR("unorm2_getInstance"); UChar utf16_dest[1024]; unorm2_normalize(norm2, utf16_src, -1, utf16_dest, array_sizeof(utf16_dest), &e); CHECK_ERROR("unorm2_normalize"); char utf8_dest[1024]; u_strToUTF8(utf8_dest, array_sizeof(utf8_dest), NULL, utf16_dest, -1, &e); CHECK_ERROR("u_strToUTF8"); if (strcmp(utf8_src, utf8_dest) == 0) { continue; } char hex_src[1024]; utf8_to_hex(hex_src, utf8_src); char hex_dest[1024]; utf8_to_hex(hex_dest, utf8_dest); printf("%s (%s) -> %s (%s)\n", utf8_src, hex_src, utf8_dest, hex_dest); #undef CHECK_ERROR } } int main(int argc, const char* argv[]) { printf("--- nfc, UNORM2_COMPOSE ---\n"); normalize("nfc", UNORM2_COMPOSE); printf("--- nfc, UNORM2_DECOMPOSE ---\n"); normalize("nfc", UNORM2_DECOMPOSE); return 0; } /** * vim: tabstop=4 shiftwidth=4 expandtab softtabstop=4 */
上のコードをmain.cとした場合、
$ gcc -o normalize main.c -licuuc
でコンパイルできます。これを実行すると、
$ ./normalize --- nfc, UNORM2_COMPOSE --- ̀ (\xcd\x80) -> ̀ (\xcc\x80) ́ (\xcd\x81) -> ́ (\xcc\x81) ̓ (\xcd\x83) -> ̓ (\xcc\x93) ̈́ (\xcd\x84) -> ̈́ (\xcc\x88\xcc\x81) ʹ (\xcd\xb4) -> ʹ (\xca\xb9) ; (\xcd\xbe) -> ; (\x3b) · (\xce\x87) -> · (\xc2\xb7) क़ (\xe0\xa5\x98) -> क़ (\xe0\xa4\x95\xe0\xa4\xbc) ख़ (\xe0\xa5\x99) -> ख़ (\xe0\xa4\x96\xe0\xa4\xbc) (略) --- nfc, UNORM2_DECOMPOSE --- À (\xc3\x80) -> À (\x41\xcc\x80) Á (\xc3\x81) -> Á (\x41\xcc\x81) Â (\xc3\x82) -> Â (\x41\xcc\x82) Ã (\xc3\x83) -> Ã (\x41\xcc\x83) Ä (\xc3\x84) -> Ä (\x41\xcc\x88) Å (\xc3\x85) -> Å (\x41\xcc\x8a) Ç (\xc3\x87) -> Ç (\x43\xcc\xa7) È (\xc3\x88) -> È (\x45\xcc\x80) É (\xc3\x89) -> É (\x45\xcc\x81) Ê (\xc3\x8a) -> Ê (\x45\xcc\x82) Ë (\xc3\x8b) -> Ë (\x45\xcc\x88) Ì (\xc3\x8c) -> Ì (\x49\xcc\x80) Í (\xc3\x8d) -> Í (\x49\xcc\x81) (以下略)
のように、正規化の前後で変化する文字だけを対象に、正準合成 (NFC) と正準分解 (NFD) の結果が表示されます。