xfs
[Top] [All Lists]

[PATCH 07/13] libxfs: add trie generator and supporting code for UTF-8.

To: xfs@xxxxxxxxxxx
Subject: [PATCH 07/13] libxfs: add trie generator and supporting code for UTF-8.
From: Ben Myers <bpm@xxxxxxx>
Date: Thu, 11 Sep 2014 15:59:01 -0500
Cc: olaf@xxxxxxx, tinguely@xxxxxxx
Delivered-to: xfs@xxxxxxxxxxx
In-reply-to: <20140911203735.GA19952@xxxxxxx>
References: <20140911203735.GA19952@xxxxxxx>
User-agent: Mutt/1.5.20 (2009-06-14)
From: Olaf Weber <olaf@xxxxxxx>

mkutf8data.c is the source for a program that generates utf8data.h, which
contains the trie that utf8norm.c uses. The trie is generated from the
Unicode 7.0.0 data files. The format of the utf8data[] table is described
in utf8norm.c.

Supporting functions for UTF-8 normalization are in utf8norm.c with the
header utf8norm.h. Two normalization forms are supported: nfkdi and nfkdicf.

  nfkdi:
   - Apply unicode normalization form NFKD.
   - Remove any Default_Ignorable_Code_Point.

  nfkdicf:
   - Apply unicode normalization form NFKD.
   - Remove any Default_Ignorable_Code_Point.
   - Apply a full casefold (C + F).

For the purposes of the code, a string is valid UTF-8 if:

 - The values encoded are 0x1..0x10FFFF.
 - The surrogate codepoints 0xD800..0xDFFFF are not encoded.
 - The shortest possible encoding is used for all values.

The supporting functions work on null-terminated strings (utf8 prefix) and
on length-limited strings (utf8n prefix).

Signed-off-by: Olaf Weber <olaf@xxxxxxx>
---
 include/utf8norm.h   |  111 ++
 libxfs/utf8norm.c    |  628 ++++++++++
 support/mkutf8data.c | 3232 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 3971 insertions(+)
 create mode 100644 include/utf8norm.h
 create mode 100644 libxfs/utf8norm.c
 create mode 100644 support/mkutf8data.c

diff --git a/include/utf8norm.h b/include/utf8norm.h
new file mode 100644
index 0000000..6aa3391
--- /dev/null
+++ b/include/utf8norm.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2014 SGI.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef UTF8NORM_H
+#define UTF8NORM_H
+
+/* An opaque type used to determine the normalization in use. */
+typedef const struct utf8data *utf8data_t;
+
+/* Encoding a unicode version number as a single unsigned int. */
+#define UNICODE_MAJ_SHIFT              (16)
+#define UNICODE_MIN_SHIFT              (8)
+
+#define UNICODE_AGE(MAJ,MIN,REV)                       \
+       (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |   \
+        ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |   \
+        ((unsigned int)(REV)))
+
+/* Highest unicode version supported by the data tables. */
+extern const unsigned int utf8version;
+
+/*
+ * Look for the correct utf8data_t for a unicode version.
+ * Returns NULL if the version requested is too new.
+ *
+ * Two normalization forms are supported: nfkdi and nfkdicf.
+ *
+ * nfkdi:
+ *  - Apply unicode normalization form NFKD.
+ *  - Remove any Default_Ignorable_Code_Point.
+ *
+ * nfkdicf:
+ *  - Apply unicode normalization form NFKD.
+ *  - Remove any Default_Ignorable_Code_Point.
+ *  - Apply a full casefold (C + F).
+ */
+extern utf8data_t utf8nfkdi(unsigned int);
+extern utf8data_t utf8nfkdicf(unsigned int);
+
+/*
+ * Determine the maximum age of any unicode character in the string.
+ * Returns 0 if only unassigned code points are present.
+ * Returns -1 if the input is not valid UTF-8.
+ */
+extern int utf8agemax(utf8data_t, const char *);
+extern int utf8nagemax(utf8data_t, const char *, size_t);
+
+/*
+ * Determine the minimum age of any unicode character in the string.
+ * Returns 0 if any unassigned code points are present.
+ * Returns -1 if the input is not valid UTF-8.
+ */
+extern int utf8agemin(utf8data_t, const char *);
+extern int utf8nagemin(utf8data_t, const char *, size_t);
+
+/*
+ * Determine the length of the normalized from of the string,
+ * excluding any terminating NULL byte.
+ * Returns 0 if only ignorable code points are present.
+ * Returns -1 if the input is not valid UTF-8.
+ */
+extern ssize_t utf8len(utf8data_t, const char *);
+extern ssize_t utf8nlen(utf8data_t, const char *, size_t);
+
+/*
+ * Cursor structure used by the normalizer.
+ */
+struct utf8cursor {
+       utf8data_t      data;
+       const char      *s;
+       const char      *p;
+       const char      *ss;
+       const char      *sp;
+       unsigned int    len;
+       unsigned int    slen;
+       short int       ccc;
+       short int       nccc;
+};
+
+/*
+ * Initialize a utf8cursor to normalize a string.
+ * Returns 0 on success.
+ * Returns -1 on failure.
+ */
+extern int utf8cursor(struct utf8cursor *, utf8data_t, const char *);
+extern int utf8ncursor(struct utf8cursor *, utf8data_t, const char *, size_t);
+
+/*
+ * Get the next byte in the normalization.
+ * Returns a value > 0 && < 256 on success.
+ * Returns 0 when the end of the normalization is reached.
+ * Returns -1 if the string being normalized is not valid UTF-8.
+ */
+extern int utf8byte(struct utf8cursor *);
+
+#endif /* UTF8NORM_H */
diff --git a/libxfs/utf8norm.c b/libxfs/utf8norm.c
new file mode 100644
index 0000000..6232d1a
--- /dev/null
+++ b/libxfs/utf8norm.c
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2014 SGI.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_types.h"
+#include <utf8norm.h>
+
+struct utf8data {
+       unsigned int maxage;
+       unsigned int offset;
+};
+
+#define __INCLUDED_FROM_UTF8NORM_C__
+#include <utf8data.h>
+#undef __INCLUDED_FROM_UTF8NORM_C__
+
+/*
+ * UTF-8 valid ranges.
+ *
+ * The UTF-8 encoding spreads the bits of a 32bit word over several
+ * bytes. This table gives the ranges that can be held and how they'd
+ * be represented.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * There is an additional requirement on UTF-8, in that only the
+ * shortest representation of a 32bit value is to be used.  A decoder
+ * must not decode sequences that do not satisfy this requirement.
+ * Thus the allowed ranges have a lower bound.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
+ * 17 planes of 65536 values.  This limits the sequences actually seen
+ * even more, to just the following.
+ *
+ *          0 -     0x7F: 0                   - 0x7F
+ *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
+ *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
+ *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
+ *
+ * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
+ *
+ * Note that the longest sequence seen with valid usage is 4 bytes,
+ * the same a single UTF-32 character.  This makes the UTF-8
+ * representation of Unicode strictly smaller than UTF-32.
+ *
+ * The shortest sequence requirement was introduced by:
+ *    Corrigendum #1: UTF-8 Shortest Form
+ * It can be found here:
+ *    http://www.unicode.org/versions/corrigendum1.html
+ *
+ */
+
+/*
+ * Return the number of bytes used by the current UTF-8 sequence.
+ * Assumes the input points to the first byte of a valid UTF-8
+ * sequence.
+ */
+static inline int
+utf8clen(const char *s)
+{
+       unsigned char c = *s;
+       return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
+}
+
+/*
+ * utf8trie_t
+ *
+ * A compact binary tree, used to decode UTF-8 characters.
+ *
+ * Internal nodes are one byte for the node itself, and up to three
+ * bytes for an offset into the tree.  The first byte contains the
+ * following information:
+ *  NEXTBYTE  - flag        - advance to next byte if set
+ *  BITNUM    - 3 bit field - the bit number to tested
+ *  OFFLEN    - 2 bit field - number of bytes in the offset
+ * if offlen == 0 (non-branching node)
+ *  RIGHTPATH - 1 bit field - set if the following node is for the
+ *                            right-hand path (tested bit is set)
+ *  TRIENODE  - 1 bit field - set if the following node is an internal
+ *                            node, otherwise it is a leaf node
+ * if offlen != 0 (branching node)
+ *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
+ *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
+ *
+ * Due to the way utf8 works, there cannot be branching nodes with
+ * NEXTBYTE set, and moreover those nodes always have a righthand
+ * descendant.
+ */
+typedef const unsigned char utf8trie_t;
+#define BITNUM         0x07
+#define NEXTBYTE       0x08
+#define OFFLEN         0x30
+#define OFFLEN_SHIFT   4
+#define RIGHTPATH      0x40
+#define TRIENODE       0x80
+#define RIGHTNODE      0x40
+#define LEFTNODE       0x80
+
+/*
+ * utf8leaf_t
+ *
+ * The leaves of the trie are embedded in the trie, and so the same
+ * underlying datatype: unsigned char.
+ *
+ * leaf[0]: The unicode version, stored as a generation number that is
+ *          an index into utf8agetab[].  With this we can filter code
+ *          points based on the unicode version in which they were
+ *          defined.  The CCC of a non-defined code point is 0.
+ * leaf[1]: Canonical Combining Class. During normalization, we need
+ *          to do a stable sort into ascending order of all characters
+ *          with a non-zero CCC that occur between two characters with
+ *          a CCC of 0, or at the begin or end of a string.
+ *          The unicode standard guarantees that all CCC values are
+ *          between 0 and 254 inclusive, which leaves 255 available as
+ *          a special value.
+ *          Code points with CCC 0 are known as stoppers.
+ * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
+ *          start of a NUL-terminated string that is the decomposition
+ *          of the character.
+ *          The CCC of a decomposable character is the same as the CCC
+ *          of the first character of its decomposition.
+ *          Some characters decompose as the empty string: these are
+ *          characters with the Default_Ignorable_Code_Point property.
+ *          These do affect normalization, as they all have CCC 0.
+ *
+ * The decompositions in the trie have been fully expanded.
+ *
+ * Casefolding, if applicable, is also done using decompositions.
+ *
+ * The trie is constructed in such a way that leaves exist for all
+ * UTF-8 sequences that match the criteria from the "UTF-8 valid
+ * ranges" comment above, and only for those sequences.  Therefore a
+ * lookup in the trie can be used to validate the UTF-8 input.
+ */
+typedef const unsigned char utf8leaf_t;
+
+#define LEAF_GEN(LEAF) ((LEAF)[0])
+#define LEAF_CCC(LEAF) ((LEAF)[1])
+#define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
+
+#define MINCCC         (0)
+#define MAXCCC         (254)
+#define STOPPER                (0)
+#define        DECOMPOSE       (255)
+
+/*
+ * Use trie to scan s, touching at most len bytes.
+ * Returns the leaf if one exists, NULL otherwise.
+ *
+ * A non-NULL return guarantees that the UTF-8 sequence starting at s
+ * is well-formed and corresponds to a known unicode code point.  The
+ * shorthand for this will be "is valid UTF-8 unicode".
+ */
+static utf8leaf_t *
+utf8nlookup(utf8data_t data, const char *s, size_t len)
+{
+       utf8trie_t      *trie = utf8data + data->offset;
+       int             offlen;
+       int             offset;
+       int             mask;
+       int             node;
+
+       if (!data)
+               return NULL;
+       if (len == 0)
+               return NULL;
+       node = 1;
+       while (node) {
+               offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
+               if (*trie & NEXTBYTE) {
+                       if (--len == 0)
+                               return NULL;
+                       s++;
+               }
+               mask = 1 << (*trie & BITNUM);
+               if (*s & mask) {
+                       /* Right leg */
+                       if (offlen) {
+                               /* Right node at offset of trie */
+                               node = (*trie & RIGHTNODE);
+                               offset = trie[offlen];
+                               while (--offlen) {
+                                       offset <<= 8;
+                                       offset |= trie[offlen];
+                               }
+                               trie += offset;
+                       } else if (*trie & RIGHTPATH) {
+                               /* Right node after this node */
+                               node = (*trie & TRIENODE);
+                               trie++;
+                       } else {
+                               /* No right node. */
+                               node = 0;
+                               trie = NULL;
+                       }
+               } else {
+                       /* Left leg */
+                       if (offlen) {
+                               /* Left node after this node. */
+                               node = (*trie & LEFTNODE);
+                               trie += offlen + 1;
+                       } else if (*trie & RIGHTPATH) {
+                               /* No left node. */
+                               node = 0;
+                               trie = NULL;
+                       } else {
+                               /* Left node after this node */
+                               node = (*trie & TRIENODE);
+                               trie++;
+                       }
+               }
+       }
+       return trie;
+}
+
+/*
+ * Use trie to scan s.
+ * Returns the leaf if one exists, NULL otherwise.
+ *
+ * Forwards to utf8nlookup().
+ */
+static utf8leaf_t *
+utf8lookup(utf8data_t data, const char *s)
+{
+       return utf8nlookup(data, s, (size_t)-1);
+}
+
+/*
+ * Maximum age of any character in s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ * Return 0 if only non-assigned code points are used.
+ */
+int
+utf8agemax(utf8data_t data, const char *s)
+{
+       utf8leaf_t      *leaf;
+       int             age = 0;
+       int             leaf_age;
+
+       if (!data)
+               return -1;
+       while (*s) {
+               if (!(leaf = utf8lookup(data, s)))
+                       return -1;
+               leaf_age = utf8agetab[LEAF_GEN(leaf)];
+               if (leaf_age <= data->maxage && leaf_age > age)
+                       age = leaf_age;
+               s += utf8clen(s);
+       }
+       return age;
+}
+
+/*
+ * Minimum age of any character in s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ * Return 0 if non-assigned code points are used.
+ */
+int
+utf8agemin(utf8data_t data, const char *s)
+{
+       utf8leaf_t      *leaf;
+       int             age = data->maxage;
+       int             leaf_age;
+
+       if (!data)
+               return -1;
+       while (*s) {
+               if (!(leaf = utf8lookup(data, s)))
+                       return -1;
+               leaf_age = utf8agetab[LEAF_GEN(leaf)];
+               if (leaf_age <= data->maxage && leaf_age < age)
+                       age = leaf_age;
+               s += utf8clen(s);
+       }
+       return age;
+}
+
+/*
+ * Maximum age of any character in s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+int
+utf8nagemax(utf8data_t data, const char *s, size_t len)
+{
+       utf8leaf_t      *leaf;
+       int             age = 0;
+       int             leaf_age;
+
+       if (!data)
+               return -1;
+        while (len && *s) {
+               if (!(leaf = utf8nlookup(data, s, len)))
+                       return -1;
+               leaf_age = utf8agetab[LEAF_GEN(leaf)];
+               if (leaf_age <= data->maxage && leaf_age > age)
+                       age = leaf_age;
+               len -= utf8clen(s);
+               s += utf8clen(s);
+       }
+       return age;
+}
+
+/*
+ * Maximum age of any character in s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+int
+utf8nagemin(utf8data_t data, const char *s, size_t len)
+{
+       utf8leaf_t      *leaf;
+       int             leaf_age;
+       int             age = data->maxage;
+
+       if (!data)
+               return -1;
+        while (len && *s) {
+               if (!(leaf = utf8nlookup(data, s, len)))
+                       return -1;
+               leaf_age = utf8agetab[LEAF_GEN(leaf)];
+               if (leaf_age <= data->maxage && leaf_age < age)
+                       age = leaf_age;
+               len -= utf8clen(s);
+               s += utf8clen(s);
+       }
+       return age;
+}
+
+/*
+ * Length of the normalization of s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ *
+ * A string of Default_Ignorable_Code_Point has length 0.
+ */
+ssize_t
+utf8len(utf8data_t data, const char *s)
+{
+       utf8leaf_t      *leaf;
+       size_t          ret = 0;
+
+       if (!data)
+               return -1;
+       while (*s) {
+               if (!(leaf = utf8lookup(data, s)))
+                       return -1;
+               if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
+                       ret += utf8clen(s);
+               else if (LEAF_CCC(leaf) == DECOMPOSE)
+                       ret += strlen(LEAF_STR(leaf));
+               else
+                       ret += utf8clen(s);
+               s += utf8clen(s);
+       }
+       return ret;
+}
+
+/*
+ * Length of the normalization of s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+ssize_t
+utf8nlen(utf8data_t data, const char *s, size_t len)
+{
+       utf8leaf_t      *leaf;
+       size_t          ret = 0;
+
+       if (!data)
+               return -1;
+       while (len && *s) {
+               if (!(leaf = utf8nlookup(data, s, len)))
+                       return -1;
+               if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
+                       ret += utf8clen(s);
+               else if (LEAF_CCC(leaf) == DECOMPOSE)
+                       ret += strlen(LEAF_STR(leaf));
+               else
+                       ret += utf8clen(s);
+               len -= utf8clen(s);
+               s += utf8clen(s);
+       }
+       return ret;
+}
+
+/*
+ * Set up an utf8cursor for use by utf8byte().
+ *
+ *   u8c    : pointer to cursor.
+ *   data   : utf8data_t to use for normalization.
+ *   s      : string.
+ *   len    : length of s.
+ *
+ * Returns -1 on error, 0 on success.
+ */
+int
+utf8ncursor(
+       struct utf8cursor *u8c,
+       utf8data_t      data,
+       const char      *s,
+       size_t          len)
+{
+       if (!data)
+               return -1;
+       if (!s)
+               return -1;
+       u8c->data = data;
+       u8c->s = s;
+       u8c->p = NULL;
+       u8c->ss = NULL;
+       u8c->sp = NULL;
+       u8c->len = len;
+       u8c->slen = 0;
+       u8c->ccc = STOPPER;
+       u8c->nccc = STOPPER;
+       /* Check we didn't clobber the maximum length. */
+       if (u8c->len != len)
+               return -1;
+       /* The first byte of s may not be an utf8 continuation. */
+       if (len > 0 && (*s & 0xC0) == 0x80)
+               return -1;
+       return 0;
+}
+
+/*
+ * Set up an utf8cursor for use by utf8byte().
+ *
+ *   u8c    : pointer to cursor.
+ *   data   : utf8data_t to use for normalization.
+ *   s      : NUL-terminated string.
+ *
+ * Returns -1 on error, 0 on success.
+ */
+int
+utf8cursor(
+       struct utf8cursor *u8c,
+       utf8data_t      data,
+       const char      *s)
+{
+       return utf8ncursor(u8c, data, s, (unsigned int)-1);
+}
+
+/*
+ * Get one byte from the normalized form of the string described by u8c.
+ *
+ * Returns the byte cast to an unsigned char on succes, and -1 on failure.
+ *
+ * The cursor keeps track of the location in the string in u8c->s.
+ * When a character is decomposed, the current location is stored in
+ * u8c->p, and u8c->s is set to the start of the decomposition. Note
+ * that bytes from a decomposition do not count against u8c->len.
+ *
+ * Characters are emitted if they match the current CCC in u8c->ccc.
+ * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
+ * and the function returns 0 in that case.
+ *
+ * Sorting by CCC is done by repeatedly scanning the string.  The
+ * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
+ * the start of the scan.  The first pass finds the lowest CCC to be
+ * emitted and stores it in u8c->nccc, the second pass emits the
+ * characters with this CCC and finds the next lowest CCC. This limits
+ * the number of passes to 1 + the number of different CCCs in the
+ * sequence being scanned.
+ *
+ * Therefore:
+ *  u8c->p  != NULL -> a decomposition is being scanned.
+ *  u8c->ss != NULL -> this is a repeating scan.
+ *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
+ */
+int
+utf8byte(struct utf8cursor *u8c)
+{
+       utf8leaf_t *leaf;
+       int ccc;
+
+       for (;;) {
+               /* Check for the end of a decomposed character. */
+               if (u8c->p && *u8c->s == '\0') {
+                       u8c->s = u8c->p;
+                       u8c->p = NULL;
+               }
+
+               /* Check for end-of-string. */
+               if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
+                       /* There is no next byte. */
+                       if (u8c->ccc == STOPPER)
+                               return 0;
+                       /* End-of-string during a scan counts as a stopper. */
+                       ccc = STOPPER;
+                       goto ccc_mismatch;
+               } else if ((*u8c->s & 0xC0) == 0x80) {
+                       /* This is a continuation of the current character. */
+                       if (!u8c->p)
+                               u8c->len--;
+                       return (unsigned char)*u8c->s++;
+               }
+
+               /* Look up the data for the current character. */
+               if (u8c->p)
+                       leaf = utf8lookup(u8c->data, u8c->s);
+               else
+                       leaf = utf8nlookup(u8c->data, u8c->s, u8c->len);
+
+               /* No leaf found implies that the input is a binary blob. */
+               if (!leaf)
+                       return -1;
+
+               /* Characters that are too new have CCC 0. */
+               if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
+                       ccc = STOPPER;
+               } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
+                       u8c->len -= utf8clen(u8c->s);
+                       u8c->p = u8c->s + utf8clen(u8c->s);
+                       u8c->s = LEAF_STR(leaf);
+                       /* Empty decomposition implies CCC 0. */
+                       if (*u8c->s == '\0') {
+                               if (u8c->ccc == STOPPER)
+                                       continue;
+                               ccc = STOPPER;
+                               goto ccc_mismatch;
+                       }
+                       leaf = utf8lookup(u8c->data, u8c->s);
+                       ccc = LEAF_CCC(leaf);
+               }
+
+               /*
+                * If this is not a stopper, then see if it updates
+                * the next canonical class to be emitted.
+                */
+               if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
+                       u8c->nccc = ccc;
+
+               /*
+                * Return the current byte if this is the current
+                * combining class.
+                */
+               if (ccc == u8c->ccc) {
+                       if (!u8c->p)
+                               u8c->len--;
+                       return (unsigned char)*u8c->s++;
+               }
+
+               /* Current combining class mismatch. */
+       ccc_mismatch:
+               if (u8c->nccc == STOPPER) {
+                       /*
+                        * Scan forward for the first canonical class
+                        * to be emitted.  Save the position from
+                        * which to restart.
+                        */
+                       u8c->ccc = MINCCC - 1;
+                       u8c->nccc = ccc;
+                       u8c->sp = u8c->p;
+                       u8c->ss = u8c->s;
+                       u8c->slen = u8c->len;
+                       if (!u8c->p)
+                               u8c->len -= utf8clen(u8c->s);
+                       u8c->s += utf8clen(u8c->s);
+               } else if (ccc != STOPPER) {
+                       /* Not a stopper, and not the ccc we're emitting. */
+                       if (!u8c->p)
+                               u8c->len -= utf8clen(u8c->s);
+                       u8c->s += utf8clen(u8c->s);
+               } else if (u8c->nccc != MAXCCC + 1) {
+                       /* At a stopper, restart for next ccc. */
+                       u8c->ccc = u8c->nccc;
+                       u8c->nccc = MAXCCC + 1;
+                       u8c->s = u8c->ss;
+                       u8c->p = u8c->sp;
+                       u8c->len = u8c->slen;
+               } else {
+                       /* All done, proceed from here. */
+                       u8c->ccc = STOPPER;
+                       u8c->nccc = STOPPER;
+                       u8c->sp = NULL;
+                       u8c->ss = NULL;
+                       u8c->slen = 0;
+               }
+       }
+}
+
+const struct utf8data *
+utf8nfkdi(unsigned int maxage)
+{
+       int i = sizeof(utf8nfkdidata)/sizeof(utf8nfkdidata[0]) - 1;
+
+       while (maxage < utf8nfkdidata[i].maxage)
+               i--;
+       if (maxage > utf8nfkdidata[i].maxage)
+               return NULL;
+       return &utf8nfkdidata[i];
+}
+
+const struct utf8data *
+utf8nfkdicf(unsigned int maxage)
+{
+       int i = sizeof(utf8nfkdicfdata)/sizeof(utf8nfkdicfdata[0]) - 1;
+
+       while (maxage < utf8nfkdicfdata[i].maxage)
+               i--;
+       if (maxage > utf8nfkdicfdata[i].maxage)
+               return NULL;
+       return &utf8nfkdicfdata[i];
+}
diff --git a/support/mkutf8data.c b/support/mkutf8data.c
new file mode 100644
index 0000000..e5c3507
--- /dev/null
+++ b/support/mkutf8data.c
@@ -0,0 +1,3232 @@
+/*
+ * Copyright (c) 2014 SGI.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/* Generator for a compact trie for unicode normalization */
+
+#include <sys/types.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+
+/* Default names of the in- and output files. */
+
+#define AGE_NAME       "DerivedAge.txt"
+#define CCC_NAME       "DerivedCombiningClass.txt"
+#define PROP_NAME      "DerivedCoreProperties.txt"
+#define DATA_NAME      "UnicodeData.txt"
+#define FOLD_NAME      "CaseFolding.txt"
+#define NORM_NAME      "NormalizationCorrections.txt"
+#define TEST_NAME      "NormalizationTest.txt"
+#define UTF8_NAME      "utf8data.h"
+
+const char     *age_name  = AGE_NAME;
+const char     *ccc_name  = CCC_NAME;
+const char     *prop_name = PROP_NAME;
+const char     *data_name = DATA_NAME;
+const char     *fold_name = FOLD_NAME;
+const char     *norm_name = NORM_NAME;
+const char     *test_name = TEST_NAME;
+const char     *utf8_name = UTF8_NAME;
+
+int verbose = 0;
+
+/* An arbitrary line size limit on input lines. */
+
+#define LINESIZE       1024
+char line[LINESIZE];
+char buf0[LINESIZE];
+char buf1[LINESIZE];
+char buf2[LINESIZE];
+char buf3[LINESIZE];
+
+const char *argv0;
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Unicode version numbers consist of three parts: major, minor, and a
+ * revision.  These numbers are packed into an unsigned int to obtain
+ * a single version number.
+ *
+ * To save space in the generated trie, the unicode version is not
+ * stored directly, instead we calculate a generation number from the
+ * unicode versions seen in the DerivedAge file, and use that as an
+ * index into a table of unicode versions.
+ */
+#define UNICODE_MAJ_SHIFT              (16)
+#define UNICODE_MIN_SHIFT              (8)
+
+#define UNICODE_MAJ_MAX                        ((unsigned short)-1)
+#define UNICODE_MIN_MAX                        ((unsigned char)-1)
+#define UNICODE_REV_MAX                        ((unsigned char)-1)
+
+#define UNICODE_AGE(MAJ,MIN,REV)                       \
+       (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |   \
+        ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |   \
+        ((unsigned int)(REV)))
+
+unsigned int *ages;
+int ages_count;
+
+unsigned int unicode_maxage;
+
+static int
+age_valid(unsigned int major, unsigned int minor, unsigned int revision)
+{
+       if (major > UNICODE_MAJ_MAX)
+               return 0;
+       if (minor > UNICODE_MIN_MAX)
+               return 0;
+       if (revision > UNICODE_REV_MAX)
+               return 0;
+       return 1;
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * utf8trie_t
+ *
+ * A compact binary tree, used to decode UTF-8 characters.
+ *
+ * Internal nodes are one byte for the node itself, and up to three
+ * bytes for an offset into the tree.  The first byte contains the
+ * following information:
+ *  NEXTBYTE  - flag        - advance to next byte if set
+ *  BITNUM    - 3 bit field - the bit number to tested
+ *  OFFLEN    - 2 bit field - number of bytes in the offset
+ * if offlen == 0 (non-branching node)
+ *  RIGHTPATH - 1 bit field - set if the following node is for the
+ *                            right-hand path (tested bit is set)
+ *  TRIENODE  - 1 bit field - set if the following node is an internal
+ *                            node, otherwise it is a leaf node
+ * if offlen != 0 (branching node)
+ *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
+ *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
+ *
+ * Due to the way utf8 works, there cannot be branching nodes with
+ * NEXTBYTE set, and moreover those nodes always have a righthand
+ * descendant.
+ */
+typedef unsigned char utf8trie_t;
+#define BITNUM         0x07
+#define NEXTBYTE       0x08
+#define OFFLEN         0x30
+#define OFFLEN_SHIFT   4
+#define RIGHTPATH      0x40
+#define TRIENODE       0x80
+#define RIGHTNODE      0x40
+#define LEFTNODE       0x80
+
+/*
+ * utf8leaf_t
+ *
+ * The leaves of the trie are embedded in the trie, and so the same
+ * underlying datatype, unsigned char.
+ *
+ * leaf[0]: The unicode version, stored as a generation number that is
+ *          an index into utf8agetab[].  With this we can filter code
+ *          points based on the unicode version in which they were
+ *          defined.  The CCC of a non-defined code point is 0.
+ * leaf[1]: Canonical Combining Class. During normalization, we need
+ *          to do a stable sort into ascending order of all characters
+ *          with a non-zero CCC that occur between two characters with
+ *          a CCC of 0, or at the begin or end of a string.
+ *          The unicode standard guarantees that all CCC values are
+ *          between 0 and 254 inclusive, which leaves 255 available as
+ *          a special value.
+ *          Code points with CCC 0 are known as stoppers.
+ * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
+ *          start of a NUL-terminated string that is the decomposition
+ *          of the character.
+ *          The CCC of a decomposable character is the same as the CCC
+ *          of the first character of its decomposition.
+ *          Some characters decompose as the empty string: these are
+ *          characters with the Default_Ignorable_Code_Point property.
+ *          These do affect normalization, as they all have CCC 0.
+ *
+ * The decompositions in the trie have been fully expanded.
+ *
+ * Casefolding, if applicable, is also done using decompositions.
+ */
+typedef unsigned char utf8leaf_t;
+
+#define LEAF_GEN(LEAF) ((LEAF)[0])
+#define LEAF_CCC(LEAF) ((LEAF)[1])
+#define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
+
+#define MAXGEN         (255)
+
+#define MINCCC         (0)
+#define MAXCCC         (254)
+#define STOPPER                (0)
+#define        DECOMPOSE       (255)
+
+struct tree;
+static utf8leaf_t *utf8nlookup(struct tree *, const char *, size_t);
+static utf8leaf_t *utf8lookup(struct tree *, const char *);
+
+unsigned char *utf8data;
+size_t utf8data_size;
+
+utf8trie_t *nfkdi;
+utf8trie_t *nfkdicf;
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * UTF8 valid ranges.
+ *
+ * The UTF-8 encoding spreads the bits of a 32bit word over several
+ * bytes. This table gives the ranges that can be held and how they'd
+ * be represented.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * There is an additional requirement on UTF-8, in that only the
+ * shortest representation of a 32bit value is to be used.  A decoder
+ * must not decode sequences that do not satisfy this requirement.
+ * Thus the allowed ranges have a lower bound.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
+ * 17 planes of 65536 values.  This limits the sequences actually seen
+ * even more, to just the following.
+ *
+ *          0 -     0x7f: 0                     0x7f
+ *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
+ *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
+ *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
+ *
+ * Even within those ranges not all values are allowed: the surrogates
+ * 0xd800 - 0xdfff should never be seen.
+ *
+ * Note that the longest sequence seen with valid usage is 4 bytes,
+ * the same a single UTF-32 character.  This makes the UTF-8
+ * representation of Unicode strictly smaller than UTF-32.
+ *
+ * The shortest sequence requirement was introduced by:
+ *    Corrigendum #1: UTF-8 Shortest Form
+ * It can be found here:
+ *    http://www.unicode.org/versions/corrigendum1.html
+ *
+ */
+
+#define UTF8_2_BITS     0xC0
+#define UTF8_3_BITS     0xE0
+#define UTF8_4_BITS     0xF0
+#define UTF8_N_BITS     0x80
+#define UTF8_2_MASK     0xE0
+#define UTF8_3_MASK     0xF0
+#define UTF8_4_MASK     0xF8
+#define UTF8_N_MASK     0xC0
+#define UTF8_V_MASK     0x3F
+#define UTF8_V_SHIFT    6
+
+static int
+utf8key(unsigned int key, char keyval[])
+{
+       int keylen;
+
+       if (key < 0x80) {
+               keyval[0] = key;
+               keylen = 1;
+       } else if (key < 0x800) {
+               keyval[1] = key & UTF8_V_MASK;
+               keyval[1] |= UTF8_N_BITS;
+               key >>= UTF8_V_SHIFT;
+               keyval[0] = key;
+               keyval[0] |= UTF8_2_BITS;
+               keylen = 2;
+       } else if (key < 0x10000) {
+               keyval[2] = key & UTF8_V_MASK;
+               keyval[2] |= UTF8_N_BITS;
+               key >>= UTF8_V_SHIFT;
+               keyval[1] = key & UTF8_V_MASK;
+               keyval[1] |= UTF8_N_BITS;
+               key >>= UTF8_V_SHIFT;
+               keyval[0] = key;
+               keyval[0] |= UTF8_3_BITS;
+               keylen = 3;
+       } else if (key < 0x110000) {
+               keyval[3] = key & UTF8_V_MASK;
+               keyval[3] |= UTF8_N_BITS;
+               key >>= UTF8_V_SHIFT;
+               keyval[2] = key & UTF8_V_MASK;
+               keyval[2] |= UTF8_N_BITS;
+               key >>= UTF8_V_SHIFT;
+               keyval[1] = key & UTF8_V_MASK;
+               keyval[1] |= UTF8_N_BITS;
+               key >>= UTF8_V_SHIFT;
+               keyval[0] = key;
+               keyval[0] |= UTF8_4_BITS;
+               keylen = 4;
+       } else {
+               printf("%#x: illegal key\n", key);
+               keylen = 0;
+       }
+       return keylen;
+}
+
+static unsigned int
+utf8code(const char *str)
+{
+       const unsigned char *s = (const unsigned char*)str;
+       unsigned int unichar = 0;
+
+       if (*s < 0x80) {
+               unichar = *s;
+       } else if (*s < UTF8_3_BITS) {
+               unichar = *s++ & 0x1F;
+               unichar <<= UTF8_V_SHIFT;
+               unichar |= *s & 0x3F;
+       } else if (*s < UTF8_4_BITS) {
+               unichar = *s++ & 0x0F;
+               unichar <<= UTF8_V_SHIFT;
+               unichar |= *s++ & 0x3F;
+               unichar <<= UTF8_V_SHIFT;
+               unichar |= *s & 0x3F;
+       } else {
+               unichar = *s++ & 0x0F;
+               unichar <<= UTF8_V_SHIFT;
+               unichar |= *s++ & 0x3F;
+               unichar <<= UTF8_V_SHIFT;
+               unichar |= *s++ & 0x3F;
+               unichar <<= UTF8_V_SHIFT;
+               unichar |= *s & 0x3F;
+       }
+       return unichar;
+}
+
+static int
+utf32valid(unsigned int unichar)
+{
+       return unichar < 0x110000;
+}
+
+#define NODE 1
+#define LEAF 0
+
+struct tree {
+       void *root;
+       int childnode;
+       const char *type;
+       unsigned int maxage;
+       struct tree *next;
+       int (*leaf_equal)(void *, void *);
+       void (*leaf_print)(void *, int);
+       int (*leaf_mark)(void *);
+       int (*leaf_size)(void *);
+       int *(*leaf_index)(struct tree *, void *);
+       unsigned char *(*leaf_emit)(void *, unsigned char *);
+       int leafindex[0x110000];
+       int index;
+};
+
+struct node {
+       int index;
+       int offset;
+       int mark;
+       int size;
+       struct node *parent;
+       void *left;
+       void *right;
+       unsigned char bitnum;
+       unsigned char nextbyte;
+       unsigned char leftnode;
+       unsigned char rightnode;
+       unsigned int keybits;
+       unsigned int keymask;
+};
+
+/*
+ * Example lookup function for a tree.
+ */
+static void *
+lookup(struct tree *tree, const char *key)
+{
+       struct node *node;
+       void *leaf = NULL;
+
+       node = tree->root;
+       while (!leaf && node) {
+               if (node->nextbyte)
+                       key++;
+               if (*key & (1 << (node->bitnum & 7))) {
+                       /* Right leg */
+                       if (node->rightnode == NODE) {
+                               node = node->right;
+                       } else if (node->rightnode == LEAF) {
+                               leaf = node->right;
+                       } else {
+                               node = NULL;
+                       }
+               } else {
+                       /* Left leg */
+                       if (node->leftnode == NODE) {
+                               node = node->left;
+                       } else if (node->leftnode == LEAF) {
+                               leaf = node->left;
+                       } else {
+                               node = NULL;
+                       }
+               }
+       }
+
+       return leaf;
+}
+
+/*
+ * A simple non-recursive tree walker: keep track of visits to the
+ * left and right branches in the leftmask and rightmask.
+ */
+static void
+tree_walk(struct tree *tree)
+{
+       struct node *node;
+       unsigned int leftmask;
+       unsigned int rightmask;
+       unsigned int bitmask;
+       int indent = 1;
+       int nodes, singletons, leaves;
+
+       nodes = singletons = leaves = 0;
+
+       printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
+       if (tree->childnode == LEAF) {
+               assert(tree->root);
+               tree->leaf_print(tree->root, indent);
+               leaves = 1;
+       } else {
+               assert(tree->childnode == NODE);
+               node = tree->root;
+               leftmask = rightmask = 0;
+               while (node) {
+                       printf("%*snode @ %p bitnum %d nextbyte %d"
+                              " left %p right %p mask %x bits %x\n",
+                               indent, "", node,
+                               node->bitnum, node->nextbyte,
+                               node->left, node->right,
+                               node->keymask, node->keybits);
+                       nodes += 1;
+                       if (!(node->left && node->right))
+                               singletons += 1;
+
+                       while (node) {
+                               bitmask = 1 << node->bitnum;
+                               if ((leftmask & bitmask) == 0) {
+                                       leftmask |= bitmask;
+                                       if (node->leftnode == LEAF) {
+                                               assert(node->left);
+                                               tree->leaf_print(node->left,
+                                                                indent+1);
+                                               leaves += 1;
+                                       } else if (node->left) {
+                                               assert(node->leftnode == NODE);
+                                               indent += 1;
+                                               node = node->left;
+                                               break;
+                                       }
+                               }
+                               if ((rightmask & bitmask) == 0) {
+                                       rightmask |= bitmask;
+                                       if (node->rightnode == LEAF) {
+                                               assert(node->right);
+                                               tree->leaf_print(node->right,
+                                                                indent+1);
+                                               leaves += 1;
+                                       } else if (node->right) {
+                                               assert(node->rightnode==NODE);
+                                               indent += 1;
+                                               node = node->right;
+                                               break;
+                                       }
+                               }
+                               leftmask &= ~bitmask;
+                               rightmask &= ~bitmask;
+                               node = node->parent;
+                               indent -= 1;
+                       }
+               }
+       }
+       printf("nodes %d leaves %d singletons %d\n",
+              nodes, leaves, singletons);
+}
+
+/*
+ * Allocate an initialize a new internal node.
+ */
+static struct node *
+alloc_node(struct node *parent)
+{
+       struct node *node;
+       int bitnum;
+
+       node = malloc(sizeof(*node));
+       node->left = node->right = NULL;
+       node->parent = parent;
+       node->leftnode = NODE;
+       node->rightnode = NODE;
+       node->keybits = 0;
+       node->keymask = 0;
+       node->mark = 0;
+       node->index = 0;
+       node->offset = -1;
+       node->size = 4;
+
+       if (node->parent) {
+               bitnum = parent->bitnum;
+               if ((bitnum & 7) == 0) {
+                       node->bitnum = bitnum + 7 + 8;
+                       node->nextbyte = 1;
+               } else {
+                       node->bitnum = bitnum - 1;
+                       node->nextbyte = 0;
+               }
+       } else {
+               node->bitnum = 7;
+               node->nextbyte = 0;
+       }
+
+        return node;
+}
+
+/*
+ * Insert a new leaf into the tree, and collapse any subtrees that are
+ * fully populated and end in identical leaves. A nextbyte tagged
+ * internal node will not be removed to preserve the tree's integrity.
+ * Note that due to the structure of utf8, no nextbyte tagged node
+ * will be a candidate for removal.
+ */
+static int
+insert(struct tree *tree, char *key, int keylen, void *leaf)
+{
+       struct node *node;
+       struct node *parent;
+       void **cursor;
+       int keybits;
+
+       assert(keylen >= 1 && keylen <= 4);
+
+       node = NULL;
+       cursor = &tree->root;
+       keybits = 8 * keylen;
+
+       /* Insert, creating path along the way. */
+       while (keybits) {
+               if (!*cursor)
+                       *cursor = alloc_node(node);
+               node = *cursor;
+               if (node->nextbyte)
+                       key++;
+               if (*key & (1 << (node->bitnum & 7)))
+                       cursor = &node->right;
+               else
+                       cursor = &node->left;
+               keybits--;
+       }
+       *cursor = leaf;
+
+       /* Merge subtrees if possible. */
+       while (node) {
+               if (*key & (1 << (node->bitnum & 7)))
+                       node->rightnode = LEAF;
+               else
+                       node->leftnode = LEAF;
+               if (node->nextbyte)
+                       break;
+               if (node->leftnode == NODE || node->rightnode == NODE)
+                       break;
+               assert(node->left);
+               assert(node->right);
+               /* Compare */
+               if (! tree->leaf_equal(node->left, node->right))
+                       break;
+               /* Keep left, drop right leaf. */
+               leaf = node->left;
+               /* Check in parent */
+               parent = node->parent;
+               if (!parent) {
+                       /* root of tree! */
+                       tree->root = leaf;
+                       tree->childnode = LEAF;
+               } else if (parent->left == node) {
+                       parent->left = leaf;
+                       parent->leftnode = LEAF;
+                       if (parent->right) {
+                               parent->keymask = 0;
+                               parent->keybits = 0;
+                       } else {
+                               parent->keymask |= (1 << node->bitnum);
+                       }
+               } else if (parent->right == node) {
+                       parent->right = leaf;
+                       parent->rightnode = LEAF;
+                       if (parent->left) {
+                               parent->keymask = 0;
+                               parent->keybits = 0;
+                       } else {
+                               parent->keymask |= (1 << node->bitnum);
+                               parent->keybits |= (1 << node->bitnum);
+                       }
+               } else {
+                       /* internal tree error */
+                       assert(0);
+               }
+               free(node);
+               node = parent;
+       }
+
+       /* Propagate keymasks up along singleton chains. */
+       while (node) {
+               parent = node->parent;
+               if (!parent)
+                       break;
+               /* Nix the mask for parents with two children. */
+               if (node->keymask == 0) {
+                       parent->keymask = 0;
+                       parent->keybits = 0;
+               } else if (parent->left && parent->right) {
+                       parent->keymask = 0;
+                       parent->keybits = 0;
+               } else {
+                       assert((parent->keymask & node->keymask) == 0);
+                       parent->keymask |= node->keymask;
+                       parent->keymask |= (1 << parent->bitnum);
+                       parent->keybits |= node->keybits;
+                       if (parent->right)
+                               parent->keybits |= (1 << parent->bitnum);
+               }
+               node = parent;
+       }
+
+       return 0;
+}
+
+/*
+ * Prune internal nodes.
+ *
+ * Fully populated subtrees that end at the same leaf have already
+ * been collapsed.  There are still internal nodes that have for both
+ * their left and right branches a sequence of singletons that make
+ * identical choices and end in identical leaves.  The keymask and
+ * keybits collected in the nodes describe the choices made in these
+ * singleton chains.  When they are identical for the left and right
+ * branch of a node, and the two leaves comare identical, the node in
+ * question can be removed.
+ *
+ * Note that nodes with the nextbyte tag set will not be removed by
+ * this to ensure tree integrity.  Note as well that the structure of
+ * utf8 ensures that these nodes would not have been candidates for
+ * removal in any case.
+ */
+static void
+prune(struct tree *tree)
+{
+       struct node *node;
+       struct node *left;
+       struct node *right;
+       struct node *parent;
+       void *leftleaf;
+       void *rightleaf;
+       unsigned int leftmask;
+       unsigned int rightmask;
+       unsigned int bitmask;
+       int count;
+
+       if (verbose > 0)
+               printf("Pruning %s_%x\n", tree->type, tree->maxage);
+
+       count = 0;
+       if (tree->childnode == LEAF)
+               return;
+       if (!tree->root)
+               return;
+
+       leftmask = rightmask = 0;
+       node = tree->root;
+       while (node) {
+               if (node->nextbyte)
+                       goto advance;
+               if (node->leftnode == LEAF)
+                       goto advance;
+               if (node->rightnode == LEAF)
+                       goto advance;
+               if (!node->left)
+                       goto advance;
+               if (!node->right)
+                       goto advance;
+               left = node->left;
+               right = node->right;
+               if (left->keymask == 0)
+                       goto advance;
+               if (right->keymask == 0)
+                       goto advance;
+               if (left->keymask != right->keymask)
+                       goto advance;
+               if (left->keybits != right->keybits)
+                       goto advance;
+               leftleaf = NULL;
+               while (!leftleaf) {
+                       assert(left->left || left->right);
+                       if (left->leftnode == LEAF)
+                               leftleaf = left->left;
+                       else if (left->rightnode == LEAF)
+                               leftleaf = left->right;
+                       else if (left->left)
+                               left = left->left;
+                       else if (left->right)
+                               left = left->right;
+                       else
+                               assert(0);
+               }
+               rightleaf = NULL;
+               while (!rightleaf) {
+                       assert(right->left || right->right);
+                       if (right->leftnode == LEAF)
+                               rightleaf = right->left;
+                       else if (right->rightnode == LEAF)
+                               rightleaf = right->right;
+                       else if (right->left)
+                               right = right->left;
+                       else if (right->right)
+                               right = right->right;
+                       else
+                               assert(0);
+               }
+               if (! tree->leaf_equal(leftleaf, rightleaf))
+                       goto advance;
+               /*
+                * This node has identical singleton-only subtrees.
+                * Remove it.
+                */
+               parent = node->parent;
+               left = node->left;
+               right = node->right;
+               if (parent->left == node)
+                       parent->left = left;
+               else if (parent->right == node)
+                       parent->right = left;
+               else
+                       assert(0);
+               left->parent = parent;
+               left->keymask |= (1 << node->bitnum);
+               node->left = NULL;
+               while (node) {
+                       bitmask = 1 << node->bitnum;
+                       leftmask &= ~bitmask;
+                       rightmask &= ~bitmask;
+                       if (node->leftnode == NODE && node->left) {
+                               left = node->left;
+                               free(node);
+                               count++;
+                               node = left;
+                       } else if (node->rightnode == NODE && node->right) {
+                               right = node->right;
+                               free(node);
+                               count++;
+                               node = right;
+                       } else {
+                               node = NULL;
+                       }
+               }
+               /* Propagate keymasks up along singleton chains. */
+               node = parent;
+               /* Force re-check */
+               bitmask = 1 << node->bitnum;
+               leftmask &= ~bitmask;
+               rightmask &= ~bitmask;
+               for (;;) {
+                       if (node->left && node->right)
+                               break;
+                       if (node->left) {
+                               left = node->left;
+                               node->keymask |= left->keymask;
+                               node->keybits |= left->keybits;
+                       }
+                       if (node->right) {
+                               right = node->right;
+                               node->keymask |= right->keymask;
+                               node->keybits |= right->keybits;
+                       }
+                       node->keymask |= (1 << node->bitnum);
+                       node = node->parent;
+                       /* Force re-check */
+                       bitmask = 1 << node->bitnum;
+                       leftmask &= ~bitmask;
+                       rightmask &= ~bitmask;
+               }
+       advance:
+               bitmask = 1 << node->bitnum;
+               if ((leftmask & bitmask) == 0 &&
+                   node->leftnode == NODE &&
+                   node->left) {
+                       leftmask |= bitmask;
+                       node = node->left;
+               } else if ((rightmask & bitmask) == 0 &&
+                          node->rightnode == NODE &&
+                          node->right) {
+                       rightmask |= bitmask;
+                       node = node->right;
+               } else {
+                       leftmask &= ~bitmask;
+                       rightmask &= ~bitmask;
+                       node = node->parent;
+               }
+       }
+       if (verbose > 0)
+               printf("Pruned %d nodes\n", count);
+}
+
+/*
+ * Mark the nodes in the tree that lead to leaves that must be
+ * emitted.
+ */
+static void
+mark_nodes(struct tree *tree)
+{
+       struct node *node;
+       struct node *n;
+       unsigned int leftmask;
+       unsigned int rightmask;
+       unsigned int bitmask;
+       int marked;
+
+       marked = 0;
+       if (verbose > 0)
+               printf("Marking %s_%x\n", tree->type, tree->maxage);
+       if (tree->childnode == LEAF)
+               goto done;
+
+       assert(tree->childnode == NODE);
+       node = tree->root;
+       leftmask = rightmask = 0;
+       while (node) {
+               bitmask = 1 << node->bitnum;
+               if ((leftmask & bitmask) == 0) {
+                       leftmask |= bitmask;
+                       if (node->leftnode == LEAF) {
+                               assert(node->left);
+                               if (tree->leaf_mark(node->left)) {
+                                       n = node;
+                                       while (n && !n->mark) {
+                                               marked++;
+                                               n->mark = 1;
+                                               n = n->parent;
+                                       }
+                               }
+                       } else if (node->left) {
+                               assert(node->leftnode == NODE);
+                               node = node->left;
+                               continue;
+                       }
+               }
+               if ((rightmask & bitmask) == 0) {
+                       rightmask |= bitmask;
+                       if (node->rightnode == LEAF) {
+                               assert(node->right);
+                               if (tree->leaf_mark(node->right)) {
+                                       n = node;
+                                       while (n && !n->mark) {
+                                               marked++;
+                                               n->mark = 1;
+                                               n = n->parent;
+                                       }
+                               }
+                       } else if (node->right) {
+                               assert(node->rightnode==NODE);
+                               node = node->right;
+                               continue;
+                       }
+               }
+               leftmask &= ~bitmask;
+               rightmask &= ~bitmask;
+               node = node->parent;
+       }
+
+       /* second pass: left siblings and singletons */
+
+       assert(tree->childnode == NODE);
+       node = tree->root;
+       leftmask = rightmask = 0;
+       while (node) {
+               bitmask = 1 << node->bitnum;
+               if ((leftmask & bitmask) == 0) {
+                       leftmask |= bitmask;
+                       if (node->leftnode == LEAF) {
+                               assert(node->left);
+                               if (tree->leaf_mark(node->left)) {
+                                       n = node;
+                                       while (n && !n->mark) {
+                                               marked++;
+                                               n->mark = 1;
+                                               n = n->parent;
+                                       }
+                               }
+                       } else if (node->left) {
+                               assert(node->leftnode == NODE);
+                               node = node->left;
+                               if (!node->mark && node->parent->mark) {
+                                       marked++;
+                                       node->mark = 1;
+                               }
+                               continue;
+                       }
+               }
+               if ((rightmask & bitmask) == 0) {
+                       rightmask |= bitmask;
+                       if (node->rightnode == LEAF) {
+                               assert(node->right);
+                               if (tree->leaf_mark(node->right)) {
+                                       n = node;
+                                       while (n && !n->mark) {
+                                               marked++;
+                                               n->mark = 1;
+                                               n = n->parent;
+                                       }
+                               }
+                       } else if (node->right) {
+                               assert(node->rightnode==NODE);
+                               node = node->right;
+                               if (!node->mark && node->parent->mark &&
+                                   !node->parent->left) {
+                                       marked++;
+                                       node->mark = 1;
+                               }
+                               continue;
+                       }
+               }
+               leftmask &= ~bitmask;
+               rightmask &= ~bitmask;
+               node = node->parent;
+       }
+done:
+       if (verbose > 0)
+               printf("Marked %d nodes\n", marked);
+}
+
+/*
+ * Compute the index of each node and leaf, which is the offset in the
+ * emitted trie.  These value must be pre-computed because relative
+ * offsets between nodes are used to navigate the tree.
+ */
+static int
+index_nodes(struct tree *tree, int index)
+{
+       struct node *node;
+       unsigned int leftmask;
+       unsigned int rightmask;
+       unsigned int bitmask;
+       int count;
+       int indent;
+
+       /* Align to a cache line (or half a cache line?). */
+       while (index % 64)
+               index++;
+       tree->index = index;
+       indent = 1;
+       count = 0;
+
+       if (verbose > 0)
+               printf("Indexing %s_%x: %d", tree->type, tree->maxage, index);
+       if (tree->childnode == LEAF) {
+               index += tree->leaf_size(tree->root);
+               goto done;
+       }
+
+       assert(tree->childnode == NODE);
+       node = tree->root;
+       leftmask = rightmask = 0;
+       while (node) {
+               if (!node->mark)
+                       goto skip;
+               count++;
+               if (node->index != index)
+                       node->index = index;
+               index += node->size;
+skip:
+               while (node) {
+                       bitmask = 1 << node->bitnum;
+                       if (node->mark && (leftmask & bitmask) == 0) {
+                               leftmask |= bitmask;
+                               if (node->leftnode == LEAF) {
+                                       assert(node->left);
+                                       *tree->leaf_index(tree, node->left) =
+                                                                       index;
+                                       index += tree->leaf_size(node->left);
+                                       count++;
+                               } else if (node->left) {
+                                       assert(node->leftnode == NODE);
+                                       indent += 1;
+                                       node = node->left;
+                                       break;
+                               }
+                       }
+                       if (node->mark && (rightmask & bitmask) == 0) {
+                               rightmask |= bitmask;
+                               if (node->rightnode == LEAF) {
+                                       assert(node->right);
+                                       *tree->leaf_index(tree, node->right) = 
index;
+                                       index += tree->leaf_size(node->right);
+                                       count++;
+                               } else if (node->right) {
+                                       assert(node->rightnode==NODE);
+                                       indent += 1;
+                                       node = node->right;
+                                       break;
+                               }
+                       }
+                       leftmask &= ~bitmask;
+                       rightmask &= ~bitmask;
+                       node = node->parent;
+                       indent -= 1;
+               }
+       }
+done:
+       /* Round up to a multiple of 16 */
+       while (index % 16)
+               index++;
+       if (verbose > 0)
+               printf("Final index %d\n", index);
+       return index;
+}
+
+/*
+ * Compute the size of nodes and leaves. We start by assuming that
+ * each node needs to store a three-byte offset. The indexes of the
+ * nodes are calculated based on that, and then this function is
+ * called to see if the sizes of some nodes can be reduced.  This is
+ * repeated until no more changes are seen.
+ */
+static int
+size_nodes(struct tree *tree)
+{
+       struct tree *next;
+       struct node *node;
+       struct node *right;
+       struct node *n;
+       unsigned int leftmask;
+       unsigned int rightmask;
+       unsigned int bitmask;
+       unsigned int pathbits;
+       unsigned int pathmask;
+       int changed;
+       int offset;
+       int size;
+       int indent;
+
+       indent = 1;
+       changed = 0;
+       size = 0;
+
+       if (verbose > 0)
+               printf("Sizing %s_%x", tree->type, tree->maxage);
+       if (tree->childnode == LEAF)
+               goto done;
+
+       assert(tree->childnode == NODE);
+       pathbits = 0;
+       pathmask = 0;
+       node = tree->root;
+       leftmask = rightmask = 0;
+       while (node) {
+               if (!node->mark)
+                       goto skip;
+               offset = 0;
+               if (!node->left || !node->right) {
+                       size = 1;
+               } else {
+                       if (node->rightnode == NODE) {
+                               right = node->right;
+                               next = tree->next;
+                               while (!right->mark) {
+                                       assert(next);
+                                       n = next->root;
+                                       while (n->bitnum != node->bitnum) {
+                                               if (pathbits & (1<<n->bitnum))
+                                                       n = n->right;
+                                               else
+                                                       n = n->left;
+                                       }
+                                       n = n->right;
+                                       assert(right->bitnum == n->bitnum);
+                                       right = n;
+                                       next = next->next;
+                               }
+                               offset = right->index - node->index;
+                       } else {
+                               offset = *tree->leaf_index(tree, node->right);
+                               offset -= node->index;
+                       }
+                       assert(offset >= 0);
+                       assert(offset <= 0xffffff);
+                       if (offset <= 0xff) {
+                               size = 2;
+                       } else if (offset <= 0xffff) {
+                               size = 3;
+                       } else { /* offset <= 0xffffff */
+                               size = 4;
+                       }
+               }
+               if (node->size != size || node->offset != offset) {
+                       node->size = size;
+                       node->offset = offset;
+                       changed++;
+               }
+skip:
+               while (node) {
+                       bitmask = 1 << node->bitnum;
+                       pathmask |= bitmask;
+                       if (node->mark && (leftmask & bitmask) == 0) {
+                               leftmask |= bitmask;
+                               if (node->leftnode == LEAF) {
+                                       assert(node->left);
+                               } else if (node->left) {
+                                       assert(node->leftnode == NODE);
+                                       indent += 1;
+                                       node = node->left;
+                                       break;
+                               }
+                       }
+                       if (node->mark && (rightmask & bitmask) == 0) {
+                               rightmask |= bitmask;
+                               pathbits |= bitmask;
+                               if (node->rightnode == LEAF) {
+                                       assert(node->right);
+                               } else if (node->right) {
+                                       assert(node->rightnode==NODE);
+                                       indent += 1;
+                                       node = node->right;
+                                       break;
+                               }
+                       }
+                       leftmask &= ~bitmask;
+                       rightmask &= ~bitmask;
+                       pathmask &= ~bitmask;
+                       pathbits &= ~bitmask;
+                       node = node->parent;
+                       indent -= 1;
+               }
+       }
+done:
+       if (verbose > 0)
+               printf("Found %d changes\n", changed);
+       return changed;
+}
+
+/*
+ * Emit a trie for the given tree into the data array.
+ */
+static void
+emit(struct tree *tree, unsigned char *data)
+{
+       struct node *node;
+       unsigned int leftmask;
+       unsigned int rightmask;
+       unsigned int bitmask;
+       int offlen;
+       int offset;
+       int index;
+       int indent;
+       unsigned char byte;
+
+       index = tree->index;
+       data += index;
+       indent = 1;
+       if (verbose > 0)
+               printf("Emitting %s_%x\n", tree->type, tree->maxage);
+       if (tree->childnode == LEAF) {
+               assert(tree->root);
+               tree->leaf_emit(tree->root, data);
+               return;
+       }
+
+       assert(tree->childnode == NODE);
+       node = tree->root;
+       leftmask = rightmask = 0;
+       while (node) {
+               if (!node->mark)
+                       goto skip;
+               assert(node->offset != -1);
+               assert(node->index == index);
+
+               byte = 0;
+               if (node->nextbyte)
+                       byte |= NEXTBYTE;
+               byte |= (node->bitnum & BITNUM);
+               if (node->left && node->right) {
+                       if (node->leftnode == NODE)
+                               byte |= LEFTNODE;
+                       if (node->rightnode == NODE)
+                               byte |= RIGHTNODE;
+                       if (node->offset <= 0xff)
+                               offlen = 1;
+                       else if (node->offset <= 0xffff)
+                               offlen = 2;
+                       else
+                               offlen = 3;
+                       offset = node->offset;
+                       byte |= offlen << OFFLEN_SHIFT;
+                       *data++ = byte;
+                       index++;
+                       while (offlen--) {
+                               *data++ = offset & 0xff;
+                               index++;
+                               offset >>= 8;
+                       }
+               } else if (node->left) {
+                       if (node->leftnode == NODE)
+                               byte |= TRIENODE;
+                       *data++ = byte;
+                       index++;
+               } else if (node->right) {
+                       byte |= RIGHTNODE;
+                       if (node->rightnode == NODE)
+                               byte |= TRIENODE;
+                       *data++ = byte;
+                       index++;
+               } else {
+                       assert(0);
+               }
+skip:
+               while (node) {
+                       bitmask = 1 << node->bitnum;
+                       if (node->mark && (leftmask & bitmask) == 0) {
+                               leftmask |= bitmask;
+                               if (node->leftnode == LEAF) {
+                                       assert(node->left);
+                                       data = tree->leaf_emit(node->left,
+                                                              data);
+                                       index += tree->leaf_size(node->left);
+                               } else if (node->left) {
+                                       assert(node->leftnode == NODE);
+                                       indent += 1;
+                                       node = node->left;
+                                       break;
+                               }
+                       }
+                       if (node->mark && (rightmask & bitmask) == 0) {
+                               rightmask |= bitmask;
+                               if (node->rightnode == LEAF) {
+                                       assert(node->right);
+                                       data = tree->leaf_emit(node->right,
+                                                              data);
+                                       index += tree->leaf_size(node->right);
+                               } else if (node->right) {
+                                       assert(node->rightnode==NODE);
+                                       indent += 1;
+                                       node = node->right;
+                                       break;
+                               }
+                       }
+                       leftmask &= ~bitmask;
+                       rightmask &= ~bitmask;
+                       node = node->parent;
+                       indent -= 1;
+               }
+       }
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Unicode data.
+ *
+ * We need to keep track of the Canonical Combining Class, the Age,
+ * and decompositions for a code point.
+ *
+ * For the Age, we store the index into the ages table.  Effectively
+ * this is a generation number that the table maps to a unicode
+ * version.
+ *
+ * The correction field is used to indicate that this entry is in the
+ * corrections array, which contains decompositions that were
+ * corrected in later revisions.  The value of the correction field is
+ * the Unicode version in which the mapping was corrected.
+ */
+struct unicode_data {
+       unsigned int code;
+       int ccc;
+       int gen;
+       int correction;
+       unsigned int *utf32nfkdi;
+       unsigned int *utf32nfkdicf;
+       char *utf8nfkdi;
+       char *utf8nfkdicf;
+};
+
+struct unicode_data unicode_data[0x110000];
+struct unicode_data *corrections;
+int    corrections_count;
+
+struct tree *nfkdi_tree;
+struct tree *nfkdicf_tree;
+
+struct tree *trees;
+int          trees_count;
+
+/*
+ * Check the corrections array to see if this entry was corrected at
+ * some point.
+ */
+static struct unicode_data *
+corrections_lookup(struct unicode_data *u)
+{
+       int i;
+
+       for (i = 0; i != corrections_count; i++)
+               if (u->code == corrections[i].code)
+                       return &corrections[i];
+       return u;
+}
+
+static int
+nfkdi_equal(void *l, void *r)
+{
+       struct unicode_data *left = l;
+       struct unicode_data *right = r;
+
+       if (left->gen != right->gen)
+               return 0;
+       if (left->ccc != right->ccc)
+               return 0;
+       if (left->utf8nfkdi && right->utf8nfkdi &&
+           strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0)
+               return 1;
+       if (left->utf8nfkdi || right->utf8nfkdi)
+               return 0;
+       return 1;
+}
+
+static int
+nfkdicf_equal(void *l, void *r)
+{
+       struct unicode_data *left = l;
+       struct unicode_data *right = r;
+
+       if (left->gen != right->gen)
+               return 0;
+       if (left->ccc != right->ccc)
+               return 0;
+       if (left->utf8nfkdicf && right->utf8nfkdicf &&
+           strcmp(left->utf8nfkdicf, right->utf8nfkdicf) == 0)
+               return 1;
+       if (left->utf8nfkdicf && right->utf8nfkdicf)
+               return 0;
+       if (left->utf8nfkdicf || right->utf8nfkdicf)
+               return 0;
+       if (left->utf8nfkdi && right->utf8nfkdi &&
+           strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0)
+               return 1;
+       if (left->utf8nfkdi || right->utf8nfkdi)
+               return 0;
+       return 1;
+}
+
+static void
+nfkdi_print(void *l, int indent)
+{
+       struct unicode_data *leaf = l;
+
+       printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
+               leaf->code, leaf->ccc, leaf->gen);
+       if (leaf->utf8nfkdi)
+               printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
+       printf("\n");
+}
+
+static void
+nfkdicf_print(void *l, int indent)
+{
+       struct unicode_data *leaf = l;
+
+       printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
+               leaf->code, leaf->ccc, leaf->gen);
+       if (leaf->utf8nfkdicf)
+               printf(" nfkdicf \"%s\"", (const char*)leaf->utf8nfkdicf);
+       else if (leaf->utf8nfkdi)
+               printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
+       printf("\n");
+}
+
+static int
+nfkdi_mark(void *l)
+{
+       return 1;
+}
+
+static int
+nfkdicf_mark(void *l)
+{
+       struct unicode_data *leaf = l;
+       if (leaf->utf8nfkdicf)
+               return 1;
+       return 0;
+}
+
+static int
+correction_mark(void *l)
+{
+       struct unicode_data *leaf = l;
+       return leaf->correction;
+}
+
+static int
+nfkdi_size(void *l)
+{
+       struct unicode_data *leaf = l;
+       int size = 2;
+       if (leaf->utf8nfkdi)
+               size += strlen(leaf->utf8nfkdi) + 1;
+       return size;
+}
+
+static int
+nfkdicf_size(void *l)
+{
+       struct unicode_data *leaf = l;
+       int size = 2;
+       if (leaf->utf8nfkdicf)
+               size += strlen(leaf->utf8nfkdicf) + 1;
+       else if (leaf->utf8nfkdi)
+               size += strlen(leaf->utf8nfkdi) + 1;
+       return size;
+}
+
+static int *
+nfkdi_index(struct tree *tree, void *l)
+{
+       struct unicode_data *leaf = l;
+       return &tree->leafindex[leaf->code];
+}
+
+static int *
+nfkdicf_index(struct tree *tree, void *l)
+{
+       struct unicode_data *leaf = l;
+       return &tree->leafindex[leaf->code];
+}
+
+static unsigned char *
+nfkdi_emit(void *l, unsigned char *data)
+{
+       struct unicode_data *leaf = l;
+       unsigned char *s;
+
+       *data++ = leaf->gen;
+       if (leaf->utf8nfkdi) {
+               *data++ = DECOMPOSE;
+               s = (unsigned char*)leaf->utf8nfkdi;
+               while ((*data++ = *s++) != 0)
+                       ;
+       } else {
+               *data++ = leaf->ccc;
+       }
+       return data;
+}
+
+static unsigned char *
+nfkdicf_emit(void *l, unsigned char *data)
+{
+       struct unicode_data *leaf = l;
+       unsigned char *s;
+
+       *data++ = leaf->gen;
+       if (leaf->utf8nfkdicf) {
+               *data++ = DECOMPOSE;
+               s = (unsigned char*)leaf->utf8nfkdicf;
+               while ((*data++ = *s++) != 0)
+                       ;
+       } else if (leaf->utf8nfkdi) {
+               *data++ = DECOMPOSE;
+               s = (unsigned char*)leaf->utf8nfkdi;
+               while ((*data++ = *s++) != 0)
+                       ;
+       } else {
+               *data++ = leaf->ccc;
+       }
+       return data;
+}
+
+static void
+utf8_create(struct unicode_data *data)
+{
+       char utf[18*4+1];
+       char *u;
+       unsigned int *um;
+       int i;
+
+       u = utf;
+       um = data->utf32nfkdi;
+       if (um) {
+               for (i = 0; um[i]; i++)
+                       u += utf8key(um[i], u);
+               *u = '\0';
+               data->utf8nfkdi = strdup((char*)utf);
+       }
+       u = utf;
+       um = data->utf32nfkdicf;
+       if (um) {
+               for (i = 0; um[i]; i++)
+                       u += utf8key(um[i], u);
+               *u = '\0';
+               if (!data->utf8nfkdi || strcmp(data->utf8nfkdi, (char*)utf))
+                       data->utf8nfkdicf = strdup((char*)utf);
+       }
+}
+
+static void
+utf8_init(void)
+{
+       unsigned int unichar;
+       int i;
+
+       for (unichar = 0; unichar != 0x110000; unichar++)
+               utf8_create(&unicode_data[unichar]);
+
+       for (i = 0; i != corrections_count; i++)
+               utf8_create(&corrections[i]);
+}
+
+static void
+trees_init(void)
+{
+       struct unicode_data *data;
+       unsigned int maxage;
+       unsigned int nextage;
+       int count;
+       int i;
+       int j;
+
+       /* Count the number of different ages. */
+       count = 0;
+       nextage = (unsigned int)-1;
+       do {
+               maxage = nextage;
+               nextage = 0;
+               for (i = 0; i <= corrections_count; i++) {
+                       data = &corrections[i];
+                       if (nextage < data->correction &&
+                           data->correction < maxage)
+                               nextage = data->correction;
+               }
+               count++;
+       } while (nextage);
+
+       /* Two trees per age: nfkdi and nfkdicf */
+       trees_count = count * 2;
+       trees = calloc(trees_count, sizeof(struct tree));
+
+       /* Assign ages to the trees. */
+       count = trees_count;
+       nextage = (unsigned int)-1;
+       do {
+               maxage = nextage;
+               trees[--count].maxage = maxage;
+               trees[--count].maxage = maxage;
+               nextage = 0;
+               for (i = 0; i <= corrections_count; i++) {
+                       data = &corrections[i];
+                       if (nextage < data->correction &&
+                           data->correction < maxage)
+                               nextage = data->correction;
+               }
+       } while (nextage);
+
+       /* The ages assigned above are off by one. */
+       for (i = 0; i != trees_count; i++) {
+               j = 0;
+               while (ages[j] < trees[i].maxage)
+                       j++;
+               trees[i].maxage = ages[j-1];
+       }
+
+       /* Set up the forwarding between trees. */
+       trees[trees_count-2].next = &trees[trees_count-1];
+       trees[trees_count-1].leaf_mark = nfkdi_mark;
+       trees[trees_count-2].leaf_mark = nfkdicf_mark;
+       for (i = 0; i != trees_count-2; i += 2) {
+               trees[i].next = &trees[trees_count-2];
+               trees[i].leaf_mark = correction_mark;
+               trees[i+1].next = &trees[trees_count-1];
+               trees[i+1].leaf_mark = correction_mark;
+       }
+
+       /* Assign the callouts. */
+       for (i = 0; i != trees_count; i += 2) {
+               trees[i].type = "nfkdicf";
+               trees[i].leaf_equal = nfkdicf_equal;
+               trees[i].leaf_print = nfkdicf_print;
+               trees[i].leaf_size = nfkdicf_size;
+               trees[i].leaf_index = nfkdicf_index;
+               trees[i].leaf_emit = nfkdicf_emit;
+
+               trees[i+1].type = "nfkdi";
+               trees[i+1].leaf_equal = nfkdi_equal;
+               trees[i+1].leaf_print = nfkdi_print;
+               trees[i+1].leaf_size = nfkdi_size;
+               trees[i+1].leaf_index = nfkdi_index;
+               trees[i+1].leaf_emit = nfkdi_emit;
+       }
+
+       /* Finish init. */
+       for (i = 0; i != trees_count; i++)
+               trees[i].childnode = NODE;
+}
+
+static void
+trees_populate(void)
+{
+       struct unicode_data *data;
+       unsigned int unichar;
+       char keyval[4];
+       int keylen;
+       int i;
+
+       for (i = 0; i != trees_count; i++) {
+               if (verbose > 0) {
+                       printf("Populating %s_%x\n",
+                               trees[i].type, trees[i].maxage);
+               }
+               for (unichar = 0; unichar != 0x110000; unichar++) {
+                       if (unicode_data[unichar].gen < 0)
+                               continue;
+                       keylen = utf8key(unichar, keyval);
+                       data = corrections_lookup(&unicode_data[unichar]);
+                       if (data->correction <= trees[i].maxage)
+                               data = &unicode_data[unichar];
+                       insert(&trees[i], keyval, keylen, data);
+               }
+       }
+}
+
+static void
+trees_reduce(void)
+{
+       int i;
+       int size;
+       int changed;
+
+       for (i = 0; i != trees_count; i++)
+               prune(&trees[i]);
+       for (i = 0; i != trees_count; i++)
+               mark_nodes(&trees[i]);
+       do {
+               size = 0;
+               for (i = 0; i != trees_count; i++)
+                       size = index_nodes(&trees[i], size);
+               changed = 0;
+               for (i = 0; i != trees_count; i++)
+                       changed += size_nodes(&trees[i]);
+       } while (changed);
+
+       utf8data = calloc(size, 1);
+       utf8data_size = size;
+       for (i = 0; i != trees_count; i++)
+               emit(&trees[i], utf8data);
+
+       if (verbose > 0) {
+               for (i = 0; i != trees_count; i++) {
+                       printf("%s_%x idx %d\n",
+                               trees[i].type, trees[i].maxage, trees[i].index);
+               }
+       }
+
+       nfkdi = utf8data + trees[trees_count-1].index;
+       nfkdicf = utf8data + trees[trees_count-2].index;
+
+       nfkdi_tree = &trees[trees_count-1];
+       nfkdicf_tree = &trees[trees_count-2];
+}
+
+static void
+verify(struct tree *tree)
+{
+       struct unicode_data *data;
+       utf8leaf_t      *leaf;
+       unsigned int    unichar;
+       char            key[4];
+       int             report;
+       int             nocf;
+
+       if (verbose > 0)
+               printf("Verifying %s_%x\n", tree->type, tree->maxage);
+       nocf = strcmp(tree->type, "nfkdicf");
+
+       for (unichar = 0; unichar != 0x110000; unichar++) {
+               report = 0;
+               data = corrections_lookup(&unicode_data[unichar]);
+               if (data->correction <= tree->maxage)
+                       data = &unicode_data[unichar];
+               utf8key(unichar, key);
+               leaf = utf8lookup(tree, key);
+               if (!leaf) {
+                       if (data->gen != -1)
+                               report++;
+                       if (unichar < 0xd800 || unichar > 0xdfff)
+                               report++;
+               } else {
+                       if (unichar >= 0xd800 && unichar <= 0xdfff)
+                               report++;
+                       if (data->gen == -1)
+                               report++;
+                       if (data->gen != LEAF_GEN(leaf))
+                               report++;
+                       if (LEAF_CCC(leaf) == DECOMPOSE) {
+                               if (nocf) {
+                                       if (!data->utf8nfkdi) {
+                                               report++;
+                                       } else if (strcmp(data->utf8nfkdi,
+                                                       LEAF_STR(leaf))) {
+                                               report++;
+                                       }
+                               } else {
+                                       if (!data->utf8nfkdicf &&
+                                           !data->utf8nfkdi) {
+                                               report++;
+                                       } else if (data->utf8nfkdicf) {
+                                               if (strcmp(data->utf8nfkdicf,
+                                                          LEAF_STR(leaf)))
+                                                       report++;
+                                       } else if (strcmp(data->utf8nfkdi,
+                                                         LEAF_STR(leaf))) {
+                                               report++;
+                                       }
+                               }
+                       } else if (data->ccc != LEAF_CCC(leaf)) {
+                               report++;
+                       }
+               }
+               if (report) {
+                       printf("%X code %X gen %d ccc %d"
+                               " nfdki -> \"%s\"",
+                               unichar, data->code, data->gen,
+                               data->ccc,
+                               data->utf8nfkdi);
+                       if (leaf) {
+                               printf(" age %d ccc %d"
+                                       " nfdki -> \"%s\"\n",
+                                       LEAF_GEN(leaf),
+                                       LEAF_CCC(leaf),
+                                       LEAF_CCC(leaf) == DECOMPOSE ?
+                                               LEAF_STR(leaf) : "");
+                       }
+                       printf("\n");
+               }
+       }
+}
+
+static void
+trees_verify(void)
+{
+       int i;
+
+       for (i = 0; i != trees_count; i++)
+               verify(&trees[i]);
+}
+
+/* ------------------------------------------------------------------ */
+
+static void
+help(void)
+{
+       printf("Usage: %s [options]\n", argv0);
+       printf("\n");
+       printf("This program creates an a data trie used for parsing and\n");
+       printf("normalization of UTF-8 strings. The trie is derived from\n");
+       printf("a set of input files from the Unicode character database\n");
+       printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n";);
+       printf("\n");
+       printf("The generated tree supports two normalization forms:\n");
+       printf("\n");
+       printf("\tnfkdi:\n");
+       printf("\t- Apply unicode normalization form NFKD.\n");
+       printf("\t- Remove any Default_Ignorable_Code_Point.\n");
+       printf("\n");
+       printf("\tnfkdicf:\n");
+       printf("\t- Apply unicode normalization form NFKD.\n");
+       printf("\t- Remove any Default_Ignorable_Code_Point.\n");
+       printf("\t- Apply a full casefold (C + F).\n");
+       printf("\n");
+       printf("These forms were chosen as being most useful when dealing\n");
+       printf("with file names: NFKD catches most cases where characters\n");
+       printf("should be considered equivalent. The ignorables are mostly\n");
+       printf("invisible, making names hard to type.\n");
+       printf("\n");
+       printf("The options to specify the files to be used are listed\n");
+       printf("below with their default values, which are the names used\n");
+       printf("by version 7.0.0 of the Unicode Character Database.\n");
+       printf("\n");
+       printf("The input files:\n");
+       printf("\t-a %s\n", AGE_NAME);
+       printf("\t-c %s\n", CCC_NAME);
+       printf("\t-p %s\n", PROP_NAME);
+       printf("\t-d %s\n", DATA_NAME);
+       printf("\t-f %s\n", FOLD_NAME);
+       printf("\t-n %s\n", NORM_NAME);
+       printf("\n");
+       printf("Additionally, the generated tables are tested using:\n");
+       printf("\t-t %s\n", TEST_NAME);
+       printf("\n");
+       printf("Finally, the output file:\n");
+       printf("\t-o %s\n", UTF8_NAME);
+       printf("\n");
+}
+
+static void
+usage(void)
+{
+       help();
+       exit(1);
+}
+
+static void
+open_fail(const char *name, int error)
+{
+       printf("Error %d opening %s: %s\n", error, name, strerror(error));
+       exit(1);
+}
+
+static void
+file_fail(const char *filename)
+{
+       printf("Error parsing %s\n", filename);
+       exit(1);
+}
+
+static void
+line_fail(const char *filename, const char *line)
+{
+       printf("Error parsing %s:%s\n", filename, line);
+       exit(1);
+}
+
+/* ------------------------------------------------------------------ */
+
+static void
+print_utf32(unsigned int *utf32str)
+{
+       int     i;
+       for (i = 0; utf32str[i]; i++)
+               printf(" %X", utf32str[i]);
+}
+
+static void
+print_utf32nfkdi(unsigned int unichar)
+{
+       printf(" %X ->", unichar);
+       print_utf32(unicode_data[unichar].utf32nfkdi);
+       printf("\n");
+}
+
+static void
+print_utf32nfkdicf(unsigned int unichar)
+{
+       printf(" %X ->", unichar);
+       print_utf32(unicode_data[unichar].utf32nfkdicf);
+       printf("\n");
+}
+
+/* ------------------------------------------------------------------ */
+
+static void
+age_init(void)
+{
+       FILE *file;
+       unsigned int first;
+       unsigned int last;
+       unsigned int unichar;
+       unsigned int major;
+       unsigned int minor;
+       unsigned int revision;
+       int gen;
+        int count;
+       int ret;
+
+       if (verbose > 0)
+               printf("Parsing %s\n", age_name);
+
+       file = fopen(age_name, "r");
+       if (!file)
+               open_fail(age_name, errno);
+        count = 0;
+
+        gen = 0;
+       while (fgets(line, LINESIZE, file)) {
+               ret = sscanf(line, "# Age=V%d_%d_%d",
+                               &major, &minor, &revision);
+               if (ret == 3) {
+                       ages_count++;
+                       if (verbose > 1)
+                               printf(" Age V%d_%d_%d\n",
+                                       major, minor, revision);
+                       if (!age_valid(major, minor, revision))
+                               line_fail(age_name, line);
+                       continue;
+               }
+               ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
+               if (ret == 2) {
+                       ages_count++;
+                       if (verbose > 1)
+                               printf(" Age V%d_%d\n", major, minor);
+                       if (!age_valid(major, minor, 0))
+                               line_fail(age_name, line);
+                       continue;
+               }
+       }
+
+       /* We must have found something above. */
+       if (verbose > 1)
+               printf("%d age entries\n", ages_count);
+       if (ages_count == 0 || ages_count > MAXGEN)
+               file_fail(age_name);
+
+       /* There is a 0 entry. */
+       ages_count++;
+       ages = calloc(ages_count + 1, sizeof(*ages));
+       /* And a guard entry. */
+       ages[ages_count] = (unsigned int)-1;
+
+       rewind(file);
+        count = 0;
+       gen = 0;
+       while (fgets(line, LINESIZE, file)) {
+               ret = sscanf(line, "# Age=V%d_%d_%d",
+                               &major, &minor, &revision);
+               if (ret == 3) {
+                       ages[++gen] =
+                               UNICODE_AGE(major, minor, revision);
+                       if (verbose > 1)
+                               printf(" Age V%d_%d_%d = gen %d\n",
+                                       major, minor, revision, gen);
+                       if (!age_valid(major, minor, revision))
+                               line_fail(age_name, line);
+                       continue;
+               }
+               ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
+               if (ret == 2) {
+                       ages[++gen] = UNICODE_AGE(major, minor, 0);
+                       if (verbose > 1)
+                               printf(" Age V%d_%d = %d\n",
+                                       major, minor, gen);
+                       if (!age_valid(major, minor, 0))
+                               line_fail(age_name, line);
+                       continue;
+               }
+               ret = sscanf(line, "%X..%X ; %d.%d #",
+                            &first, &last, &major, &minor);
+               if (ret == 4) {
+                       for (unichar = first; unichar <= last; unichar++)
+                               unicode_data[unichar].gen = gen;
+                        count += 1 + last - first;
+                       if (verbose > 1)
+                               printf("  %X..%X gen %d\n", first, last, gen);
+                       if (!utf32valid(first) || !utf32valid(last))
+                               line_fail(age_name, line);
+                       continue;
+               }
+               ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
+               if (ret == 3) {
+                       unicode_data[unichar].gen = gen;
+                        count++;
+                       if (verbose > 1)
+                               printf("  %X gen %d\n", unichar, gen);
+                       if (!utf32valid(unichar))
+                               line_fail(age_name, line);
+                       continue;
+               }
+       }
+       unicode_maxage = ages[gen];
+       fclose(file);
+
+       /* Nix surrogate block */
+       if (verbose > 1)
+               printf(" Removing surrogate block D800..DFFF\n");
+       for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
+               unicode_data[unichar].gen = -1;
+
+       if (verbose > 0)
+               printf("Found %d entries\n", count);
+       if (count == 0)
+               file_fail(age_name);
+}
+
+static void
+ccc_init(void)
+{
+       FILE *file;
+       unsigned int first;
+       unsigned int last;
+       unsigned int unichar;
+       unsigned int value;
+        int count;
+       int ret;
+
+       if (verbose > 0)
+               printf("Parsing %s\n", ccc_name);
+
+       file = fopen(ccc_name, "r");
+       if (!file)
+               open_fail(ccc_name, errno);
+
+        count = 0;
+       while (fgets(line, LINESIZE, file)) {
+               ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
+               if (ret == 3) {
+                       for (unichar = first; unichar <= last; unichar++) {
+                               unicode_data[unichar].ccc = value;
+                                count++;
+                       }
+                       if (verbose > 1)
+                               printf(" %X..%X ccc %d\n", first, last, value);
+                       if (!utf32valid(first) || !utf32valid(last))
+                               line_fail(ccc_name, line);
+                       continue;
+               }
+               ret = sscanf(line, "%X ; %d #", &unichar, &value);
+               if (ret == 2) {
+                       unicode_data[unichar].ccc = value;
+                        count++;
+                       if (verbose > 1)
+                               printf(" %X ccc %d\n", unichar, value);
+                       if (!utf32valid(unichar))
+                               line_fail(ccc_name, line);
+                       continue;
+               }
+       }
+       fclose(file);
+
+       if (verbose > 0)
+            printf("Found %d entries\n", count);
+       if (count == 0)
+               file_fail(ccc_name);
+}
+
+static void
+nfkdi_init(void)
+{
+       FILE *file;
+       unsigned int unichar;
+       unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+       char *s;
+       unsigned int *um;
+       int count;
+       int i;
+       int ret;
+
+       if (verbose > 0)
+               printf("Parsing %s\n", data_name);
+       file = fopen(data_name, "r");
+       if (!file)
+               open_fail(data_name, errno);
+
+       count = 0;
+       while (fgets(line, LINESIZE, file)) {
+               ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
+                            &unichar, buf0);
+               if (ret != 2)
+                       continue;
+               if (!utf32valid(unichar))
+                       line_fail(data_name, line);
+
+               s = buf0;
+               /* skip over <tag> */
+               if (*s == '<')
+                       while (*s++ != ' ')
+                               ;
+               /* decode the decomposition into UTF-32 */
+               i = 0;
+               while (*s) {
+                       mapping[i] = strtoul(s, &s, 16);
+                       if (!utf32valid(mapping[i]))
+                               line_fail(data_name, line);
+                       i++;
+               }
+               mapping[i++] = 0;
+
+               um = malloc(i * sizeof(unsigned int));
+               memcpy(um, mapping, i * sizeof(unsigned int));
+               unicode_data[unichar].utf32nfkdi = um;
+
+               if (verbose > 1)
+                       print_utf32nfkdi(unichar);
+               count++;
+       }
+       fclose(file);
+       if (verbose > 0)
+               printf("Found %d entries\n", count);
+       if (count == 0)
+               file_fail(data_name);
+}
+
+static void
+nfkdicf_init(void)
+{
+       FILE *file;
+       unsigned int unichar;
+       unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+       char status;
+       char *s;
+       unsigned int *um;
+       int i;
+       int count;
+       int ret;
+
+       if (verbose > 0)
+               printf("Parsing %s\n", fold_name);
+       file = fopen(fold_name, "r");
+       if (!file)
+               open_fail(fold_name, errno);
+
+       count = 0;
+       while (fgets(line, LINESIZE, file)) {
+               ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
+               if (ret != 3)
+                       continue;
+               if (!utf32valid(unichar))
+                       line_fail(fold_name, line);
+               /* Use the C+F casefold. */
+               if (status != 'C' && status != 'F')
+                       continue;
+               s = buf0;
+               if (*s == '<')
+                       while (*s++ != ' ')
+                               ;
+               i = 0;
+               while (*s) {
+                       mapping[i] = strtoul(s, &s, 16);
+                       if (!utf32valid(mapping[i]))
+                               line_fail(fold_name, line);
+                       i++;
+               }
+               mapping[i++] = 0;
+
+               um = malloc(i * sizeof(unsigned int));
+               memcpy(um, mapping, i * sizeof(unsigned int));
+               unicode_data[unichar].utf32nfkdicf = um;
+
+               if (verbose > 1)
+                       print_utf32nfkdicf(unichar);
+               count++;
+       }
+       fclose(file);
+       if (verbose > 0)
+               printf("Found %d entries\n", count);
+       if (count == 0)
+               file_fail(fold_name);
+}
+
+static void
+ignore_init(void)
+{
+       FILE *file;
+       unsigned int unichar;
+       unsigned int first;
+       unsigned int last;
+       unsigned int *um;
+       int count;
+       int ret;
+
+       if (verbose > 0)
+               printf("Parsing %s\n", prop_name);
+       file = fopen(prop_name, "r");
+       if (!file)
+               open_fail(prop_name, errno);
+       assert(file);
+       count = 0;
+       while (fgets(line, LINESIZE, file)) {
+               ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
+               if (ret == 3) {
+                       if (strcmp(buf0, "Default_Ignorable_Code_Point"))
+                               continue;
+                       if (!utf32valid(first) || !utf32valid(last))
+                               line_fail(prop_name, line);
+                       for (unichar = first; unichar <= last; unichar++) {
+                               free(unicode_data[unichar].utf32nfkdi);
+                               um = malloc(sizeof(unsigned int));
+                               *um = 0;
+                               unicode_data[unichar].utf32nfkdi = um;
+                               free(unicode_data[unichar].utf32nfkdicf);
+                               um = malloc(sizeof(unsigned int));
+                               *um = 0;
+                               unicode_data[unichar].utf32nfkdicf = um;
+                               count++;
+                       }
+                       if (verbose > 1)
+                               printf(" %X..%X Default_Ignorable_Code_Point\n",
+                                       first, last);
+                       continue;
+               }
+               ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
+               if (ret == 2) {
+                       if (strcmp(buf0, "Default_Ignorable_Code_Point"))
+                               continue;
+                       if (!utf32valid(unichar))
+                               line_fail(prop_name, line);
+                       free(unicode_data[unichar].utf32nfkdi);
+                       um = malloc(sizeof(unsigned int));
+                       *um = 0;
+                       unicode_data[unichar].utf32nfkdi = um;
+                       free(unicode_data[unichar].utf32nfkdicf);
+                       um = malloc(sizeof(unsigned int));
+                       *um = 0;
+                       unicode_data[unichar].utf32nfkdicf = um;
+                       if (verbose > 1)
+                               printf(" %X Default_Ignorable_Code_Point\n",
+                                       unichar);
+                       count++;
+                       continue;
+               }
+       }
+       fclose(file);
+
+       if (verbose > 0)
+               printf("Found %d entries\n", count);
+       if (count == 0)
+               file_fail(prop_name);
+}
+
+static void
+corrections_init(void)
+{
+       FILE *file;
+       unsigned int unichar;
+       unsigned int major;
+       unsigned int minor;
+       unsigned int revision;
+       unsigned int age;
+       unsigned int *um;
+       unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+       char *s;
+       int i;
+       int count;
+       int ret;
+
+       if (verbose > 0)
+               printf("Parsing %s\n", norm_name);
+       file = fopen(norm_name, "r");
+       if (!file)
+               open_fail(norm_name, errno);
+
+       count = 0;
+       while (fgets(line, LINESIZE, file)) {
+               ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
+                               &unichar, buf0, buf1,
+                               &major, &minor, &revision);
+               if (ret != 6)
+                       continue;
+               if (!utf32valid(unichar) || !age_valid(major, minor, revision))
+                       line_fail(norm_name, line);
+               count++;
+       }
+       corrections = calloc(count, sizeof(struct unicode_data));
+       corrections_count = count;
+       rewind(file);
+
+       count = 0;
+       while (fgets(line, LINESIZE, file)) {
+               ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
+                               &unichar, buf0, buf1,
+                               &major, &minor, &revision);
+               if (ret != 6)
+                       continue;
+               if (!utf32valid(unichar) || !age_valid(major, minor, revision))
+                       line_fail(norm_name, line);
+               corrections[count] = unicode_data[unichar];
+               assert(corrections[count].code == unichar);
+               age = UNICODE_AGE(major, minor, revision);
+               corrections[count].correction = age;
+
+               i = 0;
+               s = buf0;
+               while (*s) {
+                       mapping[i] = strtoul(s, &s, 16);
+                       if (!utf32valid(mapping[i]))
+                               line_fail(norm_name, line);
+                       i++;
+               }
+               mapping[i++] = 0;
+
+               um = malloc(i * sizeof(unsigned int));
+               memcpy(um, mapping, i * sizeof(unsigned int));
+               corrections[count].utf32nfkdi = um;
+
+               if (verbose > 1)
+                       printf(" %X -> %s -> %s V%d_%d_%d\n",
+                               unichar, buf0, buf1, major, minor, revision);
+               count++;
+       }
+       fclose(file);
+
+       if (verbose > 0)
+               printf("Found %d entries\n", count);
+       if (count == 0)
+               file_fail(norm_name);
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
+ *
+ * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+ * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+ *
+ * SBase = 0xAC00
+ * LBase = 0x1100
+ * VBase = 0x1161
+ * TBase = 0x11A7
+ * LCount = 19
+ * VCount = 21
+ * TCount = 28
+ * NCount = 588 (VCount * TCount)
+ * SCount = 11172 (LCount * NCount)
+ *
+ * Decomposition:
+ *   SIndex = s - SBase
+ *
+ * LV (Canonical/Full)
+ *   LIndex = SIndex / NCount
+ *   VIndex = (Sindex % NCount) / TCount
+ *   LPart = LBase + LIndex
+ *   VPart = VBase + VIndex
+ *
+ * LVT (Canonical)
+ *   LVIndex = (SIndex / TCount) * TCount
+ *   TIndex = (Sindex % TCount
+ *   LVPart = LBase + LVIndex
+ *   TPart = TBase + TIndex
+ *
+ * LVT (Full)
+ *   LIndex = SIndex / NCount
+ *   VIndex = (Sindex % NCount) / TCount
+ *   TIndex = (Sindex % TCount
+ *   LPart = LBase + LIndex
+ *   VPart = VBase + VIndex
+ *   if (TIndex == 0) {
+ *          d = <LPart, VPart>
+ *   } else {
+ *          TPart = TBase + TIndex
+ *          d = <LPart, TPart, VPart>
+ *   }
+ *
+ */
+
+static void
+hangul_decompose(void)
+{
+       unsigned int sb = 0xAC00;
+       unsigned int lb = 0x1100;
+       unsigned int vb = 0x1161;
+       unsigned int tb = 0x11a7;
+       /* unsigned int lc = 19; */
+       unsigned int vc = 21;
+       unsigned int tc = 28;
+       unsigned int nc = (vc * tc);
+       /* unsigned int sc = (lc * nc); */
+       unsigned int unichar;
+       unsigned int mapping[4];
+       unsigned int *um;
+        int count;
+       int i;
+
+       if (verbose > 0)
+               printf("Decomposing hangul\n");
+       /* Hangul */
+       count = 0;
+       for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
+               unsigned int si = unichar - sb;
+               unsigned int li = si / nc;
+               unsigned int vi = (si % nc) / tc;
+               unsigned int ti = si % tc;
+
+               i = 0;
+               mapping[i++] = lb + li;
+               mapping[i++] = vb + vi;
+               if (ti)
+                       mapping[i++] = tb + ti;
+               mapping[i++] = 0;
+
+               assert(!unicode_data[unichar].utf32nfkdi);
+               um = malloc(i * sizeof(unsigned int));
+               memcpy(um, mapping, i * sizeof(unsigned int));
+               unicode_data[unichar].utf32nfkdi = um;
+
+               assert(!unicode_data[unichar].utf32nfkdicf);
+               um = malloc(i * sizeof(unsigned int));
+               memcpy(um, mapping, i * sizeof(unsigned int));
+               unicode_data[unichar].utf32nfkdicf = um;
+
+               if (verbose > 1)
+                       print_utf32nfkdi(unichar);
+
+               count++;
+       }
+       if (verbose > 0)
+               printf("Created %d entries\n", count);
+}
+
+static void
+nfkdi_decompose(void)
+{
+       unsigned int unichar;
+       unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+       unsigned int *um;
+       unsigned int *dc;
+       int count;
+       int i;
+       int j;
+       int ret;
+
+       if (verbose > 0)
+               printf("Decomposing nfkdi\n");
+
+       count = 0;
+       for (unichar = 0; unichar != 0x110000; unichar++) {
+               if (!unicode_data[unichar].utf32nfkdi)
+                       continue;
+               for (;;) {
+                       ret = 1;
+                       i = 0;
+                       um = unicode_data[unichar].utf32nfkdi;
+                       while (*um) {
+                               dc = unicode_data[*um].utf32nfkdi;
+                               if (dc) {
+                                       for (j = 0; dc[j]; j++)
+                                               mapping[i++] = dc[j];
+                                       ret = 0;
+                               } else {
+                                       mapping[i++] = *um;
+                               }
+                               um++;
+                       }
+                       mapping[i++] = 0;
+                       if (ret)
+                               break;
+                       free(unicode_data[unichar].utf32nfkdi);
+                       um = malloc(i * sizeof(unsigned int));
+                       memcpy(um, mapping, i * sizeof(unsigned int));
+                       unicode_data[unichar].utf32nfkdi = um;
+               }
+               /* Add this decomposition to nfkdicf if there is no entry. */
+               if (!unicode_data[unichar].utf32nfkdicf) {
+                       um = malloc(i * sizeof(unsigned int));
+                       memcpy(um, mapping, i * sizeof(unsigned int));
+                       unicode_data[unichar].utf32nfkdicf = um;
+               }
+               if (verbose > 1)
+                       print_utf32nfkdi(unichar);
+               count++;
+       }
+       if (verbose > 0)
+               printf("Processed %d entries\n", count);
+}
+
+static void
+nfkdicf_decompose(void)
+{
+       unsigned int unichar;
+       unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+       unsigned int *um;
+       unsigned int *dc;
+       int count;
+       int i;
+       int j;
+       int ret;
+
+       if (verbose > 0)
+               printf("Decomposing nfkdicf\n");
+       count = 0;
+       for (unichar = 0; unichar != 0x110000; unichar++) {
+               if (!unicode_data[unichar].utf32nfkdicf)
+                       continue;
+               for (;;) {
+                       ret = 1;
+                       i = 0;
+                       um = unicode_data[unichar].utf32nfkdicf;
+                       while (*um) {
+                               dc = unicode_data[*um].utf32nfkdicf;
+                               if (dc) {
+                                       for (j = 0; dc[j]; j++)
+                                               mapping[i++] = dc[j];
+                                       ret = 0;
+                               } else {
+                                       mapping[i++] = *um;
+                               }
+                               um++;
+                       }
+                       mapping[i++] = 0;
+                       if (ret)
+                               break;
+                       free(unicode_data[unichar].utf32nfkdicf);
+                       um = malloc(i * sizeof(unsigned int));
+                       memcpy(um, mapping, i * sizeof(unsigned int));
+                       unicode_data[unichar].utf32nfkdicf = um;
+               }
+               if (verbose > 1)
+                       print_utf32nfkdicf(unichar);
+               count++;
+       }
+       if (verbose > 0)
+               printf("Processed %d entries\n", count);
+}
+
+/* ------------------------------------------------------------------ */
+
+int utf8agemax(struct tree *, const char *);
+int utf8nagemax(struct tree *, const char *, size_t);
+int utf8agemin(struct tree *, const char *);
+int utf8nagemin(struct tree *, const char *, size_t);
+ssize_t utf8len(struct tree *, const char *);
+ssize_t utf8nlen(struct tree *, const char *, size_t);
+struct utf8cursor;
+int utf8cursor(struct utf8cursor *, struct tree *, const char *);
+int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
+int utf8byte(struct utf8cursor *);
+
+/*
+ * Use trie to scan s, touching at most len bytes.
+ * Returns the leaf if one exists, NULL otherwise.
+ *
+ * A non-NULL return guarantees that the UTF-8 sequence starting at s
+ * is well-formed and corresponds to a known unicode code point.  The
+ * shorthand for this will be "is valid UTF-8 unicode".
+ */
+static utf8leaf_t *
+utf8nlookup(struct tree *tree, const char *s, size_t len)
+{
+       utf8trie_t      *trie = utf8data + tree->index;
+       int             offlen;
+       int             offset;
+       int             mask;
+       int             node;
+
+       if (!tree)
+               return NULL;
+       if (len == 0)
+               return NULL;
+       node = 1;
+       while (node) {
+               offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
+               if (*trie & NEXTBYTE) {
+                       if (--len == 0)
+                               return NULL;
+                       s++;
+               }
+               mask = 1 << (*trie & BITNUM);
+               if (*s & mask) {
+                       /* Right leg */
+                       if (offlen) {
+                               /* Right node at offset of trie */
+                               node = (*trie & RIGHTNODE);
+                               offset = trie[offlen];
+                               while (--offlen) {
+                                       offset <<= 8;
+                                       offset |= trie[offlen];
+                               }
+                               trie += offset;
+                       } else if (*trie & RIGHTPATH) {
+                               /* Right node after this node */
+                               node = (*trie & TRIENODE);
+                               trie++;
+                       } else {
+                               /* No right node. */
+                               node = 0;
+                               trie = NULL;
+                       }
+               } else {
+                       /* Left leg */
+                       if (offlen) {
+                               /* Left node after this node. */
+                               node = (*trie & LEFTNODE);
+                               trie += offlen + 1;
+                       } else if (*trie & RIGHTPATH) {
+                               /* No left node. */
+                               node = 0;
+                               trie = NULL;
+                       } else {
+                               /* Left node after this node */
+                               node = (*trie & TRIENODE);
+                               trie++;
+                       }
+               }
+       }
+       return trie;
+}
+
+/*
+ * Use trie to scan s.
+ * Returns the leaf if one exists, NULL otherwise.
+ *
+ * Forwards to trie_nlookup().
+ */
+static utf8leaf_t *
+utf8lookup(struct tree *tree, const char *s)
+{
+       return utf8nlookup(tree, s, (size_t)-1);
+}
+
+/*
+ * Return the number of bytes used by the current UTF-8 sequence.
+ * Assumes the input points to the first byte of a valid UTF-8
+ * sequence.
+ */
+static inline int
+utf8clen(const char *s)
+{
+       unsigned char c = *s;
+       return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
+}
+
+/*
+ * Maximum age of any character in s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ * Return 0 if only non-assigned code points are used.
+ */
+int
+utf8agemax(struct tree *tree, const char *s)
+{
+       utf8leaf_t      *leaf;
+       int             age = 0;
+       int             leaf_age;
+
+       if (!tree)
+               return -1;
+       while (*s) {
+               if (!(leaf = utf8lookup(tree, s)))
+                       return -1;
+               leaf_age = ages[LEAF_GEN(leaf)];
+               if (leaf_age <= tree->maxage && leaf_age > age)
+                       age = leaf_age;
+               s += utf8clen(s);
+       }
+       return age;
+}
+
+/*
+ * Minimum age of any character in s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ * Return 0 if non-assigned code points are used.
+ */
+int
+utf8agemin(struct tree *tree, const char *s)
+{
+       utf8leaf_t      *leaf;
+       int             age = tree->maxage;
+       int             leaf_age;
+
+       if (!tree)
+               return -1;
+       while (*s) {
+               if (!(leaf = utf8lookup(tree, s)))
+                       return -1;
+               leaf_age = ages[LEAF_GEN(leaf)];
+               if (leaf_age <= tree->maxage && leaf_age < age)
+                       age = leaf_age;
+               s += utf8clen(s);
+       }
+       return age;
+}
+
+/*
+ * Maximum age of any character in s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+int
+utf8nagemax(struct tree *tree, const char *s, size_t len)
+{
+       utf8leaf_t      *leaf;
+       int             age = 0;
+       int             leaf_age;
+
+       if (!tree)
+               return -1;
+        while (len && *s) {
+               if (!(leaf = utf8nlookup(tree, s, len)))
+                       return -1;
+               leaf_age = ages[LEAF_GEN(leaf)];
+               if (leaf_age <= tree->maxage && leaf_age > age)
+                       age = leaf_age;
+               len -= utf8clen(s);
+               s += utf8clen(s);
+       }
+       return age;
+}
+
+/*
+ * Maximum age of any character in s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+int
+utf8nagemin(struct tree *tree, const char *s, size_t len)
+{
+       utf8leaf_t      *leaf;
+       int             leaf_age;
+       int             age = tree->maxage;
+
+       if (!tree)
+               return -1;
+        while (len && *s) {
+               if (!(leaf = utf8nlookup(tree, s, len)))
+                       return -1;
+               leaf_age = ages[LEAF_GEN(leaf)];
+               if (leaf_age <= tree->maxage && leaf_age < age)
+                       age = leaf_age;
+               len -= utf8clen(s);
+               s += utf8clen(s);
+       }
+       return age;
+}
+
+/*
+ * Length of the normalization of s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ *
+ * A string of Default_Ignorable_Code_Point has length 0.
+ */
+ssize_t
+utf8len(struct tree *tree, const char *s)
+{
+       utf8leaf_t      *leaf;
+       size_t          ret = 0;
+
+       if (!tree)
+               return -1;
+       while (*s) {
+               if (!(leaf = utf8lookup(tree, s)))
+                       return -1;
+               if (ages[LEAF_GEN(leaf)] > tree->maxage)
+                       ret += utf8clen(s);
+               else if (LEAF_CCC(leaf) == DECOMPOSE)
+                       ret += strlen(LEAF_STR(leaf));
+               else
+                       ret += utf8clen(s);
+               s += utf8clen(s);
+       }
+       return ret;
+}
+
+/*
+ * Length of the normalization of s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+ssize_t
+utf8nlen(struct tree *tree, const char *s, size_t len)
+{
+       utf8leaf_t      *leaf;
+       size_t          ret = 0;
+
+       if (!tree)
+               return -1;
+       while (len && *s) {
+               if (!(leaf = utf8nlookup(tree, s, len)))
+                       return -1;
+               if (ages[LEAF_GEN(leaf)] > tree->maxage)
+                       ret += utf8clen(s);
+               else if (LEAF_CCC(leaf) == DECOMPOSE)
+                       ret += strlen(LEAF_STR(leaf));
+               else
+                       ret += utf8clen(s);
+               len -= utf8clen(s);
+               s += utf8clen(s);
+       }
+       return ret;
+}
+
+/*
+ * Cursor structure used by the normalizer.
+ */
+struct utf8cursor {
+       struct tree     *tree;
+       const char      *s;
+       const char      *p;
+       const char      *ss;
+       const char      *sp;
+       unsigned int    len;
+       unsigned int    slen;
+       short int       ccc;
+       short int       nccc;
+       unsigned int    unichar;
+};
+
+/*
+ * Set up an utf8cursor for use by utf8byte().
+ *
+ *   s      : string.
+ *   len    : length of s.
+ *   u8c    : pointer to cursor.
+ *   trie   : utf8trie_t to use for normalization.
+ *
+ * Returns -1 on error, 0 on success.
+ */
+int
+utf8ncursor(
+       struct utf8cursor *u8c,
+       struct tree     *tree,
+       const char      *s,
+       size_t          len)
+{
+       if (!tree)
+               return -1;
+       if (!s)
+               return -1;
+       u8c->tree = tree;
+       u8c->s = s;
+       u8c->p = NULL;
+       u8c->ss = NULL;
+       u8c->sp = NULL;
+       u8c->len = len;
+       u8c->slen = 0;
+       u8c->ccc = STOPPER;
+       u8c->nccc = STOPPER;
+       u8c->unichar = 0;
+       /* Check we didn't clobber the maximum length. */
+       if (u8c->len != len)
+               return -1;
+       /* The first byte of s may not be an utf8 continuation. */
+       if (len > 0 && (*s & 0xC0) == 0x80)
+               return -1;
+       return 0;
+}
+
+/*
+ * Set up an utf8cursor for use by utf8byte().
+ *
+ *   s      : NUL-terminated string.
+ *   u8c    : pointer to cursor.
+ *   trie   : utf8trie_t to use for normalization.
+ *
+ * Returns -1 on error, 0 on success.
+ */
+int
+utf8cursor(
+       struct utf8cursor *u8c,
+       struct tree     *tree,
+       const char      *s)
+{
+       return utf8ncursor(u8c, tree, s, (unsigned int)-1);
+}
+
+/*
+ * Get one byte from the normalized form of the string described by u8c.
+ *
+ * Returns the byte cast to an unsigned char on succes, and -1 on failure.
+ *
+ * The cursor keeps track of the location in the string in u8c->s.
+ * When a character is decomposed, the current location is stored in
+ * u8c->p, and u8c->s is set to the start of the decomposition. Note
+ * that bytes from a decomposition do not count against u8c->len.
+ *
+ * Characters are emitted if they match the current CCC in u8c->ccc.
+ * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
+ * and the function returns 0 in that case.
+ *
+ * Sorting by CCC is done by repeatedly scanning the string.  The
+ * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
+ * the start of the scan.  The first pass finds the lowest CCC to be
+ * emitted and stores it in u8c->nccc, the second pass emits the
+ * characters with this CCC and finds the next lowest CCC. This limits
+ * the number of passes to 1 + the number of different CCCs in the
+ * sequence being scanned.
+ *
+ * Therefore:
+ *  u8c->p  != NULL -> a decomposition is being scanned.
+ *  u8c->ss != NULL -> this is a repeating scan.
+ *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
+ */
+int
+utf8byte(struct utf8cursor *u8c)
+{
+       utf8leaf_t *leaf;
+       int ccc;
+
+       for (;;) {
+               /* Check for the end of a decomposed character. */
+               if (u8c->p && *u8c->s == '\0') {
+                       u8c->s = u8c->p;
+                       u8c->p = NULL;
+               }
+
+               /* Check for end-of-string. */
+               if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
+                       /* There is no next byte. */
+                       if (u8c->ccc == STOPPER)
+                               return 0;
+                       /* End-of-string during a scan counts as a stopper. */
+                       ccc = STOPPER;
+                       goto ccc_mismatch;
+               } else if ((*u8c->s & 0xC0) == 0x80) {
+                       /* This is a continuation of the current character. */
+                       if (!u8c->p)
+                               u8c->len--;
+                       return (unsigned char)*u8c->s++;
+               }
+
+               /* Look up the data for the current character. */
+               if (u8c->p)
+                       leaf = utf8lookup(u8c->tree, u8c->s);
+               else
+                       leaf = utf8nlookup(u8c->tree, u8c->s, u8c->len);
+
+               /* No leaf found implies that the input is a binary blob. */
+               if (!leaf)
+                       return -1;
+
+               /* Characters that are too new have CCC 0. */
+               if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
+                       ccc = STOPPER;
+               } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
+                       u8c->len -= utf8clen(u8c->s);
+                       u8c->p = u8c->s + utf8clen(u8c->s);
+                       u8c->s = LEAF_STR(leaf);
+                       /* Empty decomposition implies CCC 0. */
+                       if (*u8c->s == '\0') {
+                               if (u8c->ccc == STOPPER)
+                                       continue;
+                               ccc = STOPPER;
+                               goto ccc_mismatch;
+                       }
+                       leaf = utf8lookup(u8c->tree, u8c->s);
+                       ccc = LEAF_CCC(leaf);
+               }
+               u8c->unichar = utf8code(u8c->s);
+
+               /*
+                * If this is not a stopper, then see if it updates
+                * the next canonical class to be emitted.
+                */
+               if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
+                       u8c->nccc = ccc;
+
+               /*
+                * Return the current byte if this is the current
+                * combining class.
+                */
+               if (ccc == u8c->ccc) {
+                       if (!u8c->p)
+                               u8c->len--;
+                       return (unsigned char)*u8c->s++;
+               }
+
+               /* Current combining class mismatch. */
+       ccc_mismatch:
+               if (u8c->nccc == STOPPER) {
+                       /*
+                        * Scan forward for the first canonical class
+                        * to be emitted.  Save the position from
+                        * which to restart.
+                        */
+                       assert(u8c->ccc == STOPPER);
+                       u8c->ccc = MINCCC - 1;
+                       u8c->nccc = ccc;
+                       u8c->sp = u8c->p;
+                       u8c->ss = u8c->s;
+                       u8c->slen = u8c->len;
+                       if (!u8c->p)
+                               u8c->len -= utf8clen(u8c->s);
+                       u8c->s += utf8clen(u8c->s);
+               } else if (ccc != STOPPER) {
+                       /* Not a stopper, and not the ccc we're emitting. */
+                       if (!u8c->p)
+                               u8c->len -= utf8clen(u8c->s);
+                       u8c->s += utf8clen(u8c->s);
+               } else if (u8c->nccc != MAXCCC + 1) {
+                       /* At a stopper, restart for next ccc. */
+                       u8c->ccc = u8c->nccc;
+                       u8c->nccc = MAXCCC + 1;
+                       u8c->s = u8c->ss;
+                       u8c->p = u8c->sp;
+                       u8c->len = u8c->slen;
+               } else {
+                       /* All done, proceed from here. */
+                       u8c->ccc = STOPPER;
+                       u8c->nccc = STOPPER;
+                       u8c->sp = NULL;
+                       u8c->ss = NULL;
+                       u8c->slen = 0;
+               }
+       }
+}
+
+/* ------------------------------------------------------------------ */
+
+static int
+normalize_line(struct tree *tree)
+{
+       char *s;
+       char *t;
+       int c;
+       struct utf8cursor u8c;
+
+       /* First test: null-terminated string. */
+       s = buf2;
+       t = buf3;
+       if (utf8cursor(&u8c, tree, s))
+               return -1;
+       while ((c = utf8byte(&u8c)) > 0)
+               if (c != (unsigned char)*t++)
+                       return -1;
+       if (c < 0)
+               return -1;
+       if (*t != 0)
+               return -1;
+
+       /* Second test: length-limited string. */
+       s = buf2;
+       /* Replace NUL with a value that will cause an error if seen. */
+       s[strlen(s) + 1] = -1;
+       t = buf3;
+       if (utf8cursor(&u8c, tree, s))
+               return -1;
+       while ((c = utf8byte(&u8c)) > 0)
+               if (c != (unsigned char)*t++)
+                       return -1;
+       if (c < 0)
+               return -1;
+       if (*t != 0)
+               return -1;
+
+       return 0;
+}
+
+static void
+normalization_test(void)
+{
+       FILE *file;
+       unsigned int unichar;
+       struct unicode_data *data;
+       char *s;
+       char *t;
+       int ret;
+       int ignorables;
+       int tests = 0;
+       int failures = 0;
+
+       if (verbose > 0)
+               printf("Parsing %s\n", test_name);
+       /* Step one, read data from file. */
+       file = fopen(test_name, "r");
+       if (!file)
+               open_fail(test_name, errno);
+
+       while (fgets(line, LINESIZE, file)) {
+               ret = sscanf(line, "%[^;];%*[^;];%*[^;];%*[^;];%[^;];",
+                            buf0, buf1);
+               if (ret != 2 || *line == '#')
+                       continue;
+               s = buf0;
+               t = buf2;
+               while (*s) {
+                       unichar = strtoul(s, &s, 16);
+                       t += utf8key(unichar, t);
+               }
+               *t = '\0';
+
+               ignorables = 0;
+               s = buf1;
+               t = buf3;
+               while (*s) {
+                       unichar = strtoul(s, &s, 16);
+                       data = &unicode_data[unichar];
+                       if (data->utf8nfkdi && !*data->utf8nfkdi)
+                               ignorables = 1;
+                       else
+                               t += utf8key(unichar, t);
+               }
+               *t = '\0';
+
+               tests++;
+               if (normalize_line(nfkdi_tree) < 0) {
+                       printf("\nline %s -> %s", buf0, buf1);
+                       if (ignorables)
+                               printf(" (ignorables removed)");
+                       printf(" failure\n");
+                       failures++;
+               }
+       }
+       fclose(file);
+       if (verbose > 0)
+               printf("Ran %d tests with %d failures\n", tests, failures);
+       if (failures)
+               file_fail(test_name);
+}
+
+/* ------------------------------------------------------------------ */
+
+static void
+write_file(void)
+{
+       FILE *file;
+       int i;
+       int j;
+       int t;
+       int gen;
+
+       if (verbose > 0)
+               printf("Writing %s\n", utf8_name);
+       file = fopen(utf8_name, "w");
+       if (!file)
+               open_fail(utf8_name, errno);
+
+       fprintf(file, "/* This file is generated code, do not edit. */\n");
+       fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
+       fprintf(file, "#error Only xfs_utf8.c may include this file.\n");
+       fprintf(file, "#endif\n");
+       fprintf(file, "\n");
+       fprintf(file, "const unsigned int utf8version = %#x;\n",
+               unicode_maxage);
+       fprintf(file, "\n");
+       fprintf(file, "static const unsigned int utf8agetab[] = {\n");
+       for (i = 0; i != ages_count; i++)
+               fprintf(file, "\t%#x%s\n", ages[i],
+                       ages[i] == unicode_maxage ? "" : ",");
+       fprintf(file, "};\n");
+       fprintf(file, "\n");
+       fprintf(file, "static const struct utf8data utf8nfkdicfdata[] = {\n");
+       t = 0;
+       for (gen = 0; gen < ages_count; gen++) {
+               fprintf(file, "\t{ %#x, %d }%s\n",
+                       ages[gen], trees[t].index,
+                       ages[gen] == unicode_maxage ? "" : ",");
+               if (trees[t].maxage == ages[gen])
+                       t += 2;
+       }
+       fprintf(file, "};\n");
+       fprintf(file, "\n");
+       fprintf(file, "static const struct utf8data utf8nfkdidata[] = {\n");
+       t = 1;
+       for (gen = 0; gen < ages_count; gen++) {
+               fprintf(file, "\t{ %#x, %d }%s\n",
+                       ages[gen], trees[t].index,
+                       ages[gen] == unicode_maxage ? "" : ",");
+               if (trees[t].maxage == ages[gen])
+                       t += 2;
+       }
+       fprintf(file, "};\n");
+       fprintf(file, "\n");
+       fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
+               utf8data_size);
+       t = 0;
+       for (i = 0; i != utf8data_size; i += 16) {
+               if (i == trees[t].index) {
+                       fprintf(file, "\t/* %s_%x */\n",
+                               trees[t].type, trees[t].maxage);
+                       if (t < trees_count-1)
+                               t++;
+               }
+               fprintf(file, "\t");
+               for (j = i; j != i + 16; j++)
+                       fprintf(file, "0x%.2x%s", utf8data[j],
+                               (j < utf8data_size -1 ? "," : ""));
+               fprintf(file, "\n");
+       }
+       fprintf(file, "};\n");
+       fclose(file);
+}
+
+/* ------------------------------------------------------------------ */
+
+int
+main(int argc, char *argv[])
+{
+       unsigned int unichar;
+       int opt;
+
+       argv0 = argv[0];
+
+       while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
+               switch (opt) {
+               case 'a':
+                       age_name = optarg;
+                       break;
+               case 'c':
+                       ccc_name = optarg;
+                       break;
+               case 'd':
+                       data_name = optarg;
+                       break;
+               case 'f':
+                       fold_name = optarg;
+                       break;
+               case 'n':
+                       norm_name = optarg;
+                       break;
+               case 'o':
+                       utf8_name = optarg;
+                       break;
+               case 'p':
+                       prop_name = optarg;
+                       break;
+               case 't':
+                       test_name = optarg;
+                       break;
+               case 'v':
+                       verbose++;
+                       break;
+               case 'h':
+                       help();
+                       exit(0);
+               default:
+                       usage();
+               }
+       }
+
+       if (verbose > 1)
+               help();
+       for (unichar = 0; unichar != 0x110000; unichar++)
+               unicode_data[unichar].code = unichar;
+       age_init();
+       ccc_init();
+       nfkdi_init();
+       nfkdicf_init();
+       ignore_init();
+       corrections_init();
+       hangul_decompose();
+       nfkdi_decompose();
+       nfkdicf_decompose();
+       utf8_init();
+       trees_init();
+       trees_populate();
+       trees_reduce();
+       trees_verify();
+       /* Prevent "unused function" warning. */
+       (void)lookup(nfkdi_tree, " ");
+       if (verbose > 2)
+               tree_walk(nfkdi_tree);
+       if (verbose > 2)
+               tree_walk(nfkdicf_tree);
+       normalization_test();
+       write_file();
+
+       return 0;
+}
-- 
1.7.12.4

<Prev in Thread] Current Thread [Next in Thread>