gnunet-android

GNUnet for Android
Log | Files | Refs | README

uninorm.h (10898B)


      1 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
      2 /* Normalization forms (composition and decomposition) of Unicode strings.
      3    Copyright (C) 2001-2002, 2009-2024 Free Software Foundation, Inc.
      4    Written by Bruno Haible <bruno@clisp.org>, 2009.
      5 
      6    This file is free software: you can redistribute it and/or modify
      7    it under the terms of the GNU Lesser General Public License as
      8    published by the Free Software Foundation; either version 2.1 of the
      9    License, or (at your option) any later version.
     10 
     11    This file is distributed in the hope that it will be useful,
     12    but WITHOUT ANY WARRANTY; without even the implied warranty of
     13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14    GNU Lesser General Public License for more details.
     15 
     16    You should have received a copy of the GNU Lesser General Public License
     17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
     18 
     19 #ifndef _UNINORM_H
     20 #define _UNINORM_H
     21 
     22 /* Get common macros for C.  */
     23 #include <unistring/cdefs.h>
     24 
     25 /* Get size_t.  */
     26 #include <stddef.h>
     27 
     28 #include "unitypes.h"
     29 
     30 #if 1
     31 # include <unistring/woe32dll.h>
     32 #else
     33 # define LIBUNISTRING_DLL_VARIABLE
     34 #endif
     35 
     36 
     37 #ifdef __cplusplus
     38 extern "C" {
     39 #endif
     40 
     41 
     42 /* Conventions:
     43 
     44    All functions prefixed with u8_ operate on UTF-8 encoded strings.
     45    Their unit is a uint8_t (1 byte).
     46 
     47    All functions prefixed with u16_ operate on UTF-16 encoded strings.
     48    Their unit is a uint16_t (a 2-byte word).
     49 
     50    All functions prefixed with u32_ operate on UCS-4 encoded strings.
     51    Their unit is a uint32_t (a 4-byte word).
     52 
     53    All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
     54    n units.
     55 
     56    Functions returning a string result take a (resultbuf, lengthp) argument
     57    pair.  If resultbuf is not NULL and the result fits into *lengthp units,
     58    it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
     59    allocated string is returned.  In both cases, *lengthp is set to the
     60    length (number of units) of the returned string.  In case of error,
     61    NULL is returned and errno is set.  */
     62 
     63 
     64 enum /* Decomposition types.  The <tag> shown beside each enumerator is the
           compatibility formatting tag it stands for; UC_DECOMP_CANONICAL has
           none.  No explicit values are given, so the enumerators are numbered
           consecutively from UC_DECOMP_CANONICAL = 0 to UC_DECOMP_COMPAT = 16.
           NOTE(review): presumably these are the values uc_decomposition stores
           in *DECOMP_TAG -- confirm against the implementation.  */
     65 {
     66   UC_DECOMP_CANONICAL,/*            Canonical decomposition.                  */
     67   UC_DECOMP_FONT,    /*   <font>    A font variant (e.g. a blackletter form). */
     68   UC_DECOMP_NOBREAK, /* <noBreak>   A no-break version of a space or hyphen.  */
     69   UC_DECOMP_INITIAL, /* <initial>   An initial presentation form (Arabic).    */
     70   UC_DECOMP_MEDIAL,  /*  <medial>   A medial presentation form (Arabic).      */
     71   UC_DECOMP_FINAL,   /*  <final>    A final presentation form (Arabic).       */
     72   UC_DECOMP_ISOLATED,/* <isolated>  An isolated presentation form (Arabic).   */
     73   UC_DECOMP_CIRCLE,  /*  <circle>   An encircled form.                        */
     74   UC_DECOMP_SUPER,   /*  <super>    A superscript form.                       */
     75   UC_DECOMP_SUB,     /*   <sub>     A subscript form.                         */
     76   UC_DECOMP_VERTICAL,/* <vertical>  A vertical layout presentation form.      */
     77   UC_DECOMP_WIDE,    /*   <wide>    A wide (or zenkaku) compatibility character. */
     78   UC_DECOMP_NARROW,  /*  <narrow>   A narrow (or hankaku) compatibility character. */
     79   UC_DECOMP_SMALL,   /*  <small>    A small variant form (CNS compatibility). */
     80   UC_DECOMP_SQUARE,  /*  <square>   A CJK squared font variant.               */
     81   UC_DECOMP_FRACTION,/* <fraction>  A vulgar fraction form.                   */
     82   UC_DECOMP_COMPAT   /*  <compat>   Otherwise unspecified compatibility character. */
     83 };
     84 
     85 /* Maximum size of decomposition of a single Unicode character.  */
     86 #define UC_DECOMPOSITION_MAX_LENGTH 32
     87 
     88 /* Return the character decomposition mapping of a Unicode character.
     89    DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
     90    ucs4_t elements.
     91    When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
     92    filled and N is returned.  Otherwise -1 is returned.  */
     93 extern int
     94        uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
     95 
     96 /* Return the canonical character decomposition mapping of a Unicode character.
     97    DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
     98    ucs4_t elements.
     99    When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
    100    returned.  Otherwise -1 is returned.  */
    101 extern int
    102        uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
    103 
    104 
    105 /* Attempt to combine the Unicode characters uc1, uc2.
    106    uc1 is known to have canonical combining class 0.
    107    Return the combination of uc1 and uc2, if it exists.
    108    Return 0 otherwise.
    109    Not all decompositions can be recombined using this function.  See the
    110    Unicode file CompositionExclusions.txt for details.  */
    111 extern ucs4_t
    112        uc_composition (ucs4_t uc1, ucs4_t uc2)
    113        _UC_ATTRIBUTE_CONST;
    114 
    115 
    116 /* An object of type uninorm_t denotes a Unicode normalization form.  */
    117 struct unicode_normalization_form;
    118 typedef const struct unicode_normalization_form *uninorm_t;
    119 
    120 /* UNINORM_NFD: Normalization form D: canonical decomposition.  */
    121 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfd;
    122 #define UNINORM_NFD (&uninorm_nfd)
    123 
    124 /* UNINORM_NFC: Normalization form C: canonical decomposition, then
    125    canonical composition.  */
    126 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfc;
    127 #define UNINORM_NFC (&uninorm_nfc)
    128 
    129 /* UNINORM_NFKD: Normalization form KD: compatibility decomposition.  */
    130 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkd;
    131 #define UNINORM_NFKD (&uninorm_nfkd)
    132 
    133 /* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
    134    canonical composition.  */
    135 extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkc;
    136 #define UNINORM_NFKC (&uninorm_nfkc)
    137 
    138 /* Test whether a normalization form does compatibility decomposition.
           NF is a pointer (normally a uninorm_t).  The macro reads the
           unsigned int located at that address and extracts bit 0, so the
           result is 0 or 1.  NF is evaluated exactly once.  */
    139 #define uninorm_is_compat_decomposing(nf) \
    140   ((* (const unsigned int *) (nf) >> 0) & 1)
    141 
    142 /* Test whether a normalization form includes canonical composition.
           NF is a pointer (normally a uninorm_t).  The macro reads the
           unsigned int located at that address and extracts bit 1, so the
           result is 0 or 1.  NF is evaluated exactly once.  */
    143 #define uninorm_is_composing(nf) \
    144   ((* (const unsigned int *) (nf) >> 1) & 1)
    145 
    146 /* Return the decomposing variant of a normalization form.
    147    This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD.  */
    148 extern uninorm_t
    149        uninorm_decomposing_form (uninorm_t nf)
    150        _UC_ATTRIBUTE_PURE;
    151 
    152 
    153 /* Return the specified normalization form of a string.  */
    154 extern uint8_t *
    155        u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
    156                      uint8_t *_UC_RESTRICT resultbuf, size_t *lengthp);
    157 extern uint16_t *
    158        u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
    159                       uint16_t *_UC_RESTRICT resultbuf, size_t *lengthp);
    160 extern uint32_t *
    161        u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
    162                       uint32_t *_UC_RESTRICT resultbuf, size_t *lengthp);
    163 
    164 
    165 /* Compare S1 and S2, ignoring differences in normalization.
    166    NF must be either UNINORM_NFD or UNINORM_NFKD.
    167    If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
    168    return 0.  Upon failure, return -1 with errno set.  */
    169 extern int
    170        u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
    171                    uninorm_t nf, int *resultp);
    172 extern int
    173        u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
    174                     uninorm_t nf, int *resultp);
    175 extern int
    176        u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
    177                     uninorm_t nf, int *resultp);
    178 
    179 
    180 /* Converts the string S of length N to a NUL-terminated byte sequence, in such
    181    a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
    182    equivalent to comparing S1 and S2 with uN_normcoll().
    183    NF must be either UNINORM_NFC or UNINORM_NFKC.  */
    184 extern char *
    185        u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf,
    186                     char *resultbuf, size_t *lengthp);
    187 extern char *
    188        u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf,
    189                      char *resultbuf, size_t *lengthp);
    190 extern char *
    191        u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf,
    192                      char *resultbuf, size_t *lengthp);
    193 
    194 
    195 /* Compare S1 and S2, ignoring differences in normalization, using the
    196    collation rules of the current locale.
    197    NF must be either UNINORM_NFC or UNINORM_NFKC.
    198    If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
    199    return 0.  Upon failure, return -1 with errno set.  */
    200 extern int
    201        u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
    202                     uninorm_t nf, int *resultp);
    203 extern int
    204        u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
    205                      uninorm_t nf, int *resultp);
    206 extern int
    207        u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
    208                      uninorm_t nf, int *resultp);
    209 
    210 
    211 /* Normalization of a stream of Unicode characters.
    212 
    213    A "stream of Unicode characters" is essentially a function that accepts a
    214    ucs4_t argument repeatedly, optionally combined with a function that
    215    "flushes" the stream.  */
    216 
    217 /* Data type of a stream of Unicode characters that normalizes its input
    218    according to a given normalization form and passes the normalized character
    219    sequence to the encapsulated stream of Unicode characters.  */
    220 struct uninorm_filter;
    221 
    222 /* Bring data buffered in the filter to its destination, the encapsulated
    223    stream, then close and free the filter.
    224    Return 0 if successful, or -1 with errno set upon failure.  */
    225 extern int
    226        uninorm_filter_free (struct uninorm_filter *filter);
    227 
    228 /* Create and return a normalization filter for Unicode characters.
    229    The pair (stream_func, stream_data) is the encapsulated stream.
    230    stream_func (stream_data, uc) receives the Unicode character uc
    231    and returns 0 if successful, or -1 with errno set upon failure.
    232    Return the new filter, or NULL with errno set upon failure.  */
    233 extern struct uninorm_filter *
    234        uninorm_filter_create (uninorm_t nf,
    235                               int (*stream_func) (void *stream_data, ucs4_t uc),
    236                               void *stream_data)
    237        _GL_ATTRIBUTE_DEALLOC (uninorm_filter_free, 1);
    238 
    239 /* Stuff a Unicode character into a normalizing filter.
    240    Return 0 if successful, or -1 with errno set upon failure.  */
    241 extern int
    242        uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc);
    243 
    244 /* Bring data buffered in the filter to its destination, the encapsulated
    245    stream.
    246    Return 0 if successful, or -1 with errno set upon failure.
    247    Note! If after calling this function, additional characters are written
    248    into the filter, the resulting character sequence in the encapsulated stream
    249    will not necessarily be normalized.  */
    250 extern int
    251        uninorm_filter_flush (struct uninorm_filter *filter);
    252 
    253 
    254 #ifdef __cplusplus
    255 }
    256 #endif
    257 
    258 
    259 #endif /* _UNINORM_H */