#define UNI_REPLACEMENT 0xfffc
Unicode value used as the default replacement for invalid characters.
static inline byte *utf8_put(byte *p, uint u);
Encode a value from the range [0, 0xFFFF]
(basic multilingual plane); up to 3 bytes needed (RFC2279).
static inline byte *utf8_32_put(byte *p, uint u);
Encode a value from the range [0, 0x7FFFFFFF]
(superset of Unicode 4.0); up to 6 bytes needed (RFC2279).
static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl);
Decode a value from the range [0, 0xFFFF] (basic multilingual plane)
or return repl if the encoding has been corrupted.
static inline byte *utf8_32_get_repl(const byte *p, uint *uu, uint repl);
Decode a value from the range [0, 0x7FFFFFFF]
or return repl if the encoding has been corrupted.
static inline byte *utf8_get(const byte *p, uint *uu);
Decode a value from the range [0, 0xFFFF] (basic multilingual plane)
or return UNI_REPLACEMENT if the encoding has been corrupted.
static inline byte *utf8_32_get(const byte *p, uint *uu);
Decode a value from the range [0, 0x7FFFFFFF]
or return UNI_REPLACEMENT if the encoding has been corrupted.
static inline uint utf8_space(uint u);
Return the number of bytes needed to encode a given value from the range [0, 0x7FFFFFFF] to UTF-8.
static inline uint utf8_encoding_len(uint c);
Compute the length of a single UTF-8 character from its first byte. The encoding must be valid.
#define UTF8_MAX_LEN 6
Maximum number of bytes a UTF-8 character can have.
static inline void *utf16_le_put(void *p, uint u);
Encode a UTF-16LE character from the range [0, 0xD7FF] or [0xE000, 0x10FFFF];
up to 4 bytes needed.
static inline void *utf16_be_put(void *p, uint u);
Encode a UTF-16BE character from the range [0, 0xD7FF] or [0xE000, 0x10FFFF];
up to 4 bytes needed.
static inline void *utf16_le_get_repl(const void *p, uint *uu, uint repl);
Decode a UTF-16LE character from the range [0, 0xD7FF] or [0xE000, 0x10FFFF]
or return repl if the encoding has been corrupted.
static inline void *utf16_be_get_repl(const void *p, uint *uu, uint repl);
Decode a UTF-16BE character from the range [0, 0xD7FF] or [0xE000, 0x10FFFF]
or return repl if the encoding has been corrupted.
static inline void *utf16_le_get(const void *p, uint *uu);
Decode a UTF-16LE character from the range [0, 0xD7FF] or [0xE000, 0x10FFFF]
or return UNI_REPLACEMENT if the encoding has been corrupted.
static inline void *utf16_be_get(const void *p, uint *uu);
Decode a UTF-16BE character from the range [0, 0xD7FF] or [0xE000, 0x10FFFF]
or return UNI_REPLACEMENT if the encoding has been corrupted.
static inline uint unicode_sanitize_char(uint u);
Basic sanity check on Unicode characters. Return UNI_REPLACEMENT if the input
character is a surrogate, an ASCII or Latin-1 control character other than the tab,
or if it lies outside the basic plane. In all other cases, the character is returned unchanged.
size_t utf8_strlen(const byte *str);
Count the number of Unicode characters in a zero-terminated UTF-8 string. The returned value for a corrupted encoding is undefined, but it is never greater than strlen().
size_t utf8_strnlen(const byte *str, size_t n);
Same as utf8_strlen(), but returns at most n characters.