#define UNI_REPLACEMENT 0xfffc
Unicode value used as a default replacement of invalid characters.
static inline byte *utf8_put(byte *p, uint u);
Encode a value from the range [0, 0xFFFF] (basic multilingual plane); up to 3 bytes needed (RFC2279).
static inline byte *utf8_32_put(byte *p, uint u);
Encode a value from the range [0, 0x7FFFFFFF]; (superset of Unicode 4.0) up to 6 bytes needed (RFC2279).
static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl);
Decode a value from the range [0, 0xFFFF] (basic multilingual plane) or return repl if the encoding has been corrupted.
static inline byte *utf8_32_get_repl(const byte *p, uint *uu, uint repl);
Decode a value from the range [0, 0x7FFFFFFF] or return repl if the encoding has been corrupted.
static inline byte *utf8_get(const byte *p, uint *uu);
Decode a value from the range [0, 0xFFFF] (basic multilingual plane) or return UNI_REPLACEMENT if the encoding has been corrupted.
static inline byte *utf8_32_get(const byte *p, uint *uu);
Decode a value from the range [0, 0x7FFFFFFF] or return UNI_REPLACEMENT if the encoding has been corrupted.
static inline uint utf8_space(uint u);
Return the number of bytes needed to encode a given value from the range [0, 0x7FFFFFFF] to UTF-8.
static inline uint utf8_encoding_len(uint c);
Compute the length of a single UTF-8 character from its first byte. The encoding must be valid.
#define UTF8_MAX_LEN 6
Maximum number of bytes a UTF-8 character can have.
static inline void *utf16_le_put(void *p, uint u);
Encode a UTF-16LE character from the range [0, 0xD7FF] or [0xE000, 0x10FFFF]; up to 4 bytes needed.
static inline void *utf16_be_put(void *p, uint u);
Encode a UTF-16BE character from the range [0, 0xD7FF] or [0xE000, 0x10FFFF]; up to 4 bytes needed.
static inline void *utf16_le_get_repl(const void *p, uint *uu, uint repl);
Decode a UTF-16LE character from the range [0, 0xD7FF] or [0xE000, 0x10FFFF] or return repl if the encoding has been corrupted.
static inline void *utf16_be_get_repl(const void *p, uint *uu, uint repl);
Decode a UTF-16BE character from the range [0, 0xD7FF] or [0xE000, 0x10FFFF] or return repl if the encoding has been corrupted.
static inline void *utf16_le_get(const void *p, uint *uu);
Decode a UTF-16LE character from the range [0, 0xD7FF] or [0xE000, 0x10FFFF] or return UNI_REPLACEMENT if the encoding has been corrupted.
static inline void *utf16_be_get(const void *p, uint *uu);
Decode a UTF-16BE character from the range [0, 0xD7FF] or [0xE000, 0x10FFFF] or return UNI_REPLACEMENT if the encoding has been corrupted.
static inline uint unicode_sanitize_char(uint u);
Basic sanity check on Unicode characters. Return UNI_REPLACEMENT if the input character is a surrogate, if it is an ASCII or Latin-1 control character other than the tab, or if it lies outside the basic plane. In all other cases, the function acts as an identity.
size_t utf8_strlen(const byte *str);
Count the number of Unicode characters in a zero-terminated UTF-8 string. The returned value for a corrupted encoding is undefined, but it is never greater than strlen().
size_t utf8_strnlen(const byte *str, size_t n);
Same as utf8_strlen(), but returns at most n characters.