#include "ws_symbol_export.h"

Functions
WS_DLL_PUBLIC uint8_t *	get_ascii_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
	Convert an ASCII byte sequence to a UTF‑8 string using a wmem scope.

WS_DLL_PUBLIC uint8_t *	get_utf_8_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
	Validate and normalize a UTF‑8 byte sequence, replacing invalid sequences with the Unicode REPLACEMENT CHARACTER.

WS_DLL_PUBLIC uint8_t *	get_iso_646_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length, const gunichar2 table[0x80])
	Convert a string encoded in an ISO 646-based character set to UTF‑8.

WS_DLL_PUBLIC uint8_t *	get_8859_1_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
	Convert an ISO 8859/1 string to UTF‑8.

WS_DLL_PUBLIC uint8_t *	get_unichar2_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length, const gunichar2 table[0x80])

WS_DLL_PUBLIC uint8_t *	get_ucs_2_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length, unsigned encoding)
	Convert a UCS‑2 encoded string to UTF‑8.

WS_DLL_PUBLIC uint8_t *	get_utf_16_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length, unsigned encoding)
	Convert a UTF‑16 encoded string to UTF‑8.

WS_DLL_PUBLIC uint8_t *	get_ucs_4_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length, unsigned encoding)
	Convert a UCS‑4 encoded string to UTF‑8.

WS_DLL_PUBLIC uint8_t *	get_ts_23_038_7bits_string_packed (wmem_allocator_t *scope, const uint8_t *ptr, const size_t bit_offset, size_t no_of_chars)
	Extracts a 7-bit encoded string from packed data.

WS_DLL_PUBLIC uint8_t *	get_ts_23_038_7bits_string_unpacked (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
	Extracts a 7-bit string from TS 23.038 data.

WS_DLL_PUBLIC uint8_t *	get_etsi_ts_102_221_annex_a_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
	Retrieves a string based on ETSI TS 102 221 Annex A encoding.

WS_DLL_PUBLIC uint8_t *	get_ascii_7bits_string (wmem_allocator_t *scope, const uint8_t *ptr, const size_t bit_offset, size_t no_of_chars)
	Convert a sequence of 7-bit ASCII characters to a Unicode string.

WS_DLL_PUBLIC uint8_t *	get_nonascii_unichar2_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length, const gunichar2 table[256])
	Convert a string encoded using one octet per character to UTF-8, mapping every octet through a 256-entry translation table.

WS_DLL_PUBLIC uint8_t *	get_gb18030_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
	Convert a GB18030 encoded string to UTF-8.

WS_DLL_PUBLIC uint8_t *	get_euc_kr_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
	Convert an EUC-KR encoded string to UTF-8.

WS_DLL_PUBLIC uint8_t *	get_t61_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
	Converts a T.61 encoded string to a UTF-8 string.

WS_DLL_PUBLIC uint8_t *	get_dect_standard_8bits_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
	Converts a DECT standard 8-bit string to a Unicode string.

Variables
const gunichar2	charset_table_cp1250 [0x80]

const gunichar2	charset_table_cp1251 [0x80]

const gunichar2	charset_table_cp1252 [0x80]

const gunichar2	charset_table_iso_8859_2 [0x80]

const gunichar2	charset_table_iso_8859_3 [0x80]

const gunichar2	charset_table_iso_8859_4 [0x80]

const gunichar2	charset_table_iso_8859_5 [0x80]

const gunichar2	charset_table_iso_8859_6 [0x80]

const gunichar2	charset_table_iso_8859_7 [0x80]

const gunichar2	charset_table_iso_8859_8 [0x80]

const gunichar2	charset_table_iso_8859_9 [0x80]

const gunichar2	charset_table_iso_8859_10 [0x80]

const gunichar2	charset_table_iso_8859_11 [0x80]

const gunichar2	charset_table_iso_8859_13 [0x80]

const gunichar2	charset_table_iso_8859_14 [0x80]

const gunichar2	charset_table_iso_8859_15 [0x80]

const gunichar2	charset_table_iso_8859_16 [0x80]

const gunichar2	charset_table_mac_roman [0x80]

const gunichar2	charset_table_cp437 [0x80]

const gunichar2	charset_table_cp855 [0x80]

const gunichar2	charset_table_cp866 [0x80]

const gunichar2	charset_table_iso_646_basic [0x80]

const gunichar2	charset_table_ebcdic [256]

const gunichar2	charset_table_ebcdic_cp037 [256]

const gunichar2	charset_table_ebcdic_cp500 [256]

Detailed Description

Routines for handling character sets

SPDX-License-Identifier: GPL-2.0-or-later

Function Documentation

◆ get_8859_1_string()

WS_DLL_PUBLIC uint8_t * get_8859_1_string	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		size_t	length
	)

Convert an ISO 8859/1 string to UTF‑8.

Given a wmem scope, a pointer, and a length, treat the string of bytes referred to by the pointer and length as an ISO 8859/1 string, and return a pointer to a UTF-8 string, allocated using the wmem scope.

Parameters

scope	The wmem allocator scope used for the returned UTF‑8 string.
ptr	Pointer to the ISO 8859/1 sequence.
length	Number of bytes to process from the sequence.

Returns: A UTF‑8 string allocated in the given scope.

◆ get_ascii_7bits_string()

WS_DLL_PUBLIC uint8_t * get_ascii_7bits_string	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		const size_t	bit_offset,
		size_t	no_of_chars
	)

Convert a sequence of 7-bit ASCII characters to a Unicode string.

Parameters

scope	Memory allocation scope for the resulting string buffer.
ptr	Pointer to the input byte array.
bit_offset	Bit offset within the first byte of the input data.
no_of_chars	Number of characters to convert from the input data.

Returns: A new string buffer containing the converted Unicode string.

◆ get_ascii_string()

WS_DLL_PUBLIC uint8_t * get_ascii_string	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		size_t	length
	)

Convert an ASCII byte sequence to a UTF‑8 string using a wmem scope.

Given a wmem scope, a pointer, and a length, treat the referenced bytes as an ASCII string, with any byte having the high‑order bit set considered invalid, and return a UTF‑8 string allocated using the wmem scope.

Octets with the high‑order bit set are converted to the Unicode REPLACEMENT CHARACTER.

Parameters

scope	The wmem allocator scope used for the returned UTF‑8 string.
ptr	Pointer to the ASCII byte sequence.
length	Number of bytes to process from the sequence.

Returns: A UTF‑8 string allocated in the given scope.

◆ get_dect_standard_8bits_string()

WS_DLL_PUBLIC uint8_t * get_dect_standard_8bits_string	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		size_t	length
	)

Converts a DECT standard 8-bit string to a Unicode string.

Parameters

scope	Memory allocation scope for the resulting string buffer.
ptr	Pointer to the input 8-bit string data.
length	Length of the input string data in bytes.

Returns: A pointer to the converted Unicode string, or NULL on failure.

◆ get_etsi_ts_102_221_annex_a_string()

WS_DLL_PUBLIC uint8_t * get_etsi_ts_102_221_annex_a_string	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		size_t	length
	)

Retrieves a string based on ETSI TS 102 221 Annex A encoding.

Parameters

scope	Memory allocation scope for the resulting string.
ptr	Pointer to the input data.
length	Length of the input data.

Returns: Pointer to the allocated string, or an empty string if input is invalid.

◆ get_euc_kr_string()

WS_DLL_PUBLIC uint8_t * get_euc_kr_string	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		size_t	length
	)

Convert an EUC-KR encoded string to UTF-8.

Given a wmem scope, a pointer, and a length, treat the bytes referred to by the pointer and length as an EUC-KR encoded string, and return a pointer to a UTF-8 string, allocated using the wmem scope, with REPLACEMENT CHARACTER substituted during conversion according to the Unicode Standard 5.22 U+FFFD Substitution for Conversion. ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )

Parameters

scope	Memory allocation scope for the returned string.
ptr	Pointer to the input EUC-KR encoded string.
length	Length of the input string.

Returns: Pointer to the allocated UTF-8 encoded string, or NULL on failure.

◆ get_gb18030_string()

WS_DLL_PUBLIC uint8_t * get_gb18030_string	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		size_t	length
	)

Convert a GB18030 encoded string to UTF-8.

Given a wmem scope, a pointer, and a length, treat the bytes referred to by the pointer and length as a GB18030 encoded string, and return a pointer to a UTF-8 string, allocated using the wmem scope, with REPLACEMENT CHARACTER substituted during conversion according to the Unicode Standard 5.22 U+FFFD Substitution for Conversion. ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )

As expected, this will also decode GBK and GB2312 strings.

Parameters

scope	Memory allocation scope.
ptr	Pointer to the input GB18030 encoded string.
length	Length of the input string.

Returns: A pointer to a UTF-8 encoded string, or NULL on failure.

◆ get_iso_646_string()

WS_DLL_PUBLIC uint8_t * get_iso_646_string	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		size_t	length,
		const gunichar2	table[0x80]
	)

Convert a string encoded in an ISO 646-based character set to UTF‑8.

Given a wmem scope, a pointer, a length, and a translation table, treat the string of bytes referred to by the pointer and length as a string encoded using one octet per character, with octets with the high-order bit clear being mapped by the translation table to 2-byte Unicode Basic Multilingual Plane characters (including REPLACEMENT CHARACTER) and octets with the high-order bit set being mapped to REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string, allocated using the wmem scope.

Parameters

scope	The wmem allocator scope used for the returned UTF‑8 string.
ptr	Pointer to the ISO 646 sequence.
length	Number of bytes to process from the sequence.
table	Translation table with 128 entries mapping octets with the high-order bit clear to Unicode code points.

Returns: A UTF‑8 string allocated in the given scope.

◆ get_nonascii_unichar2_string()

WS_DLL_PUBLIC uint8_t * get_nonascii_unichar2_string	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		size_t	length,
		const gunichar2	table[256]
	)

Convert a string encoded using one octet per character to UTF-8, mapping every octet through a 256-entry translation table.

Given a wmem scope, a pointer, a length, and a translation table with 256 entries, treat the string of bytes referred to by the pointer and length as a string encoded using one octet per character, with octets being mapped by the translation table to 2-byte Unicode Basic Multilingual Plane characters (including REPLACEMENT CHARACTER), and return a pointer to a UTF-8 string, allocated using the wmem scope.

Parameters

scope	Memory allocation scope for the resulting string.
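ptr	Pointer to the input byte sequence, encoded using one octet per character.
length	Length of the input string in bytes.
table	Translation table with 256 entries mapping each octet to a 2-byte Unicode Basic Multilingual Plane character (including REPLACEMENT CHARACTER).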

Returns: A pointer to a UTF-8 encoded string allocated using the provided wmem scope, or NULL on failure.

◆ get_t61_string()

WS_DLL_PUBLIC uint8_t * get_t61_string	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		size_t	length
	)

Converts a T.61 encoded string to a UTF-8 string.

Parameters

scope	Memory allocation scope for the resulting string.
ptr	Pointer to the input T.61 encoded data.
length	Length of the input data in bytes.

Returns: Pointer to the newly allocated UTF-8 encoded string, or NULL on failure.

◆ get_ts_23_038_7bits_string_packed()

WS_DLL_PUBLIC uint8_t * get_ts_23_038_7bits_string_packed	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		const size_t	bit_offset,
		size_t	no_of_chars
	)

Extracts a 7-bit encoded string from packed data.

This function extracts a sequence of 7-bit encoded characters from a packed byte array, starting at a specified bit offset and unpacking them into a new buffer.

Parameters

scope	Memory allocator for the returned string.
ptr	Pointer to the packed data.
bit_offset	Bit offset within the first byte where extraction begins.
no_of_chars	Number of characters to extract.

Returns: A pointer to the newly allocated unpacked string, or NULL on failure.

◆ get_ts_23_038_7bits_string_unpacked()

WS_DLL_PUBLIC uint8_t * get_ts_23_038_7bits_string_unpacked	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		size_t	length
	)

Extracts a 7-bit string from TS 23.038 data.

Parameters

scope	Memory allocation scope for the resulting string.
ptr	Pointer to the input data buffer.
length	Length of the input data buffer.

Returns: Pointer to the extracted 7-bit string.

◆ get_ucs_2_string()

WS_DLL_PUBLIC uint8_t * get_ucs_2_string	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		size_t	length,
		unsigned	encoding
	)

Convert a UCS‑2 encoded string to UTF‑8.

Given a wmem scope, a pointer, and a length, treat the string of bytes referred to by the pointer and length as a UCS-2 encoded string containing characters from the Basic Multilingual Plane (plane 0) of Unicode, and return a pointer to a UTF-8 string, allocated with the wmem scope.

Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN, possibly ORed with ENC_BOM.

Specify length in bytes.

Parameters

scope	The wmem allocator scope used for the returned UTF‑8 string.
ptr	Pointer to the UCS-2 sequence.
length	Number of bytes to process from the sequence.
encoding	ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN, possibly ORed with ENC_BOM.

Returns: A UTF‑8 string allocated in the given scope.

◆ get_ucs_4_string()

WS_DLL_PUBLIC uint8_t * get_ucs_4_string	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		size_t	length,
		unsigned	encoding
	)

Convert a UCS‑4 encoded string to UTF‑8.

Given a wmem scope, a pointer, and a length, treat the string of bytes referred to by the pointer and length as a UCS-4 encoded string, and return a pointer to a UTF-8 string, allocated with the wmem scope.

Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN, possibly ORed with ENC_BOM.

Specify length in bytes.

Parameters

scope	The wmem allocator scope used for the returned UTF‑8 string.
ptr	Pointer to the UCS-4 sequence.
length	Number of bytes to process from the sequence.
encoding	ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN, possibly ORed with ENC_BOM.

Returns: A UTF‑8 string allocated in the given scope.

◆ get_utf_16_string()

WS_DLL_PUBLIC uint8_t * get_utf_16_string	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		size_t	length,
		unsigned	encoding
	)

Convert a UTF‑16 encoded string to UTF‑8.

Given a wmem scope, a pointer, and a length, treat the string of bytes referred to by the pointer and length as a UTF-16 encoded string, and return a pointer to a UTF-8 string, allocated with the wmem scope.

See RFC 2781 section 2.2.

Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN, possibly ORed with ENC_BOM.

Specify length in bytes.

Parameters

scope	The wmem allocator scope used for the returned UTF‑8 string.
ptr	Pointer to the UTF‑16 sequence.
length	Number of bytes to process from the sequence.
encoding	ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN, possibly ORed with ENC_BOM.

Returns: A UTF‑8 string allocated in the given scope.

◆ get_utf_8_string()

WS_DLL_PUBLIC uint8_t * get_utf_8_string	(	wmem_allocator_t *	scope,
		const uint8_t *	ptr,
		size_t	length
	)

Validate and normalize a UTF‑8 byte sequence, replacing invalid sequences with the Unicode REPLACEMENT CHARACTER.

Given a wmem scope, a pointer, and a length, treat the string of bytes referred to by the pointer and length as a UTF-8 string, and return a pointer to a UTF-8 string, allocated using the wmem scope, with all ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER according to the recommended "best practices" given in the Unicode Standard and specified by W3C/WHATWG.

Parameters

scope	The wmem allocator scope used for the returned UTF‑8 string.
ptr	Pointer to the UTF‑8 byte sequence.
length	Number of bytes to process from the sequence.

Returns: A UTF‑8 string allocated in the given scope.

Functions

Variables

Detailed Description

Function Documentation

◆ get_8859_1_string()

◆ get_ascii_7bits_string()

◆ get_ascii_string()

◆ get_dect_standard_8bits_string()

◆ get_etsi_ts_102_221_annex_a_string()

◆ get_euc_kr_string()

◆ get_gb18030_string()

◆ get_iso_646_string()

◆ get_nonascii_unichar2_string()

◆ get_t61_string()

◆ get_ts_23_038_7bits_string_packed()

◆ get_ts_23_038_7bits_string_unpacked()

◆ get_ucs_2_string()

◆ get_ucs_4_string()

◆ get_utf_16_string()

◆ get_utf_8_string()