Wireshark 4.7.0
The Wireshark network protocol analyzer
Loading...
Searching...
No Matches
Functions | Variables
charsets.h File Reference
#include "ws_symbol_export.h"

Go to the source code of this file.

Functions

WS_DLL_PUBLIC uint8_t * get_ascii_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
 Convert an ASCII byte sequence to a UTF‑8 string using a wmem scope.
 
WS_DLL_PUBLIC uint8_t * get_utf_8_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
 Validate and normalize a UTF‑8 byte sequence, replacing invalid sequences with the Unicode REPLACEMENT CHARACTER.
 
WS_DLL_PUBLIC uint8_t * get_iso_646_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length, const gunichar2 table[0x80])
 Convert a string encoded in an ISO 646-based character set to UTF‑8.
 
WS_DLL_PUBLIC uint8_t * get_8859_1_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
 Convert an ISO 8859/1 string to UTF‑8.
 
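WS_DLL_PUBLIC uint8_t * get_unichar2_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length, const gunichar2 table[0x80])
 Convert a string encoded in a single-octet character set to UTF‑8 using a 128-entry translation table.
 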
WS_DLL_PUBLIC uint8_t * get_ucs_2_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length, unsigned encoding)
 Convert a UCS‑2 encoded string to UTF‑8.
 
WS_DLL_PUBLIC uint8_t * get_utf_16_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length, unsigned encoding)
 Convert a UTF‑16 encoded string to UTF‑8.
 
WS_DLL_PUBLIC uint8_t * get_ucs_4_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length, unsigned encoding)
 Convert a UCS‑4 encoded string to UTF‑8.
 
WS_DLL_PUBLIC uint8_t * get_ts_23_038_7bits_string_packed (wmem_allocator_t *scope, const uint8_t *ptr, const size_t bit_offset, size_t no_of_chars)
 Extracts a 7-bit encoded string from packed data.
 
WS_DLL_PUBLIC uint8_t * get_ts_23_038_7bits_string_unpacked (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
 Extracts a 7-bit string from TS 23.038 data.
 
WS_DLL_PUBLIC uint8_t * get_etsi_ts_102_221_annex_a_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
 Retrieves a string based on ETSI TS 102 221 Annex A encoding.
 
WS_DLL_PUBLIC uint8_t * get_ascii_7bits_string (wmem_allocator_t *scope, const uint8_t *ptr, const size_t bit_offset, size_t no_of_chars)
 Convert a packed sequence of 7-bit ASCII characters to a UTF‑8 string.
 
WS_DLL_PUBLIC uint8_t * get_nonascii_unichar2_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length, const gunichar2 table[256])
 Convert a string encoded in a single-octet character set to UTF-8 using a 256-entry translation table.
 
WS_DLL_PUBLIC uint8_t * get_gb18030_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
 Convert a GB18030 encoded string to UTF-8.
 
WS_DLL_PUBLIC uint8_t * get_euc_kr_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
 Convert an EUC-KR encoded string to UTF-8.
 
WS_DLL_PUBLIC uint8_t * get_t61_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
 Converts a T.61 encoded string to a UTF-8 string.
 
WS_DLL_PUBLIC uint8_t * get_dect_standard_8bits_string (wmem_allocator_t *scope, const uint8_t *ptr, size_t length)
 Converts a DECT standard 8-bit string to a UTF-8 string.
 

Variables

const gunichar2 charset_table_cp1250 [0x80]
 
const gunichar2 charset_table_cp1251 [0x80]
 
const gunichar2 charset_table_cp1252 [0x80]
 
const gunichar2 charset_table_iso_8859_2 [0x80]
 
const gunichar2 charset_table_iso_8859_3 [0x80]
 
const gunichar2 charset_table_iso_8859_4 [0x80]
 
const gunichar2 charset_table_iso_8859_5 [0x80]
 
const gunichar2 charset_table_iso_8859_6 [0x80]
 
const gunichar2 charset_table_iso_8859_7 [0x80]
 
const gunichar2 charset_table_iso_8859_8 [0x80]
 
const gunichar2 charset_table_iso_8859_9 [0x80]
 
const gunichar2 charset_table_iso_8859_10 [0x80]
 
const gunichar2 charset_table_iso_8859_11 [0x80]
 
const gunichar2 charset_table_iso_8859_13 [0x80]
 
const gunichar2 charset_table_iso_8859_14 [0x80]
 
const gunichar2 charset_table_iso_8859_15 [0x80]
 
const gunichar2 charset_table_iso_8859_16 [0x80]
 
const gunichar2 charset_table_mac_roman [0x80]
 
const gunichar2 charset_table_cp437 [0x80]
 
const gunichar2 charset_table_cp855 [0x80]
 
const gunichar2 charset_table_cp866 [0x80]
 
const gunichar2 charset_table_iso_646_basic [0x80]
 
const gunichar2 charset_table_ebcdic [256]
 
const gunichar2 charset_table_ebcdic_cp037 [256]
 
const gunichar2 charset_table_ebcdic_cp500 [256]
 

Detailed Description

Routines for handling character sets

Wireshark - Network traffic analyzer. By Gerald Combs <gerald@wireshark.org>. Copyright 1998 Gerald Combs

SPDX-License-Identifier: GPL-2.0-or-later

Function Documentation

◆ get_8859_1_string()

WS_DLL_PUBLIC uint8_t * get_8859_1_string ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
size_t  length 
)

Convert an ISO 8859/1 string to UTF‑8.

Given a wmem scope, a pointer, and a length, treat the string of bytes referred to by the pointer and length as an ISO 8859/1 string, and return a pointer to a UTF-8 string, allocated using the wmem scope.

Parameters
scope: The wmem allocator scope used for the returned UTF‑8 string.
ptr: Pointer to the ISO 8859/1 sequence.
length: Number of bytes to process from the sequence.
Returns
A UTF‑8 string allocated in the given scope.

◆ get_ascii_7bits_string()

WS_DLL_PUBLIC uint8_t * get_ascii_7bits_string ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
const size_t  bit_offset,
size_t  no_of_chars 
)

Convert a packed sequence of 7-bit ASCII characters to a UTF‑8 string.

Parameters
scope: Memory allocation scope for the resulting string buffer.
ptr: Pointer to the input byte array.
bit_offset: Bit offset within the first byte of the input data.
no_of_chars: Number of characters to convert from the input data.
Returns
A new string buffer containing the converted UTF‑8 string.

◆ get_ascii_string()

WS_DLL_PUBLIC uint8_t * get_ascii_string ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
size_t  length 
)

Convert an ASCII byte sequence to a UTF‑8 string using a wmem scope.

Given a wmem scope, a pointer, and a length, treat the referenced bytes as an ASCII string, with any byte having the high‑order bit set considered invalid, and return a UTF‑8 string allocated using the wmem scope.

Octets with the high‑order bit set are converted to the Unicode REPLACEMENT CHARACTER.

Parameters
scope: The wmem allocator scope used for the returned UTF‑8 string.
ptr: Pointer to the ASCII byte sequence.
length: Number of bytes to process from the sequence.
Returns
A UTF‑8 string allocated in the given scope.

◆ get_dect_standard_8bits_string()

WS_DLL_PUBLIC uint8_t * get_dect_standard_8bits_string ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
size_t  length 
)

Converts a DECT standard 8-bit string to a UTF-8 string.

Parameters
scope: Memory allocation scope for the resulting string buffer.
ptr: Pointer to the input 8-bit string data.
length: Length of the input string data in bytes.
Returns
A pointer to the converted UTF-8 string, or NULL on failure.

◆ get_etsi_ts_102_221_annex_a_string()

WS_DLL_PUBLIC uint8_t * get_etsi_ts_102_221_annex_a_string ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
size_t  length 
)

Retrieves a string based on ETSI TS 102 221 Annex A encoding.

Parameters
scope: Memory allocation scope for the resulting string.
ptr: Pointer to the input data.
length: Length of the input data in bytes.
Returns
Pointer to the allocated string, or an empty string if input is invalid.

◆ get_euc_kr_string()

WS_DLL_PUBLIC uint8_t * get_euc_kr_string ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
size_t  length 
)

Convert an EUC-KR encoded string to UTF-8.

Given a wmem scope, a pointer, and a length, treat the bytes referred to by the pointer and length as an EUC-KR encoded string, and return a pointer to a UTF-8 string, allocated using the wmem scope, with sequences that cannot be converted replaced by REPLACEMENT CHARACTER (U+FFFD) as described in the Unicode Standard, section 5.22, "U+FFFD Substitution in Conversion". ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )

Parameters
scope: Memory allocation scope for the returned string.
ptr: Pointer to the input EUC-KR encoded string.
length: Length of the input string in bytes.
Returns
Pointer to the allocated UTF-8 encoded string, or NULL on failure.

◆ get_gb18030_string()

WS_DLL_PUBLIC uint8_t * get_gb18030_string ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
size_t  length 
)

Convert a GB18030 encoded string to UTF-8.

Given a wmem scope, a pointer, and a length, treat the bytes referred to by the pointer and length as a GB18030 encoded string, and return a pointer to a UTF-8 string, allocated using the wmem scope, with sequences that cannot be converted replaced by REPLACEMENT CHARACTER (U+FFFD) as described in the Unicode Standard, section 5.22, "U+FFFD Substitution in Conversion". ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )

As expected, this will also decode GBK and GB2312 strings.

Parameters
scope: Memory allocation scope for the returned string.
ptr: Pointer to the input GB18030 encoded string.
length: Length of the input string in bytes.
Returns
A pointer to a UTF-8 encoded string, or NULL on failure.

◆ get_iso_646_string()

WS_DLL_PUBLIC uint8_t * get_iso_646_string ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
size_t  length,
const gunichar2  table[0x80] 
)

Convert a string encoded in an ISO 646-based character set to UTF‑8.

Given a wmem scope, a pointer, a length, and a translation table, treat the string of bytes referred to by the pointer and length as a string encoded using one octet per character, with octets with the high-order bit clear being mapped by the translation table to 2-byte Unicode Basic Multilingual Plane characters (including REPLACEMENT CHARACTER) and octets with the high-order bit set being mapped to REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string, allocated using the wmem scope.

Parameters
scope: The wmem allocator scope used for the returned UTF‑8 string.
ptr: Pointer to the ISO 646 sequence.
length: Number of bytes to process from the sequence.
table: Translation table with 128 entries mapping octets with the high-order bit clear to Unicode code points.
Returns
A UTF‑8 string allocated in the given scope.

◆ get_nonascii_unichar2_string()

WS_DLL_PUBLIC uint8_t * get_nonascii_unichar2_string ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
size_t  length,
const gunichar2  table[256] 
)

Convert a string encoded in a single-octet character set to UTF-8 using a 256-entry translation table.

Given a wmem scope, a pointer, a length, and a translation table with 256 entries, treat the string of bytes referred to by the pointer and length as a string encoded using one octet per character, with octets being mapped by the translation table to 2-byte Unicode Basic Multilingual Plane characters (including REPLACEMENT CHARACTER), and return a pointer to a UTF-8 string, allocated using the wmem scope.

Parameters
scope: Memory allocation scope for the resulting string.
ptr: Pointer to the input byte sequence, encoded using one octet per character.
length: Length of the input string in bytes.
table: Translation table with 256 entries mapping each octet to a Unicode Basic Multilingual Plane code point.
Returns
A pointer to a UTF-8 encoded string allocated using the provided wmem scope, or NULL on failure.

◆ get_t61_string()

WS_DLL_PUBLIC uint8_t * get_t61_string ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
size_t  length 
)

Converts a T.61 encoded string to a UTF-8 string.

Parameters
scope: Memory allocation scope for the resulting string.
ptr: Pointer to the input T.61 encoded data.
length: Length of the input data in bytes.
Returns
Pointer to the newly allocated UTF-8 encoded string, or NULL on failure.

◆ get_ts_23_038_7bits_string_packed()

WS_DLL_PUBLIC uint8_t * get_ts_23_038_7bits_string_packed ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
const size_t  bit_offset,
size_t  no_of_chars 
)

Extracts a 7-bit encoded string from packed data.

This function extracts a sequence of 7-bit encoded characters from a packed byte array, starting at a specified bit offset and unpacking them into a new buffer.

Parameters
scope: Memory allocator for the returned string.
ptr: Pointer to the packed data.
bit_offset: Bit offset within the first byte where extraction begins.
no_of_chars: Number of characters to extract.
Returns
A pointer to the newly allocated unpacked string, or NULL on failure.

◆ get_ts_23_038_7bits_string_unpacked()

WS_DLL_PUBLIC uint8_t * get_ts_23_038_7bits_string_unpacked ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
size_t  length 
)

Extracts a 7-bit string from TS 23.038 data.

Parameters
scope: Memory allocation scope for the resulting string.
ptr: Pointer to the input data buffer.
length: Length of the input data buffer in bytes.
Returns
Pointer to the extracted 7-bit string.

◆ get_ucs_2_string()

WS_DLL_PUBLIC uint8_t * get_ucs_2_string ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
size_t  length,
unsigned  encoding 
)

Convert a UCS‑2 encoded string to UTF‑8.

Given a wmem scope, a pointer, and a length, treat the string of bytes referred to by the pointer and length as a UCS-2 encoded string containing characters from the Basic Multilingual Plane (plane 0) of Unicode, and return a pointer to a UTF-8 string, allocated with the wmem scope.

Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN, possibly ORed with ENC_BOM.

Specify length in bytes.

Parameters
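scope: The wmem allocator scope used for the returned UTF‑8 string.
ptr: Pointer to the UCS-2 sequence.
length: Number of bytes to process from the sequence.
encoding: Byte order of the input: ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN, possibly ORed with ENC_BOM.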
Returns
A UTF‑8 string allocated in the given scope.

◆ get_ucs_4_string()

WS_DLL_PUBLIC uint8_t * get_ucs_4_string ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
size_t  length,
unsigned  encoding 
)

Convert a UCS‑4 encoded string to UTF‑8.

Given a wmem scope, a pointer, and a length, treat the string of bytes referred to by the pointer and length as a UCS-4 encoded string, and return a pointer to a UTF-8 string, allocated with the wmem scope.

Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN, possibly ORed with ENC_BOM.

Specify length in bytes.

Parameters
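scope: The wmem allocator scope used for the returned UTF‑8 string.
ptr: Pointer to the UCS-4 sequence.
length: Number of bytes to process from the sequence.
encoding: Byte order of the input: ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN, possibly ORed with ENC_BOM.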
Returns
A UTF‑8 string allocated in the given scope.

◆ get_utf_16_string()

WS_DLL_PUBLIC uint8_t * get_utf_16_string ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
size_t  length,
unsigned  encoding 
)

Convert a UTF‑16 encoded string to UTF‑8.

Given a wmem scope, a pointer, and a length, treat the string of bytes referred to by the pointer and length as a UTF-16 encoded string, and return a pointer to a UTF-8 string, allocated with the wmem scope.

See RFC 2781 section 2.2.

Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN, possibly ORed with ENC_BOM.

Specify length in bytes.

Parameters
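scope: The wmem allocator scope used for the returned UTF‑8 string.
ptr: Pointer to the UTF‑16 sequence.
length: Number of bytes to process from the sequence.
encoding: Byte order of the input: ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN, possibly ORed with ENC_BOM.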
Returns
A UTF‑8 string allocated in the given scope.

◆ get_utf_8_string()

WS_DLL_PUBLIC uint8_t * get_utf_8_string ( wmem_allocator_t *  scope,
const uint8_t *  ptr,
size_t  length 
)

Validate and normalize a UTF‑8 byte sequence, replacing invalid sequences with the Unicode REPLACEMENT CHARACTER.

Given a wmem scope, a pointer, and a length, treat the string of bytes referred to by the pointer and length as a UTF-8 string, and return a pointer to a UTF-8 string, allocated using the wmem scope, with all ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER according to the recommended "best practices" given in the Unicode Standard and specified by W3C/WHATWG.

Parameters
scope: The wmem allocator scope used for the returned UTF‑8 string.
ptr: Pointer to the UTF‑8 byte sequence.
length: Number of bytes to process from the sequence.
Returns
A UTF‑8 string allocated in the given scope.