blob: 76e9b2af03f3d513b08e9f3dedf2de03b26d810b [file] [log] [blame]
// Copyright 2022 The Dawn & Tint Authors
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef SRC_TINT_UTILS_TEXT_UNICODE_H_
#define SRC_TINT_UTILS_TEXT_UNICODE_H_
#include <cstddef>
#include <cstdint>
#include <string_view>
#include <utility>
namespace tint {
/// CodePoint is a unicode code point.
struct CodePoint {
/// Constructor
inline CodePoint() = default;
/// Constructor
/// @param v the code point value
inline explicit CodePoint(uint32_t v) : value(v) {}
/// @returns the code point value
inline operator uint32_t() const { return value; }
/// Assignment operator
/// @param v the new value for the code point
/// @returns this CodePoint
inline CodePoint& operator=(uint32_t v) {
value = v;
return *this;
}
/// @returns true if this CodePoint is in the XID_Start set.
/// @see https://unicode.org/reports/tr31/
bool IsXIDStart() const;
/// @returns true if this CodePoint is in the XID_Continue set.
/// @see https://unicode.org/reports/tr31/
bool IsXIDContinue() const;
/// The code point value
uint32_t value = 0;
};
namespace utf8 {
/// Decodes the first code point in the utf8 string.
/// @param ptr the pointer to the first byte of the utf8 sequence
/// @param len the maximum number of uint8_t to read
/// @returns a pair of CodePoint and width in code units (uint8_t).
/// If the next code point cannot be decoded then returns [0,0].
std::pair<CodePoint, size_t> Decode(const uint8_t* ptr, size_t len);
/// Decodes the first code point in the utf8 string.
/// @param utf8_string the string view that contains the utf8 sequence
/// @returns a pair of CodePoint and width in code units (uint8_t).
/// If the next code point cannot be decoded, then returns [0,0].
std::pair<CodePoint, size_t> Decode(std::string_view utf8_string);
/// Encodes a code point to the utf8 string buffer or queries the number of code units used to
/// encode the code point.
/// @param code_point the code point to encode.
/// @param ptr the pointer to the utf8 string buffer, or nullptr to query the number of code units
/// that would be written if @p ptr is not nullptr.
/// @returns the number of code units written / would be written (at most 4).
size_t Encode(CodePoint code_point, uint8_t* ptr);
/// @returns true if all the utf-8 code points in the string are ASCII
/// (code-points 0x00..0x7f).
bool IsASCII(std::string_view);
} // namespace utf8
namespace utf16 {
/// Decodes the first code point in the utf16 string.
/// @param ptr the pointer to the first byte of the utf16 sequence
/// @param len the maximum number of code units to read
/// @returns a pair of CodePoint and width in code units (16-bit integers).
/// If the next code point cannot be decoded then returns [0,0].
std::pair<CodePoint, size_t> Decode(const uint16_t* ptr, size_t len);
/// Decodes the first code point in the utf16 string.
/// @param utf16_string the string view that contains the utf16 sequence
/// @returns a pair of CodePoint and width in code units (16-bit integers).
/// If the next code point cannot be decoded then returns [0,0].
std::pair<CodePoint, size_t> Decode(std::string_view utf16_string);
/// Encodes a code point to the utf16 string buffer or queries the number of code units used to
/// encode the code point.
/// @param code_point the code point to encode.
/// @param ptr the pointer to the utf16 string buffer, or nullptr to query the number of code units
/// that would be written if @p ptr is not nullptr.
/// @returns the number of code units written / would be written (at most 2).
size_t Encode(CodePoint code_point, uint16_t* ptr);
} // namespace utf16
} // namespace tint
#endif // SRC_TINT_UTILS_TEXT_UNICODE_H_