blob: c9939961e7a78570ea38aef2e9c568216811c9bf [file] [log] [blame]
// Copyright 2022 The Dawn & Tint Authors
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
//    list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
//    this list of conditions and the following disclaimer in the documentation
//    and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
//    contributors may be used to endorse or promote products derived from
//    this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Ryan Harrisondbc13af2022-02-21 15:19:07 +000027
dan sinclair22b4dd22023-07-21 00:40:07 +000028#ifndef SRC_TINT_UTILS_TEXT_UNICODE_H_
29#define SRC_TINT_UTILS_TEXT_UNICODE_H_
Ryan Harrisondbc13af2022-02-21 15:19:07 +000030
31#include <cstddef>
32#include <cstdint>
dan sinclair6cc183c2023-03-02 21:28:45 +000033#include <string_view>
Ryan Harrisondbc13af2022-02-21 15:19:07 +000034#include <utility>
35
dan sinclairbae54e72023-07-28 15:01:54 +000036namespace tint {
Ryan Harrisondbc13af2022-02-21 15:19:07 +000037
/// CodePoint is a unicode code point.
/// It is a thin wrapper around a `uint32_t` that converts implicitly back to
/// `uint32_t`, but must be constructed explicitly from one.
struct CodePoint {
    /// Constructor
    /// The code point value is initialized to 0.
    inline CodePoint() = default;

    /// Constructor
    /// @param v the code point value
    inline explicit CodePoint(uint32_t v) : value(v) {}

    /// Implicit conversion to the underlying integer value.
    /// @returns the code point value
    inline operator uint32_t() const { return value; }

    /// Assignment operator
    /// @param v the new value for the code point
    /// @returns this CodePoint
    inline CodePoint& operator=(uint32_t v) {
        value = v;
        return *this;
    }

    /// @returns true if this CodePoint is in the XID_Start set.
    /// @see https://unicode.org/reports/tr31/
    bool IsXIDStart() const;

    /// @returns true if this CodePoint is in the XID_Continue set.
    /// @see https://unicode.org/reports/tr31/
    bool IsXIDContinue() const;

    /// The code point value
    uint32_t value = 0;
};
69
Ryan Harrisondbc13af2022-02-21 15:19:07 +000070namespace utf8 {
71
72/// Decodes the first code point in the utf8 string.
73/// @param ptr the pointer to the first byte of the utf8 sequence
74/// @param len the maximum number of bytes to read
75/// @returns a pair of CodePoint and width in code units (bytes).
76/// If the next code point cannot be decoded then returns [0,0].
77std::pair<CodePoint, size_t> Decode(const uint8_t* ptr, size_t len);
78
Jiawei Shaof7beb852023-01-10 00:03:24 +000079/// Decodes the first code point in the utf8 string.
80/// @param utf8_string the string view that contains the utf8 sequence
81/// @returns a pair of CodePoint and width in code units (bytes).
82/// If the next code point cannot be decoded then returns [0,0].
83std::pair<CodePoint, size_t> Decode(std::string_view utf8_string);
84
Ryan Harrisondbc13af2022-02-21 15:19:07 +000085/// @returns true if all the utf-8 code points in the string are ASCII
86/// (code-points 0x00..0x7f).
87bool IsASCII(std::string_view);
88
89} // namespace utf8
90
dan sinclairbae54e72023-07-28 15:01:54 +000091} // namespace tint
Ryan Harrisondbc13af2022-02-21 15:19:07 +000092
dan sinclair22b4dd22023-07-21 00:40:07 +000093#endif // SRC_TINT_UTILS_TEXT_UNICODE_H_