// Copyright 2022 The Dawn & Tint Authors
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
//    list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
//    this list of conditions and the following disclaimer in the documentation
//    and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
//    contributors may be used to endorse or promote products derived from
//    this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
| Ryan Harrison | dbc13af | 2022-02-21 15:19:07 +0000 | [diff] [blame] | 27 | |
| dan sinclair | 22b4dd2 | 2023-07-21 00:40:07 +0000 | [diff] [blame] | 28 | #ifndef SRC_TINT_UTILS_TEXT_UNICODE_H_ |
| 29 | #define SRC_TINT_UTILS_TEXT_UNICODE_H_ |
| Ryan Harrison | dbc13af | 2022-02-21 15:19:07 +0000 | [diff] [blame] | 30 | |
| 31 | #include <cstddef> |
| 32 | #include <cstdint> |
| dan sinclair | 6cc183c | 2023-03-02 21:28:45 +0000 | [diff] [blame] | 33 | #include <string_view> |
| Ryan Harrison | dbc13af | 2022-02-21 15:19:07 +0000 | [diff] [blame] | 34 | #include <utility> |
| 35 | |
| dan sinclair | bae54e7 | 2023-07-28 15:01:54 +0000 | [diff] [blame] | 36 | namespace tint { |
| Ryan Harrison | dbc13af | 2022-02-21 15:19:07 +0000 | [diff] [blame] | 37 | |
/// CodePoint is a unicode code point.
///
/// The struct is a thin wrapper around a single `uint32_t`. Construction from an integer is
/// explicit, while conversion back to `uint32_t` is implicit, so a CodePoint can be used directly
/// wherever a plain integer value is expected.
struct CodePoint {
    /// Constructor
    /// The code point value is initialized to 0 by the member initializer on `value`.
    constexpr CodePoint() = default;

    /// Constructor
    /// @param v the code point value. The value is stored as-is.
    constexpr explicit CodePoint(uint32_t v) : value(v) {}

    /// Implicit conversion to the underlying integer.
    /// @returns the code point value
    constexpr operator uint32_t() const { return value; }

    /// Assignment operator
    /// @param v the new value for the code point
    /// @returns this CodePoint
    constexpr CodePoint& operator=(uint32_t v) {
        value = v;
        return *this;
    }

    /// @returns true if this CodePoint is in the XID_Start set.
    /// @see https://unicode.org/reports/tr31/
    bool IsXIDStart() const;

    /// @returns true if this CodePoint is in the XID_Continue set.
    /// @see https://unicode.org/reports/tr31/
    bool IsXIDContinue() const;

    /// The code point value
    uint32_t value = 0;
};
| 69 | |
| Ryan Harrison | dbc13af | 2022-02-21 15:19:07 +0000 | [diff] [blame] | 70 | namespace utf8 { |
| 71 | |
| 72 | /// Decodes the first code point in the utf8 string. |
| 73 | /// @param ptr the pointer to the first byte of the utf8 sequence |
| 74 | /// @param len the maximum number of bytes to read |
| 75 | /// @returns a pair of CodePoint and width in code units (bytes). |
| 76 | /// If the next code point cannot be decoded then returns [0,0]. |
| 77 | std::pair<CodePoint, size_t> Decode(const uint8_t* ptr, size_t len); |
| 78 | |
| Jiawei Shao | f7beb85 | 2023-01-10 00:03:24 +0000 | [diff] [blame] | 79 | /// Decodes the first code point in the utf8 string. |
| 80 | /// @param utf8_string the string view that contains the utf8 sequence |
| 81 | /// @returns a pair of CodePoint and width in code units (bytes). |
| 82 | /// If the next code point cannot be decoded then returns [0,0]. |
| 83 | std::pair<CodePoint, size_t> Decode(std::string_view utf8_string); |
| 84 | |
| Ryan Harrison | dbc13af | 2022-02-21 15:19:07 +0000 | [diff] [blame] | 85 | /// @returns true if all the utf-8 code points in the string are ASCII |
| 86 | /// (code-points 0x00..0x7f). |
| 87 | bool IsASCII(std::string_view); |
| 88 | |
| 89 | } // namespace utf8 |
| 90 | |
| dan sinclair | bae54e7 | 2023-07-28 15:01:54 +0000 | [diff] [blame] | 91 | } // namespace tint |
| Ryan Harrison | dbc13af | 2022-02-21 15:19:07 +0000 | [diff] [blame] | 92 | |
| dan sinclair | 22b4dd2 | 2023-07-21 00:40:07 +0000 | [diff] [blame] | 93 | #endif // SRC_TINT_UTILS_TEXT_UNICODE_H_ |