// Source: LeetCode/393.utf-8-validation.cpp (samba9274/LeetCode, branch master)
/*
 * @lc app=leetcode id=393 lang=cpp
 *
 * [393] UTF-8 Validation
 *
 * https://leetcode.com/problems/utf-8-validation/description/
 *
 * algorithms
 * Medium (39.38%)
 * Likes:    475
 * Dislikes: 1762
 * Total Accepted:    76.6K
 * Total Submissions: 189.2K
 * Testcase Example:  '[197,130,1]'
 *
 * Given an integer array data representing the data, return whether it is a
 * valid UTF-8 encoding (i.e. it translates to a sequence of valid UTF-8
 * encoded characters).
 *
 * A character in UTF8 can be from 1 to 4 bytes long, subjected to the
 * following rules:
 *
 *
 * For a 1-byte character, the first bit is a 0, followed by its Unicode
 * code.
 * For an n-bytes character, the first n bits are all one's, the n + 1 bit is
 * 0, followed by n - 1 bytes with the most significant 2 bits being 10.
 *
 *
 * This is how the UTF-8 encoding would work:
 *
 *
 *     Number of Bytes   |        UTF-8 Octet Sequence
 *                       |              (binary)
 *   --------------------+-----------------------------------------
 *            1          |   0xxxxxxx
 *            2          |   110xxxxx 10xxxxxx
 *            3          |   1110xxxx 10xxxxxx 10xxxxxx
 *            4          |   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 *
 * x denotes a bit in the binary form of a byte that may be either 0 or 1.
 *
 * Note: The input is an array of integers. Only the least significant 8 bits
 * of each integer are used to store the data. This means each integer
 * represents only 1 byte of data.
 *
 *
 * Example 1:
 *
 *
 * Input: data = [197,130,1]
 * Output: true
 * Explanation: data represents the octet sequence: 11000101 10000010 00000001.
 * It is a valid utf-8 encoding for a 2-bytes character followed by a 1-byte
 * character.
 *
 *
 * Example 2:
 *
 *
 * Input: data = [235,140,4]
 * Output: false
 * Explanation: data represents the octet sequence: 11101011 10001100
 * 00000100.
 * The first 3 bits are all one's and the 4th bit is 0 means it is a 3-bytes
 * character.
 * The next byte is a continuation byte which starts with 10 and that's
 * correct.
 * But the second continuation byte does not start with 10, so it is
 * invalid.
 *
 *
 *
 * Constraints:
 *
 *
 * 1 <= data.length <= 2 * 10^4
 * 0 <= data[i] <= 255
 *
 *
 */

// @lc code=start
/**
 * Structural UTF-8 validator for a stream of bytes stored one per integer.
 *
 * Only the bit-pattern rules are checked: a lead byte announces how many
 * continuation bytes follow, and every continuation byte must start with the
 * bits `10`. Overlong encodings and code-point range limits are not checked.
 */
class Solution
{
public:
    /**
     * @brief Checks whether `data` is a structurally valid UTF-8 byte sequence.
     *
     * @param data One byte per element; only the least significant 8 bits of
     *             each element are examined, higher bits are ignored.
     * @return true if every character is a complete 1- to 4-byte sequence,
     *         false on a stray continuation byte, a lead byte with five or
     *         more leading one bits, a bad continuation byte, or a sequence
     *         that is cut off at the end of the input.
     */
    bool validUtf8(vector<int> &data)
    {
        // Number of continuation bytes still owed by the current character.
        int pending = 0;
        for (int raw : data)
        {
            // Only the low 8 bits carry data. Masking first also guarantees
            // the shifts below operate on a value in [0, 255].
            const int byte = raw & 0xFF;

            if (pending == 0)
            {
                // Expecting a lead byte: classify it by its high-bit prefix.
                if ((byte >> 5) == 0b110)
                    pending = 1; // 110xxxxx -> 2-byte character
                else if ((byte >> 4) == 0b1110)
                    pending = 2; // 1110xxxx -> 3-byte character
                else if ((byte >> 3) == 0b11110)
                    pending = 3; // 11110xxx -> 4-byte character
                else if ((byte >> 7) != 0)
                    return false; // 10xxxxxx or 11111xxx cannot start a character
                // Otherwise 0xxxxxxx: a complete 1-byte character.
            }
            else
            {
                // Inside a multi-byte character: must be 10xxxxxx.
                if ((byte >> 6) != 0b10)
                    return false;
                --pending;
            }
        }
        // A non-zero count here means the last character was truncated.
        return pending == 0;
    }
};
// @lc code=end